From e52dcc7395bbcaea0530febbdbc576ce11d6c109 Mon Sep 17 00:00:00 2001 From: zhangwm Date: Tue, 6 Sep 2022 09:38:39 +0800 Subject: [PATCH] SHL: version 2.0 --- CMakeLists.txt | 160 +- Makefile | 26 +- README.md | 14 +- include/csi_c906.h | 520 -- include/csi_debug.h | 287 - include/csi_e804.h | 84 - include/csi_gref.h | 556 -- include/csi_i805.h | 146 - include/csi_memory.h | 30 - include/csi_nn.h | 1275 ++-- include/csi_ref.h | 1195 ---- include/csi_ref_i805.h | 84 - include/csi_thead_rvv.h | 389 - include/csi_utils.h | 108 - ...{csi_internal.h => csinn_data_structure.h} | 384 +- include/csinn_runtime.h | 87 + include/include_xt800/csi_i805_nnfunction.h | 346 - include/include_xt800/csi_instance.h | 87 - include/include_xt800/csi_nn_tables.h | 54 - .../include_xt800/csi_nnsupportfunctions.h | 320 - include/include_xt800/csky_dsp2_nnfunctions.h | 745 -- include/{csi_c860.h => shl_c860.h} | 14 +- include/shl_c906.h | 519 ++ include/shl_c908.h | 338 + include/shl_debug.h | 293 + include/shl_e804.h | 82 + include/shl_gref.h | 604 ++ include/shl_i805.h | 144 + include/shl_memory.h | 33 + include/{csi_node.h => shl_node.h} | 40 +- include/shl_ref.h | 1206 ++++ include/shl_ref_i805.h | 77 + include/shl_thead_rvv.h | 668 ++ include/shl_utils.h | 97 + ..._u8_to_f32_c860.S => shl_c860_u8_to_f32.S} | 12 +- source/c860_opt/utils.S | 12 +- source/c906_opt/abs.c | 15 +- source/c906_opt/add.c | 80 +- source/c906_opt/avgpool.c | 99 +- source/c906_opt/broadcast_to.c | 9 +- source/c906_opt/cache_conv1d.c | 29 +- source/c906_opt/cache_matmul.c | 45 +- source/c906_opt/clip.c | 15 +- source/c906_opt/concat.c | 22 +- source/c906_opt/convolution.c | 111 +- source/c906_opt/convolution_1x1_fp16.c | 64 +- ...nvolution_1x1.c => convolution_1x1_fp32.c} | 59 +- source/c906_opt/convolution_3x3_fp16.c | 1734 +++-- source/c906_opt/convolution_3x3_fp32.c | 2378 +++--- source/c906_opt/convolution_gemm_fp16.c | 82 +- source/c906_opt/convolution_relu.c | 56 +- ...ution_sgemm.c => 
convolution_sgemm_fp32.c} | 83 +- source/c906_opt/depthwise_convolution_3x3.c | 970 --- .../c906_opt/depthwise_convolution_3x3_fp16.c | 41 +- .../c906_opt/depthwise_convolution_3x3_fp32.c | 968 +++ .../depthwise_convolution_3x3_pack4.c | 1487 ---- .../depthwise_convolution_3x3_pack4_fp32.c | 1487 ++++ .../depthwise_convolution_3x3_pack8_fp16.c | 21 +- ...5x5.c => depthwise_convolution_5x5_fp32.c} | 126 +- source/c906_opt/depthwise_convolution_fp16.c | 16 +- ... => depthwise_convolution_relu_3x3_fp32.c} | 8 +- ...pthwise_convolution_relu_3x3_pack4_fp32.c} | 9 +- ... => depthwise_convolution_relu_5x5_fp32.c} | 9 +- source/c906_opt/div.c | 17 +- source/c906_opt/fullyconnected.c | 110 +- source/c906_opt/gather.c | 15 +- source/c906_opt/gemm_fp16.c | 37 +- source/c906_opt/gemm_fp32.c | 3459 +++++++++ source/c906_opt/gemv_fp16.c | 26 +- source/c906_opt/{sgemv.c => gemv_fp32.c} | 5 +- source/c906_opt/global_avgpool.c | 15 +- source/c906_opt/global_maxpool.c | 14 +- source/c906_opt/hpm.c | 31 +- source/c906_opt/layer_norm.c | 19 +- source/c906_opt/leaky_relu.c | 15 +- source/c906_opt/lrn.c | 12 +- source/c906_opt/matmul.c | 24 +- source/c906_opt/maxpool.c | 103 +- source/c906_opt/minimum.c | 76 +- source/c906_opt/mul.c | 78 +- source/c906_opt/pad.c | 16 +- source/c906_opt/prelu.c | 36 +- source/c906_opt/relu.c | 15 +- source/c906_opt/relu1.c | 15 +- source/c906_opt/relu6.c | 15 +- source/c906_opt/reshape.c | 12 +- source/c906_opt/setup.c | 562 +- source/c906_opt/sgemm.c | 3165 -------- source/c906_opt/shl_c906_u8_to_f32.S | 134 + source/c906_opt/split.c | 19 +- source/c906_opt/sub.c | 244 +- source/c906_opt/sum.c | 9 +- source/c906_opt/transpose.c | 10 +- source/c906_opt/utils.c | 29 +- source/c908_opt/avgpool.c | 221 + source/c908_opt/convolution.c | 408 ++ source/c908_opt/convolution_1x1_fp16.c | 87 + .../c908_opt/convolution_1x1_fp16_pack1ton.c | 81 + source/c908_opt/convolution_1x1_fp16_packn.c | 69 + .../c908_opt/convolution_1x1_fp16_packnto1.c | 74 + 
source/c908_opt/convolution_1x1_fp32.c | 87 + .../c908_opt/convolution_1x1_fp32_pack1ton.c | 83 + source/c908_opt/convolution_1x1_fp32_packn.c | 80 + .../c908_opt/convolution_1x1_fp32_packnto1.c | 74 + source/c908_opt/convolution_1x1_int8.c | 110 + .../c908_opt/convolution_1x1_int8_pack1ton.c | 126 + source/c908_opt/convolution_1x1_int8_packn.c | 84 + .../c908_opt/convolution_1x1_int8_packnto1.c | 90 + source/c908_opt/convolution_3x3_fp16.c | 2834 ++++++++ source/c908_opt/convolution_3x3_fp16_packn.c | 1044 +++ .../c908_opt/convolution_3x3_fp16_packn_1.c | 2310 ++++++ source/c908_opt/convolution_3x3_fp32.c | 1690 +++++ source/c908_opt/convolution_3x3_fp32_packn.c | 1048 +++ .../c908_opt/convolution_3x3_fp32_packn_1.c | 2029 ++++++ source/c908_opt/convolution_3x3_int8.c | 2801 ++++++++ source/c908_opt/convolution_3x3_int8_packn.c | 630 ++ .../c908_opt/convolution_3x3_int8_packn_1.c | 1060 +++ source/c908_opt/convolution_gemm_fp16.c | 128 + .../c908_opt/convolution_gemm_fp16_pack1ton.c | 128 + source/c908_opt/convolution_gemm_fp16_packn.c | 124 + .../c908_opt/convolution_gemm_fp16_packnto1.c | 126 + source/c908_opt/convolution_gemm_fp32.c | 128 + .../c908_opt/convolution_gemm_fp32_pack1ton.c | 128 + source/c908_opt/convolution_gemm_fp32_packn.c | 127 + .../c908_opt/convolution_gemm_fp32_packnto1.c | 127 + source/c908_opt/convolution_gemm_int8.c | 151 + .../c908_opt/convolution_gemm_int8_pack1ton.c | 225 + source/c908_opt/convolution_gemm_int8_packn.c | 193 + .../c908_opt/convolution_gemm_int8_packnto1.c | 222 + source/c908_opt/depthwise_convolution.c | 209 + source/c908_opt/fullyconnected.c | 87 + source/c908_opt/gemm_fp16.c | 3679 ++++++++++ source/c908_opt/gemm_fp16_packn.c | 54 + source/c908_opt/gemm_fp16_v256.c | 3247 +++++++++ source/c908_opt/gemm_fp32.c | 3247 +++++++++ source/c908_opt/gemm_fp32_packn.c | 54 + source/c908_opt/gemm_fp32_v256.c | 3246 +++++++++ source/c908_opt/gemm_int16_packn.c | 37 + source/c908_opt/gemm_int8.c | 4083 +++++++++++ 
source/c908_opt/gemm_int8_packn.c | 47 + source/c908_opt/gemm_int8_v256.c | 1714 +++++ .../c908_opt/gemm_kernel/gemm_fp16_ncxhwx.S | 1308 ++++ .../c908_opt/gemm_kernel/gemm_fp32_ncxhwx.S | 1309 ++++ .../c908_opt/gemm_kernel/gemm_int16_ncxhwx.S | 452 ++ .../c908_opt/gemm_kernel/gemm_int4_ncxhwx.S | 870 +++ .../c908_opt/gemm_kernel/gemm_int8_ncxhwx.S | 1078 +++ source/c908_opt/maxpool.c | 270 + source/c908_opt/reorder.c | 1128 +++ source/c908_opt/setup.c | 126 + ..._q15.S => shl_xt800p_nn_activations_q15.S} | 22 +- ...ns_q7.S => shl_xt800p_nn_activations_q7.S} | 22 +- ...t800p_relu_q15.S => shl_xt800p_relu_q15.S} | 20 +- ..._xt800p_relu_q7.S => shl_xt800p_relu_q7.S} | 20 +- source/e804_opt/avgpool.c | 37 +- source/e804_opt/convolution.c | 197 +- ... => shl_xt800p_convolve_1x1_HWC_q7_fast.S} | 20 +- ....S => shl_xt800p_convolve_HWC_q15_basic.S} | 22 +- ...RGB.S => shl_xt800p_convolve_HWC_q7_RGB.S} | 22 +- ...c.S => shl_xt800p_convolve_HWC_q7_basic.S} | 22 +- ..._xt800p_depthwise_separable_conv_HWC_q7.S} | 20 +- .../e804_opt/e804_function.h | 325 +- ...l_xt800p_fully_connected_mat_q7_vec_q15.S} | 23 +- ...q15.S => shl_xt800p_fully_connected_q15.S} | 23 +- ...d_q7.S => shl_xt800p_fully_connected_q7.S} | 23 +- source/e804_opt/fullyconnected.c | 23 +- source/e804_opt/maxpool.c | 39 +- .../e804_opt/nn-support/csi_xt800p_nntables.c | 289 - .../e804_opt/nn-support/shl_xt800p_nntables.c | 156 + ...pool_q7_HWC.S => shl_xt800p_pool_q7_HWC.S} | 36 +- source/e804_opt/relu.c | 20 +- source/e804_opt/setup.c | 111 +- source/e804_opt/sigmoid.c | 20 +- source/e804_opt/softmax.c | 20 +- ...softmax_q15.S => shl_xt800p_softmax_q15.S} | 21 +- ...p_softmax_q7.S => shl_xt800p_softmax_q7.S} | 21 +- source/e804_opt/tanh.c | 20 +- source/graph_ref/abs.c | 12 +- source/graph_ref/acos.c | 12 +- source/graph_ref/acosh.c | 11 +- source/graph_ref/add.c | 13 +- source/graph_ref/all.c | 11 +- source/graph_ref/and.c | 13 +- source/graph_ref/any.c | 12 +- source/graph_ref/arange.c | 10 +- 
source/graph_ref/argmax.c | 12 +- source/graph_ref/argmin.c | 12 +- source/graph_ref/asin.c | 12 +- source/graph_ref/asinh.c | 12 +- source/graph_ref/atan.c | 12 +- source/graph_ref/atanh.c | 12 +- source/graph_ref/avgpool.c | 12 +- source/graph_ref/avgpool3d.c | 19 +- source/graph_ref/batch_to_space.c | 12 +- source/graph_ref/batch_to_space_nd.c | 12 +- source/graph_ref/bn.c | 18 +- source/graph_ref/broadcast_to.c | 12 +- source/graph_ref/cache_conv1d.c | 12 +- source/graph_ref/cache_matmul.c | 12 +- source/graph_ref/ceil.c | 12 +- source/graph_ref/clip.c | 12 +- source/graph_ref/col2im.c | 13 +- source/graph_ref/concat.c | 31 +- source/graph_ref/convolution.c | 75 +- source/graph_ref/convolution1d.c | 14 +- source/graph_ref/convolution3d.c | 14 +- source/graph_ref/cos.c | 11 +- source/graph_ref/cosh.c | 12 +- source/graph_ref/crop.c | 11 +- source/graph_ref/cumprod.c | 11 +- source/graph_ref/cumsum.c | 11 +- source/graph_ref/data_convert.c | 28 + source/graph_ref/deconvolution.c | 24 +- source/graph_ref/deconvolution3d.c | 14 +- source/graph_ref/depth_to_space.c | 11 +- source/graph_ref/div.c | 12 +- source/graph_ref/elu.c | 11 +- source/graph_ref/equal.c | 12 +- source/graph_ref/erf.c | 11 +- source/graph_ref/exp.c | 11 +- source/graph_ref/expand_dims.c | 11 +- source/graph_ref/expm1.c | 11 +- source/graph_ref/flatten.c | 11 +- source/graph_ref/floor.c | 11 +- source/graph_ref/floor_divide.c | 12 +- source/graph_ref/floor_mod.c | 12 +- source/graph_ref/fsmn.c | 47 +- source/graph_ref/fullyconnected.c | 14 +- source/graph_ref/gather.c | 13 +- source/graph_ref/gather_nd.c | 13 +- source/graph_ref/global_averagepool.c | 11 +- source/graph_ref/global_maxpool.c | 11 +- source/graph_ref/greater.c | 12 +- source/graph_ref/greater_equal.c | 12 +- source/graph_ref/hard_sigmoid.c | 11 +- source/graph_ref/im2col.c | 11 +- source/graph_ref/isnan.c | 11 +- source/graph_ref/l2_normalization.c | 11 +- source/graph_ref/l2pool.c | 11 +- source/graph_ref/layer_norm.c | 14 +- 
source/graph_ref/leaky_relu.c | 11 +- source/graph_ref/less.c | 12 +- source/graph_ref/less_equal.c | 12 +- source/graph_ref/log.c | 11 +- source/graph_ref/log1p.c | 11 +- source/graph_ref/log_softmax.c | 11 +- source/graph_ref/logical_and.c | 12 +- source/graph_ref/logical_not.c | 11 +- source/graph_ref/logical_or.c | 12 +- source/graph_ref/logical_xor.c | 12 +- source/graph_ref/lrn.c | 11 +- source/graph_ref/matmul.c | 12 +- source/graph_ref/max.c | 12 +- source/graph_ref/maximum.c | 12 +- source/graph_ref/maxpool.c | 12 +- source/graph_ref/maxpool2d_locat.c | 12 +- source/graph_ref/maxpool3d.c | 11 +- source/graph_ref/mean.c | 12 +- source/graph_ref/min.c | 11 +- source/graph_ref/minimum.c | 12 +- source/graph_ref/mod.c | 12 +- source/graph_ref/mul.c | 12 +- source/graph_ref/ndarray_size.c | 11 +- source/graph_ref/negative.c | 12 +- source/graph_ref/non_max_suppression.c | 13 +- source/graph_ref/not.c | 11 +- source/graph_ref/not_equal.c | 12 +- source/graph_ref/or.c | 12 +- source/graph_ref/pad.c | 11 +- source/graph_ref/power.c | 12 +- source/graph_ref/prelu.c | 12 +- source/graph_ref/prod.c | 11 +- source/graph_ref/proposal.c | 14 +- source/graph_ref/psroipooling.c | 12 +- source/graph_ref/reduce_logsumexp.c | 11 +- source/graph_ref/reduce_max.c | 11 +- source/graph_ref/reduce_mean.c | 11 +- source/graph_ref/reduce_min.c | 11 +- source/graph_ref/reduce_prod.c | 11 +- source/graph_ref/reduce_sum.c | 11 +- source/graph_ref/relu.c | 12 +- source/graph_ref/relu1.c | 11 +- source/graph_ref/relu6.c | 11 +- source/graph_ref/relun.c | 11 +- source/graph_ref/reorg.c | 11 +- source/graph_ref/reshape.c | 12 +- source/graph_ref/resize.c | 11 +- source/graph_ref/reverse.c | 11 +- source/graph_ref/roialign.c | 12 +- source/graph_ref/roipool.c | 12 +- source/graph_ref/round.c | 11 +- source/graph_ref/rsqrt.c | 11 +- source/graph_ref/scatter.c | 14 +- source/graph_ref/segment_max.c | 12 +- source/graph_ref/segment_mean.c | 12 +- source/graph_ref/segment_min.c | 12 +- 
source/graph_ref/segment_prod.c | 12 +- source/graph_ref/segment_sum.c | 12 +- source/graph_ref/select.c | 14 +- source/graph_ref/sequence_mask.c | 12 +- source/graph_ref/setup.c | 807 ++- source/graph_ref/shape.c | 11 +- source/graph_ref/shuffle_channel.c | 14 +- source/graph_ref/sigmoid.c | 11 +- source/graph_ref/sign.c | 11 +- source/graph_ref/sin.c | 11 +- source/graph_ref/sinh.c | 11 +- source/graph_ref/slice.c | 11 +- source/graph_ref/softmax.c | 11 +- source/graph_ref/softplus.c | 11 +- source/graph_ref/softrelu.c | 11 +- source/graph_ref/softsign.c | 11 +- source/graph_ref/space_to_batch.c | 11 +- source/graph_ref/space_to_batch_nd.c | 11 +- source/graph_ref/space_to_depth.c | 11 +- source/graph_ref/split.c | 26 +- source/graph_ref/sqrt.c | 11 +- source/graph_ref/square.c | 11 +- source/graph_ref/squeeze.c | 11 +- source/graph_ref/stack.c | 11 +- source/graph_ref/strided_slice.c | 11 +- source/graph_ref/sub.c | 12 +- source/graph_ref/subgraph.c | 517 +- source/graph_ref/sum.c | 11 +- source/graph_ref/tan.c | 11 +- source/graph_ref/tanh.c | 11 +- source/graph_ref/threshold_relu.c | 12 +- source/graph_ref/tile.c | 11 +- source/graph_ref/topk.c | 12 +- source/graph_ref/transpose.c | 12 +- source/graph_ref/trunc.c | 12 +- source/graph_ref/unpooling.c | 13 +- source/graph_ref/unstack.c | 11 +- source/graph_ref/utils.c | 93 +- source/graph_ref/where.c | 13 +- source/graph_ref/xor.c | 12 +- source/graph_ref/yuv_rgb_scale.c | 11 +- .../{csi_i805_clip_8.S => shl_i805_clip_8.S} | 18 +- ...{csi_i805_relu6_8.S => shl_i805_relu6_8.S} | 18 +- .../{csi_i805_relu_8.S => shl_i805_relu_8.S} | 18 +- ..._q15.S => shl_xt800v_nn_activations_q15.S} | 18 +- ...S => shl_xt800v_nn_activations_q15_fast.S} | 23 +- ...ns_q7.S => shl_xt800v_nn_activations_q7.S} | 18 +- ....S => shl_xt800v_nn_activations_q7_fast.S} | 23 +- ...t800v_relu_q15.S => shl_xt800v_relu_q15.S} | 22 +- ..._xt800v_relu_q7.S => shl_xt800v_relu_q7.S} | 22 +- source/i805_opt/add.c | 39 +- source/i805_opt/avgpool.c | 
39 +- ...e_add_8.S => shl_i805_elementwise_add_8.S} | 18 +- ...e_mul_8.S => shl_i805_elementwise_mul_8.S} | 18 +- source/i805_opt/clip.c | 26 +- source/i805_opt/convolution.c | 288 +- ...n_1x1_8.S => shl_i805_convolution_1x1_8.S} | 18 +- ...nvolution_8.S => shl_i805_convolution_8.S} | 18 +- ...8.S => shl_i805_depthwise_convolution_8.S} | 18 +- ... => shl_xt800v_convolve_1x1_HWC_q7_fast.S} | 20 +- ....S => shl_xt800v_convolve_HWC_q15_basic.S} | 22 +- ...RGB.S => shl_xt800v_convolve_HWC_q7_RGB.S} | 22 +- ...c.S => shl_xt800v_convolve_HWC_q7_basic.S} | 22 +- ...l_xt800v_convolve_HWC_q7_fast_nonsquare.S} | 20 +- ..._xt800v_depthwise_separable_conv_HWC_q7.S} | 20 +- ...pthwise_separable_conv_HWC_q7_nonsquare.S} | 22 +- ...nected_8.S => shl_i805_fullyconnected_8.S} | 18 +- ...l_xt800v_fully_connected_mat_q7_vec_q15.S} | 23 +- ...q15.S => shl_xt800v_fully_connected_q15.S} | 23 +- ...x4.S => shl_xt800v_fully_connected_q7x4.S} | 23 +- source/i805_opt/fullyconnected.c | 56 +- ...lt_nt_t_8.S => shl_i805_mat_mult_nt_t_8.S} | 18 +- ...mat_mult_8.S => shl_i805_vec_mat_mult_8.S} | 18 +- source/i805_opt/i805_function.h | 1081 +++ source/i805_opt/maxpool.c | 55 +- source/i805_opt/mul.c | 31 +- .../i805_opt/nn-support/csi_xt800v_nntables.c | 290 - .../i805_opt/nn-support/shl_xt800v_nntables.c | 156 + ..._i805_maxpool_8.S => shl_i805_maxpool_8.S} | 18 +- ... 
=> shl_xt800v_avepool_q7_HWC_nonsquare.S} | 19 +- ...pool_q7_HWC.S => shl_xt800v_pool_q7_HWC.S} | 37 +- source/i805_opt/relu.c | 43 +- source/i805_opt/relu6.c | 26 +- source/i805_opt/reshape.c | 15 +- ..._i805_reshape_8.S => shl_i805_reshape_8.S} | 18 +- source/i805_opt/setup.c | 139 +- source/i805_opt/sigmoid.c | 20 +- source/i805_opt/softmax.c | 20 +- ...softmax_q15.S => shl_xt800v_softmax_q15.S} | 21 +- ...v_softmax_q7.S => shl_xt800v_softmax_q7.S} | 21 +- source/i805_opt/tanh.c | 20 +- .../activation/csi_nn_activations_q15.c | 97 - .../activation/csi_nn_activations_q7.c | 86 - source/i805_ref/activation/csi_relu_q15.c | 97 - source/i805_ref/activation/csi_relu_q7.c | 101 - .../i805_ref/activation/shl_activations_q15.c | 91 + .../i805_ref/activation/shl_activations_q7.c | 82 + source/i805_ref/activation/shl_relu_q15.c | 46 + source/i805_ref/activation/shl_relu_q7.c | 46 + source/i805_ref/avgpool.c | 46 +- source/i805_ref/convolution.c | 208 +- .../csi_convolve_1x1_HWC_q7_fast.c | 210 - .../convolution/csi_convolve_HWC_q15_basic.c | 201 - .../convolution/csi_convolve_HWC_q15_fast.c | 251 - .../convolution/csi_convolve_HWC_q7_RGB.c | 265 - .../convolution/csi_convolve_HWC_q7_basic.c | 226 - .../convolution/csi_convolve_HWC_q7_fast.c | 428 -- .../csi_convolve_HWC_q7_fast_nonsquare.c | 384 - .../csi_depthwise_separable_conv_HWC_q7.c | 287 - ...epthwise_separable_conv_HWC_q7_nonsquare.c | 299 - .../csi_nn_mat_mult_kernel_q7_q15.c | 180 - .../csi_nn_mat_mult_kernel_q7_q15_reordered.c | 132 - .../shl_convolve_1x1_HWC_q7_fast.c | 90 + .../convolution/shl_convolve_HWC_q15_basic.c | 91 + .../convolution/shl_convolve_HWC_q15_fast.c | 102 + .../convolution/shl_convolve_HWC_q7_RGB.c | 97 + .../convolution/shl_convolve_HWC_q7_basic.c | 91 + .../convolution/shl_convolve_HWC_q7_fast.c | 106 + .../shl_convolve_HWC_q7_fast_nonsquare.c | 96 + .../shl_depthwise_separable_conv_HWC_q7.c | 103 + ...epthwise_separable_conv_HWC_q7_nonsquare.c | 98 + 
.../csi_fully_connected_mat_q7_vec_q15.c | 187 - .../csi_fully_connected_mat_q7_vec_q15_opt.c | 313 - .../fully-connect/csi_fully_connected_q15.c | 176 - .../csi_fully_connected_q15_opt.c | 287 - .../fully-connect/csi_fully_connected_q7.c | 192 - .../csi_fully_connected_q7_opt.c | 360 - .../shl_fully_connected_mat_q7_vec_q15.c | 64 + .../shl_fully_connected_mat_q7_vec_q15_opt.c | 179 + .../fully-connect/shl_fully_connected_q15.c | 56 + .../shl_fully_connected_q15_opt.c | 154 + .../fully-connect/shl_fully_connected_q7.c | 66 + .../shl_fully_connected_q7_opt.c | 213 + source/i805_ref/fullyconnected.c | 26 +- .../i805_ref/i805_ref_function.h | 55 +- source/i805_ref/maxpool.c | 39 +- source/i805_ref/nn-support/csi_nntables.c | 291 - .../nn-support/csi_q7_to_q15_no_shift.c | 125 - .../csi_q7_to_q15_reordered_no_shift.c | 136 - source/i805_ref/nn-support/i805_ref_support.h | 113 + source/i805_ref/nn-support/shl_nntables.c | 147 + .../pooling/csi_avepool_q7_HWC_nonsquare.c | 163 - source/i805_ref/pooling/csi_pool_q7_HWC.c | 472 -- .../pooling/shl_avepool_q7_HWC_nonsquare.c | 60 + source/i805_ref/pooling/shl_pool_q7_HWC.c | 135 + source/i805_ref/relu.c | 24 +- source/i805_ref/setup.c | 113 +- source/i805_ref/sigmoid.c | 24 +- source/i805_ref/softmax.c | 24 +- .../{csi_softmax_q15.c => shl_softmax_q15.c} | 78 +- .../{csi_softmax_q7.c => shl_softmax_q7.c} | 81 +- source/i805_ref/tanh.c | 24 +- source/nn2/abs.c | 28 +- source/nn2/acos.c | 28 +- source/nn2/acosh.c | 28 +- source/nn2/add.c | 37 +- source/nn2/all.c | 28 +- source/nn2/and.c | 30 +- source/nn2/any.c | 28 +- source/nn2/arange.c | 24 +- source/nn2/argmax.c | 29 +- source/nn2/argmin.c | 28 +- source/nn2/asin.c | 28 +- source/nn2/asinh.c | 28 +- source/nn2/atan.c | 28 +- source/nn2/atanh.c | 28 +- source/nn2/averagepool.c | 37 +- source/nn2/averagepool3d.c | 28 +- source/nn2/batch_normalization.c | 41 +- source/nn2/batch_to_space.c | 28 +- source/nn2/batch_to_space_nd.c | 28 +- source/nn2/broadcast_to.c | 28 +- 
source/nn2/cache_conv1d.c | 30 +- source/nn2/cache_matmul.c | 30 +- source/nn2/ceil.c | 28 +- source/nn2/clip.c | 35 +- source/nn2/col2im.c | 31 +- source/nn2/concat.c | 28 +- source/nn2/convolution.c | 93 +- source/nn2/convolution1d.c | 40 +- source/nn2/convolution3d.c | 37 +- source/nn2/convolution_relu.c | 76 +- source/nn2/convolution_relu6.c | 49 +- source/nn2/cos.c | 28 +- source/nn2/cosh.c | 28 +- source/nn2/crop.c | 28 +- source/nn2/cumprod.c | 28 +- source/nn2/cumsum.c | 28 +- source/nn2/data_convert.c | 25 +- source/nn2/deconvolution.c | 45 +- source/nn2/deconvolution3d.c | 36 +- source/nn2/depth_to_space.c | 28 +- source/nn2/depthwise_conv2d.c | 49 + source/nn2/depthwise_conv2d_relu.c | 51 + source/nn2/div.c | 30 +- source/nn2/elu.c | 28 +- source/nn2/equal.c | 30 +- source/nn2/erf.c | 28 +- source/nn2/exp.c | 28 +- source/nn2/expand_dims.c | 28 +- source/nn2/expm1.c | 28 +- source/nn2/flatten.c | 37 +- source/nn2/floor.c | 28 +- source/nn2/floor_divide.c | 30 +- source/nn2/floor_mod.c | 30 +- source/nn2/format.c | 231 + source/nn2/fsmn.c | 41 +- source/nn2/fullyconnected.c | 41 +- source/nn2/gather.c | 31 +- source/nn2/gather_nd.c | 31 +- source/nn2/global_averagepool.c | 28 +- source/nn2/global_maxpool.c | 28 +- source/nn2/greater.c | 30 +- source/nn2/greater_equal.c | 30 +- source/nn2/group_conv2d.c | 50 + source/nn2/hard_sigmoid.c | 28 +- source/nn2/im2col.c | 28 +- source/nn2/isnan.c | 28 +- source/nn2/l2_normalization.c | 28 +- source/nn2/l2pool.c | 28 +- source/nn2/layer_norm.c | 40 +- source/nn2/leaky_relu.c | 28 +- source/nn2/less.c | 30 +- source/nn2/less_equal.c | 30 +- source/nn2/log.c | 28 +- source/nn2/log1p.c | 28 +- source/nn2/log_softmax.c | 28 +- source/nn2/logical_and.c | 30 +- source/nn2/logical_not.c | 28 +- source/nn2/logical_or.c | 30 +- source/nn2/logical_xor.c | 30 +- source/nn2/lrn.c | 28 +- source/nn2/matmul.c | 30 +- source/nn2/max.c | 33 +- source/nn2/maximum.c | 30 +- source/nn2/maxpool.c | 36 +- source/nn2/maxpool2d_locat.c 
| 29 +- source/nn2/maxpool3d.c | 33 +- source/nn2/mean.c | 29 +- source/nn2/min.c | 31 +- source/nn2/minimum.c | 30 +- source/nn2/mod.c | 30 +- source/nn2/mul.c | 37 +- source/nn2/ndarray_size.c | 28 +- source/nn2/negative.c | 29 +- source/nn2/node.c | 69 +- source/nn2/non_max_suppression.c | 33 +- source/nn2/not.c | 28 +- source/nn2/not_equal.c | 30 +- source/nn2/one_hot.c | 20 +- source/nn2/or.c | 30 +- source/nn2/pad.c | 30 +- source/nn2/power.c | 30 +- source/nn2/prelu.c | 30 +- source/nn2/prod.c | 32 +- source/nn2/proposal.c | 34 +- source/nn2/psroipooling.c | 30 +- source/nn2/reduce_logsumexp.c | 28 +- source/nn2/reduce_max.c | 28 +- source/nn2/reduce_mean.c | 28 +- source/nn2/reduce_min.c | 28 +- source/nn2/reduce_prod.c | 28 +- source/nn2/reduce_sum.c | 28 +- source/nn2/relu.c | 36 +- source/nn2/relu1.c | 28 +- source/nn2/relu6.c | 35 +- source/nn2/relun.c | 28 +- source/nn2/reorg.c | 28 +- source/nn2/reshape.c | 37 +- source/nn2/resize.c | 28 +- source/nn2/reverse.c | 28 +- source/nn2/roialign.c | 30 +- source/nn2/roipool.c | 30 +- source/nn2/round.c | 28 +- source/nn2/rsqrt.c | 28 +- source/nn2/scatter.c | 35 +- source/nn2/segment_max.c | 38 +- source/nn2/segment_mean.c | 37 +- source/nn2/segment_min.c | 39 +- source/nn2/segment_prod.c | 39 +- source/nn2/segment_sum.c | 39 +- source/nn2/select.c | 34 +- source/nn2/sequence_mask.c | 22 +- source/nn2/setup.c | 335 +- source/nn2/shape.c | 28 +- source/nn2/shuffle_channel.c | 31 +- source/nn2/sigmoid.c | 28 +- source/nn2/sign.c | 28 +- source/nn2/sin.c | 28 +- source/nn2/sinh.c | 28 +- source/nn2/slice.c | 30 +- source/nn2/softmax.c | 28 +- source/nn2/softplus.c | 28 +- source/nn2/softrelu.c | 28 +- source/nn2/softsign.c | 28 +- source/nn2/space_to_batch.c | 28 +- source/nn2/space_to_batch_nd.c | 28 +- source/nn2/space_to_depth.c | 28 +- source/nn2/split.c | 28 +- source/nn2/sqrt.c | 28 +- source/nn2/square.c | 28 +- source/nn2/squeeze.c | 28 +- source/nn2/stack.c | 28 +- source/nn2/strided_slice.c | 28 +- 
source/nn2/sub.c | 30 +- source/nn2/sum.c | 31 +- source/nn2/tan.c | 28 +- source/nn2/tanh.c | 28 +- source/nn2/threshold_relu.c | 29 +- source/nn2/tile.c | 28 +- source/nn2/topk.c | 30 +- source/nn2/transpose.c | 37 +- source/nn2/trunc.c | 29 +- source/nn2/unpooling.c | 29 +- source/nn2/unstack.c | 28 +- source/nn2/utils.c | 601 +- source/nn2/where.c | 24 +- source/nn2/xor.c | 30 +- source/nn2/yuv_rgb_scale.c | 28 +- source/reference/abs.c | 15 +- source/reference/acos.c | 16 +- source/reference/acosh.c | 16 +- source/reference/add.c | 18 +- source/reference/and.c | 22 +- source/reference/arange.c | 24 +- source/reference/argmax.c | 22 +- source/reference/argmin.c | 22 +- source/reference/asin.c | 16 +- source/reference/asinh.c | 16 +- source/reference/atan.c | 16 +- source/reference/atanh.c | 16 +- source/reference/averagepool.c | 50 +- source/reference/averagepool3d.c | 30 +- source/reference/batch_normalization.c | 48 +- source/reference/batch_to_space.c | 22 +- source/reference/broadcast_to.c | 18 +- source/reference/cache_conv1d.c | 48 +- source/reference/cache_matmul.c | 412 +- source/reference/ceil.c | 14 +- source/reference/clip.c | 14 +- source/reference/col2im.c | 8 +- source/reference/concat.c | 28 +- source/reference/conv_avx.h | 58 +- source/reference/convolution.c | 258 +- source/reference/convolution1d.c | 24 +- source/reference/convolution3d.c | 24 +- source/reference/convolution_channel.c | 332 +- source/reference/convolution_relu.c | 72 +- source/reference/convolution_relu6.c | 52 +- source/reference/cos.c | 15 +- source/reference/cosh.c | 16 +- source/reference/cumprod.c | 14 +- source/reference/cumsum.c | 14 +- source/reference/data_convert.c | 14 +- source/reference/deconvolution.c | 114 +- source/reference/deconvolution3d.c | 30 +- source/reference/depth_to_space.c | 59 +- source/reference/div.c | 18 +- source/reference/elu.c | 13 +- source/reference/equal.c | 24 +- source/reference/erf.c | 13 +- source/reference/exp.c | 13 +- 
source/reference/expand_dims.c | 14 +- source/reference/expm1.c | 14 +- source/reference/flatten.c | 27 +- source/reference/floor.c | 14 +- source/reference/floor_divide.c | 16 +- source/reference/floor_mod.c | 14 +- source/reference/fsmn.c | 47 +- source/reference/fullyconnected.c | 38 +- source/reference/gather.c | 24 +- source/reference/gather_nd.c | 24 +- source/reference/global_averagepool.c | 16 +- source/reference/global_maxpool.c | 16 +- source/reference/greater.c | 15 +- source/reference/greater_equal.c | 15 +- source/reference/hard_sigmoid.c | 15 +- source/reference/im2col.c | 31 +- source/reference/isnan.c | 9 +- source/reference/l2_normalization.c | 15 +- source/reference/l2pool.c | 21 +- source/reference/layer_norm.c | 37 +- source/reference/leaky_relu.c | 15 +- source/reference/less.c | 15 +- source/reference/less_equal.c | 15 +- source/reference/log.c | 14 +- source/reference/log1p.c | 15 +- source/reference/log_softmax.c | 15 +- source/reference/logical_and.c | 15 +- source/reference/logical_not.c | 15 +- source/reference/logical_or.c | 15 +- source/reference/logical_xor.c | 15 +- source/reference/lrn.c | 42 +- source/reference/matmul.c | 15 +- source/reference/max.c | 19 +- source/reference/maximum.c | 15 +- source/reference/maxpool.c | 51 +- source/reference/maxpool2d_locat.c | 55 +- source/reference/maxpool3d.c | 31 +- source/reference/mean.c | 29 +- source/reference/min.c | 19 +- source/reference/minimum.c | 19 +- source/reference/mod.c | 19 +- source/reference/mul.c | 19 +- source/reference/ndarray_size.c | 29 +- source/reference/negative.c | 15 +- source/reference/non_max_suppression.c | 15 +- source/reference/not.c | 19 +- source/reference/not_equal.c | 17 +- source/reference/or.c | 22 +- source/reference/pad.c | 26 +- source/reference/power.c | 19 +- source/reference/prelu.c | 17 +- source/reference/prod.c | 19 +- source/reference/proposal.c | 64 +- source/reference/psroipooling.c | 30 +- source/reference/reduce_logsumexp.c | 15 +- 
source/reference/reduce_max.c | 15 +- source/reference/reduce_mean.c | 15 +- source/reference/reduce_min.c | 15 +- source/reference/reduce_prod.c | 15 +- source/reference/reduce_sum.c | 15 +- source/reference/relu.c | 15 +- source/reference/relu1.c | 15 +- source/reference/relu6.c | 15 +- source/reference/relun.c | 15 +- source/reference/reshape.c | 28 +- source/reference/resize.c | 77 +- source/reference/reverse.c | 21 +- source/reference/roialign.c | 8 +- source/reference/roipool.c | 29 +- source/reference/round.c | 15 +- source/reference/rsqrt.c | 17 +- source/reference/scatter.c | 37 +- source/reference/segment_max.c | 59 +- source/reference/segment_mean.c | 60 +- source/reference/segment_min.c | 59 +- source/reference/segment_prod.c | 60 +- source/reference/segment_sum.c | 59 +- source/reference/select.c | 29 +- source/reference/setup.c | 892 ++- source/reference/shape.c | 17 +- source/reference/shuffle_channel.c | 44 +- source/reference/sigmoid.c | 15 +- source/reference/sign.c | 15 +- source/reference/sin.c | 16 +- source/reference/sinh.c | 17 +- source/reference/slice.c | 23 +- source/reference/softmax.c | 15 +- source/reference/softplus.c | 15 +- source/reference/softrelu.c | 15 +- source/reference/softsign.c | 15 +- source/reference/space_to_batch.c | 23 +- source/reference/space_to_depth.c | 23 +- source/reference/split.c | 27 +- source/reference/sqrt.c | 15 +- source/reference/square.c | 9 +- source/reference/squeeze.c | 11 +- source/reference/stack.c | 29 +- source/reference/strided_slice.c | 27 +- source/reference/sub.c | 19 +- source/reference/sum.c | 19 +- source/reference/tan.c | 14 +- source/reference/tanh.c | 23 +- source/reference/threshold_relu.c | 15 +- source/reference/tile.c | 19 +- source/reference/topk.c | 29 +- source/reference/transpose.c | 60 +- source/reference/trunc.c | 15 +- source/reference/unpooling.c | 51 +- source/reference/unstack.c | 29 +- source/reference/utils.c | 338 +- source/reference/xor.c | 22 +- 
source/reference/yuv_rgb_scale.c | 15 +- source/thead_rvv/add.c | 88 +- source/thead_rvv/avgpool.c | 231 +- source/thead_rvv/avgpool_2x2_fp16.c | 12 +- source/thead_rvv/avgpool_2x2_fp16_packn.c | 84 + .../{avgpool_2x2.c => avgpool_2x2_fp32.c} | 12 +- source/thead_rvv/avgpool_2x2_fp32_packn.c | 84 + source/thead_rvv/avgpool_3x3_fp16.c | 16 +- source/thead_rvv/avgpool_3x3_fp16_packn.c | 157 + .../{avgpool_3x3.c => avgpool_3x3_fp32.c} | 16 +- source/thead_rvv/avgpool_3x3_fp32_packn.c | 157 + source/thead_rvv/concat.c | 22 +- source/thead_rvv/convolution.c | 492 +- source/thead_rvv/convolution_1x1_fp16.c | 28 +- .../thead_rvv/convolution_1x1_fp16_pack1ton.c | 80 + source/thead_rvv/convolution_1x1_fp16_packn.c | 69 + .../thead_rvv/convolution_1x1_fp16_packnto1.c | 74 + ...nvolution_1x1.c => convolution_1x1_fp32.c} | 28 +- .../thead_rvv/convolution_1x1_fp32_pack1ton.c | 80 + source/thead_rvv/convolution_1x1_fp32_packn.c | 69 + .../thead_rvv/convolution_1x1_fp32_packnto1.c | 74 + source/thead_rvv/convolution_1x1_int4.c | 43 +- source/thead_rvv/convolution_1x1_int4_packn.c | 95 + source/thead_rvv/convolution_1x1_int8.c | 42 +- .../thead_rvv/convolution_1x1_int8_pack1ton.c | 125 + source/thead_rvv/convolution_1x1_int8_packn.c | 86 + .../thead_rvv/convolution_1x1_int8_packnto1.c | 91 + source/thead_rvv/convolution_3x3.c | 807 --- source/thead_rvv/convolution_3x3_fp16.c | 1680 +++-- source/thead_rvv/convolution_3x3_fp32.c | 1320 ++++ source/thead_rvv/convolution_3x3_int8.c | 682 ++ source/thead_rvv/convolution_gemm_fp16.c | 40 +- .../convolution_gemm_fp16_pack1ton.c | 215 + .../thead_rvv/convolution_gemm_fp16_packn.c | 187 + .../convolution_gemm_fp16_packnto1.c | 210 + ...olution_gemm.c => convolution_gemm_fp32.c} | 40 +- .../convolution_gemm_fp32_pack1ton.c | 215 + .../thead_rvv/convolution_gemm_fp32_packn.c | 188 + .../convolution_gemm_fp32_packnto1.c | 211 + source/thead_rvv/convolution_gemm_int4.c | 60 +- .../thead_rvv/convolution_gemm_int4_packn.c | 43 + 
source/thead_rvv/convolution_gemm_int8.c | 48 +- .../convolution_gemm_int8_pack1ton.c | 222 + .../thead_rvv/convolution_gemm_int8_packn.c | 194 + .../convolution_gemm_int8_packnto1.c | 223 + source/thead_rvv/data_convert.c | 83 + source/thead_rvv/depthwise_convolution.c | 208 + .../depthwise_convolution_3x3_fp16.c | 28 +- .../depthwise_convolution_3x3_fp16_packn.c | 798 +++ ...3x3.c => depthwise_convolution_3x3_fp32.c} | 28 +- .../depthwise_convolution_3x3_fp32_packn.c | 802 +++ .../depthwise_convolution_3x3_int4.c | 67 +- .../depthwise_convolution_3x3_int8.c | 45 +- ...depthwise_convolution_3x3_int8_dot_packn.c | 71 + .../depthwise_convolution_3x3_int8_packn.c | 905 +++ source/thead_rvv/fullyconnected.c | 146 +- source/thead_rvv/fullyconnected_fp16.c | 68 +- source/thead_rvv/fullyconnected_fp32.c | 111 + source/thead_rvv/fullyconnected_int4.c | 145 + source/thead_rvv/fullyconnected_int8.c | 205 +- source/thead_rvv/gemm_fp16.c | 209 +- source/thead_rvv/gemm_fp16_packn.c | 944 +++ source/thead_rvv/{sgemm.c => gemm_fp32.c} | 160 +- source/thead_rvv/gemm_fp32_packn.c | 946 +++ source/thead_rvv/gemm_int4.c | 212 +- source/thead_rvv/gemm_int4_packn.c | 374 + source/thead_rvv/gemm_int8.c | 282 +- source/thead_rvv/gemm_int8_packn.c | 681 ++ source/thead_rvv/global_avgpool.c | 12 +- source/thead_rvv/global_avgpool_packn.c | 133 + source/thead_rvv/global_maxpool.c | 12 +- source/thead_rvv/global_maxpool_packn.c | 119 + source/thead_rvv/leaky_relu.c | 24 +- source/thead_rvv/maxpool.c | 284 +- source/thead_rvv/maxpool_2x2_fp16.c | 12 +- source/thead_rvv/maxpool_2x2_fp16_packn.c | 84 + .../{maxpool_2x2.c => maxpool_2x2_fp32.c} | 12 +- source/thead_rvv/maxpool_2x2_fp32_packn.c | 90 + source/thead_rvv/maxpool_2x2_int8.c | 12 +- source/thead_rvv/maxpool_2x2_int8_packn.c | 89 + source/thead_rvv/maxpool_3x3_fp16.c | 16 +- source/thead_rvv/maxpool_3x3_fp16_packn.c | 155 + .../{maxpool_3x3.c => maxpool_3x3_fp32.c} | 16 +- source/thead_rvv/maxpool_3x3_fp32_packn.c | 155 + 
source/thead_rvv/maxpool_3x3_int8.c | 16 +- source/thead_rvv/maxpool_3x3_int8_packn.c | 167 + source/thead_rvv/mul.c | 84 +- source/thead_rvv/pad.c | 501 ++ source/thead_rvv/relu.c | 61 +- source/thead_rvv/relu6.c | 75 + source/thead_rvv/reorder.c | 1976 +++++ source/thead_rvv/setup.c | 482 +- source/thead_rvv/sigmoid.c | 19 +- source/thead_rvv/softmax.c | 8 +- source/thead_rvv/sum.c | 10 +- source/thead_rvv/utils.c | 218 +- source/utils/atat_malloc.c | 46 +- source/utils/debug.c | 1132 ++- source/utils/memory.c | 109 +- tests/Makefile | 11 +- tests/autotest/conftest.py | 6 +- tests/autotest/interface_test.py | 137 +- tests/python_ref/add.py | 14 + tests/python_ref/averagepool_nchw.py | 20 +- tests/python_ref/averagepool_vlen.py | 200 + tests/python_ref/batch_norm_nchw.py | 2 +- tests/python_ref/convolution_nchw.py | 11 +- tests/python_ref/convolution_vlen.py | 205 + .../python_ref/depthwise_convolution_nchw.py | 3 + .../python_ref/depthwise_convolution_vlen.py | 165 + tests/python_ref/global_avgpool_vlen.py | 88 + tests/python_ref/global_maxpool_vlen.py | 85 + tests/python_ref/group_convolution_nchw.py | 27 +- tests/python_ref/l2_norm_anole.py | 66 + tests/python_ref/maxpool_nchw.py | 27 +- tests/python_ref/maxpool_vlen.py | 195 + tests/python_ref/mean_graph.py | 2 +- tests/python_ref/relu.py | 19 +- tests/unit_test/Makefile.rvv | 2 +- tests/unit_test/add.c | 38 +- tests/unit_test/avgpool.c | 87 +- tests/unit_test/concat.c | 46 +- tests/unit_test/conv2d_1x1s1_gemm.c | 88 +- tests/unit_test/conv2d_im2col_gemm.c | 88 +- tests/unit_test/conv2d_winograd.c | 93 +- tests/unit_test/dwconv2d.c | 68 +- tests/unit_test/fullyconnected.c | 59 +- tests/unit_test/gemm.c | 31 +- tests/unit_test/leaky_relu.c | 34 +- tests/unit_test/maxpool.c | 99 +- tests/unit_test/mul.c | 38 +- tests/unit_test/pad.c | 15 +- tests/unit_test/relu.c | 32 +- tests/unit_test/valid_data/conv2d.dat | 6364 ++++++++--------- tests/unit_test/valid_data/fullyconnected.dat | 56 +- tests/utils/math_snr.c | 2 
+- tests/utils/math_snr.h | 2 +- tests/utils/test_utils.c | 122 +- tests/utils/test_utils.h | 32 +- tests/validation/Makefile.c860 | 4 +- tests/validation/Makefile.c906 | 2 +- tests/validation/Makefile.ref | 4 +- tests/validation/Makefile.ref_x86 | 4 +- tests/validation/abs_f32.c | 37 +- tests/validation/abs_i8.c | 58 +- tests/validation/abs_u8.c | 58 +- tests/validation/acos_f32.c | 27 +- tests/validation/acos_i8.c | 52 +- tests/validation/acos_u8.c | 52 +- tests/validation/acosh_f32.c | 27 +- tests/validation/acosh_i8.c | 41 +- tests/validation/acosh_u8.c | 43 +- tests/validation/add_f32.c | 41 +- tests/validation/add_i8.c | 88 +- tests/validation/add_u8.c | 88 +- tests/validation/and_u32.c | 31 +- tests/validation/arange_f32.c | 31 +- tests/validation/arange_i8.c | 48 +- tests/validation/arange_u8.c | 48 +- tests/validation/argmax_stride_f32.c | 75 +- tests/validation/argmax_stride_u8.c | 79 +- tests/validation/argmin_stride_f32.c | 75 +- tests/validation/argmin_stride_u8.c | 81 +- tests/validation/asin_f32.c | 27 +- tests/validation/asin_i8.c | 52 +- tests/validation/asin_u8.c | 52 +- tests/validation/asinh_f32.c | 27 +- tests/validation/asinh_i8.c | 52 +- tests/validation/asinh_u8.c | 52 +- tests/validation/atan_f32.c | 27 +- tests/validation/atan_i8.c | 42 +- tests/validation/atan_u8.c | 42 +- tests/validation/atanh_f32.c | 27 +- tests/validation/atanh_i8.c | 43 +- tests/validation/atanh_u8.c | 43 +- tests/validation/averagepool3d_f32.c | 61 +- tests/validation/averagepool3d_i8.c | 90 +- tests/validation/averagepool3d_u8.c | 88 +- tests/validation/averagepool_f32.c | 49 +- tests/validation/averagepool_i8.c | 79 +- tests/validation/averagepool_nchw_f32.c | 49 +- tests/validation/averagepool_nchw_i8.c | 76 +- tests/validation/averagepool_nchw_u8.c | 76 +- tests/validation/averagepool_u8.c | 77 +- tests/validation/batch_norm_f32.c | 53 +- tests/validation/batch_norm_i8.c | 166 +- tests/validation/batch_norm_u8.c | 167 +- tests/validation/batch_to_space_f32.c | 
46 +- tests/validation/batch_to_space_i8.c | 78 +- tests/validation/batch_to_space_u8.c | 74 +- tests/validation/broadcast_to_f32.c | 42 +- tests/validation/broadcast_to_i8.c | 63 +- tests/validation/broadcast_to_u8.c | 63 +- tests/validation/ceil_f32.c | 37 +- tests/validation/ceil_i8.c | 59 +- tests/validation/ceil_u8.c | 59 +- tests/validation/clip_f32.c | 37 +- tests/validation/clip_i8.c | 60 +- tests/validation/clip_u8.c | 61 +- tests/validation/concat_f32.c | 67 +- tests/validation/concat_i8.c | 82 +- tests/validation/concat_u8.c | 82 +- tests/validation/convolution3d_f32.c | 114 +- tests/validation/convolution3d_i8.c | 186 +- tests/validation/convolution3d_u8.c | 186 +- .../validation/convolution_channel_nchw_i8.c | 128 +- .../validation/convolution_channel_nchw_u8.c | 130 +- tests/validation/convolution_f32.c | 84 +- tests/validation/convolution_i8.c | 154 +- tests/validation/convolution_nchw_f32.c | 87 +- tests/validation/convolution_nchw_i8.c | 148 +- tests/validation/convolution_nchw_u8.c | 146 +- tests/validation/convolution_relu6_i8.c | 156 +- tests/validation/convolution_relu6_nchw_i8.c | 158 +- tests/validation/convolution_relu6_nchw_u8.c | 156 +- tests/validation/convolution_relu6_u8.c | 153 +- tests/validation/convolution_relu_i8.c | 154 +- tests/validation/convolution_relu_nchw_i8.c | 158 +- tests/validation/convolution_relu_nchw_u8.c | 158 +- tests/validation/convolution_relu_u8.c | 156 +- tests/validation/convolution_u8.c | 155 +- tests/validation/cos_f32.c | 27 +- tests/validation/cos_i8.c | 43 +- tests/validation/cos_u8.c | 43 +- tests/validation/cosh_f32.c | 27 +- tests/validation/cosh_i8.c | 40 +- tests/validation/cosh_u8.c | 42 +- tests/validation/cumprod_f32.c | 38 +- tests/validation/cumprod_i8.c | 63 +- tests/validation/cumprod_u8.c | 63 +- tests/validation/cumsum_f32.c | 40 +- tests/validation/cumsum_i8.c | 64 +- tests/validation/cumsum_u8.c | 66 +- tests/validation/deconvolution3d_f32.c | 124 +- tests/validation/deconvolution3d_u8.c | 
152 +- tests/validation/deconvolution_f32.c | 80 +- tests/validation/deconvolution_i8.c | 156 +- tests/validation/deconvolution_nchw_f32.c | 89 +- tests/validation/deconvolution_nchw_i8.c | 151 +- tests/validation/deconvolution_nchw_u8.c | 151 +- tests/validation/deconvolution_u8.c | 156 +- tests/validation/depth_to_space_f32.c | 38 +- tests/validation/depth_to_space_i8.c | 65 +- tests/validation/depth_to_space_u8.c | 65 +- tests/validation/depthwise_convolution_f32.c | 87 +- tests/validation/depthwise_convolution_i8.c | 164 +- .../depthwise_convolution_nchw_f32.c | 92 +- .../depthwise_convolution_nchw_i8.c | 153 +- .../depthwise_convolution_nchw_u8.c | 154 +- .../depthwise_convolution_relu6_i8.c | 164 +- .../depthwise_convolution_relu6_nchw_i8.c | 152 +- .../depthwise_convolution_relu6_nchw_u8.c | 152 +- .../depthwise_convolution_relu6_u8.c | 164 +- .../depthwise_convolution_relu_i8.c | 164 +- .../depthwise_convolution_relu_nchw_i8.c | 153 +- .../depthwise_convolution_relu_nchw_u8.c | 153 +- .../depthwise_convolution_relu_u8.c | 164 +- tests/validation/depthwise_convolution_u8.c | 163 +- .../validation/depthwise_deconvolution_f32.c | 91 +- tests/validation/depthwise_deconvolution_i8.c | 159 +- .../depthwise_deconvolution_nchw_f32.c | 92 +- .../depthwise_deconvolution_nchw_u8.c | 122 +- tests/validation/depthwise_deconvolution_u8.c | 159 +- tests/validation/dequantize_f32.c | 30 +- tests/validation/div_f32.c | 42 +- tests/validation/div_i8.c | 87 +- tests/validation/div_u8.c | 87 +- tests/validation/elu_f32.c | 37 +- tests/validation/elu_i8.c | 46 +- tests/validation/elu_u8.c | 45 +- tests/validation/equal_f32.c | 33 +- tests/validation/equal_i8.c | 73 +- tests/validation/equal_u8.c | 74 +- tests/validation/erf_f32.c | 27 +- tests/validation/erf_i8.c | 50 +- tests/validation/erf_u8.c | 50 +- tests/validation/exp_f32.c | 27 +- tests/validation/exp_i8.c | 50 +- tests/validation/exp_u8.c | 50 +- tests/validation/expand_dims_f32.c | 32 +- 
tests/validation/expand_dims_i8.c | 60 +- tests/validation/expand_dims_u8.c | 60 +- tests/validation/expm1_f32.c | 28 +- tests/validation/expm1_i8.c | 53 +- tests/validation/expm1_u8.c | 53 +- tests/validation/flatten_f32.c | 28 +- tests/validation/flatten_i8.c | 54 +- tests/validation/flatten_u8.c | 54 +- tests/validation/floor_div_f32.c | 37 +- tests/validation/floor_div_i8.c | 79 +- tests/validation/floor_div_u8.c | 81 +- tests/validation/floor_f32.c | 37 +- tests/validation/floor_i8.c | 63 +- tests/validation/floor_mod_f32.c | 47 +- tests/validation/floor_mod_i8.c | 98 +- tests/validation/floor_mod_u8.c | 98 +- tests/validation/floor_u8.c | 63 +- tests/validation/fullyconnected_f32.c | 50 +- tests/validation/fullyconnected_i8.c | 85 +- tests/validation/fullyconnected_u8.c | 78 +- tests/validation/gather_f32.c | 39 +- tests/validation/gather_i8.c | 61 +- tests/validation/gather_nd_f32.c | 41 +- tests/validation/gather_nd_i8.c | 64 +- tests/validation/gather_nd_u8.c | 62 +- tests/validation/gather_u8.c | 61 +- tests/validation/global_avgpool_i8.c | 72 +- tests/validation/global_avgpool_nchw_i8.c | 73 +- tests/validation/global_avgpool_nchw_u8.c | 72 +- tests/validation/global_avgpool_u8.c | 70 +- tests/validation/global_maxpool_i8.c | 70 +- tests/validation/global_maxpool_nchw_i8.c | 72 +- tests/validation/global_maxpool_nchw_u8.c | 72 +- tests/validation/global_maxpool_u8.c | 70 +- tests/validation/greater_equal_f32.c | 39 +- tests/validation/greater_equal_i8.c | 95 +- tests/validation/greater_equal_u8.c | 95 +- tests/validation/greater_f32.c | 39 +- tests/validation/greater_i8.c | 67 +- tests/validation/greater_u8.c | 67 +- tests/validation/group_convolution_f32.c | 86 +- tests/validation/group_convolution_i8.c | 157 +- tests/validation/group_convolution_nchw_f32.c | 82 +- tests/validation/group_convolution_nchw_i8.c | 148 +- tests/validation/group_convolution_nchw_u8.c | 148 +- tests/validation/group_convolution_relu6_i8.c | 154 +- 
.../group_convolution_relu6_nchw_i8.c | 157 +- .../group_convolution_relu6_nchw_u8.c | 157 +- tests/validation/group_convolution_relu6_u8.c | 158 +- tests/validation/group_convolution_relu_i8.c | 154 +- .../group_convolution_relu_nchw_i8.c | 157 +- .../group_convolution_relu_nchw_u8.c | 157 +- tests/validation/group_convolution_relu_u8.c | 157 +- tests/validation/group_convolution_u8.c | 147 +- tests/validation/hard_sigmoid_f32.c | 28 +- tests/validation/hard_sigmoid_i8.c | 53 +- tests/validation/hard_sigmoid_u8.c | 51 +- tests/validation/im2col_f32.c | 67 +- tests/validation/im2col_i8.c | 110 +- tests/validation/im2col_u8.c | 112 +- tests/validation/is_nan_f32.c | 27 +- tests/validation/l2_norm_f32.c | 35 +- tests/validation/l2_norm_i8.c | 53 +- tests/validation/l2_norm_u8.c | 55 +- tests/validation/leaky_relu_f32.c | 39 +- tests/validation/leaky_relu_i8.c | 62 +- tests/validation/leaky_relu_u8.c | 64 +- tests/validation/less_equal_f32.c | 39 +- tests/validation/less_equal_i8.c | 98 +- tests/validation/less_equal_u8.c | 98 +- tests/validation/less_f32.c | 41 +- tests/validation/less_i8.c | 94 +- tests/validation/less_u8.c | 94 +- tests/validation/log1p_f32.c | 33 +- tests/validation/log1p_i8.c | 55 +- tests/validation/log1p_u8.c | 57 +- tests/validation/log_f32.c | 31 +- tests/validation/log_i8.c | 57 +- tests/validation/log_softmax_f32.c | 32 +- tests/validation/log_softmax_i8.c | 54 +- tests/validation/log_softmax_u8.c | 54 +- tests/validation/log_u8.c | 55 +- tests/validation/logical_and_f32.c | 39 +- tests/validation/logical_and_i8.c | 93 +- tests/validation/logical_and_u8.c | 91 +- tests/validation/logical_not_f32.c | 31 +- tests/validation/logical_not_i8.c | 57 +- tests/validation/logical_not_u8.c | 57 +- tests/validation/logical_or_f32.c | 39 +- tests/validation/logical_or_i8.c | 93 +- tests/validation/logical_or_u8.c | 93 +- tests/validation/logical_xor_f32.c | 48 +- tests/validation/logical_xor_i8.c | 95 +- tests/validation/logical_xor_u8.c | 95 +- 
tests/validation/lrn_f32.c | 41 +- tests/validation/lrn_i8.c | 58 +- tests/validation/lrn_u8.c | 58 +- tests/validation/matmul_f32.c | 34 +- tests/validation/matmul_i8.c | 69 +- tests/validation/matmul_u8.c | 68 +- tests/validation/max_stride_f32.c | 75 +- tests/validation/max_stride_u8.c | 78 +- tests/validation/maximum_f32.c | 33 +- tests/validation/maximum_i8.c | 75 +- tests/validation/maximum_u8.c | 75 +- tests/validation/maxpool3d_f32.c | 59 +- tests/validation/maxpool3d_i8.c | 90 +- tests/validation/maxpool3d_u8.c | 88 +- tests/validation/maxpool_f32.c | 49 +- tests/validation/maxpool_nchw_f32.c | 49 +- tests/validation/maxpool_u8.c | 61 +- tests/validation/mean_stride_f32.c | 75 +- tests/validation/mean_stride_u8.c | 78 +- tests/validation/min_stride_f32.c | 75 +- tests/validation/min_stride_u8.c | 78 +- tests/validation/minimum_f32.c | 35 +- tests/validation/minimum_i8.c | 74 +- tests/validation/minimum_u8.c | 74 +- tests/validation/mod_f32.c | 41 +- tests/validation/mod_i8.c | 90 +- tests/validation/mod_u8.c | 90 +- tests/validation/mul_f32.c | 43 +- tests/validation/mul_i8.c | 91 +- tests/validation/mul_u8.c | 89 +- tests/validation/ndarray_size_f32.c | 27 +- tests/validation/ndarray_size_i8.c | 38 +- tests/validation/ndarray_size_u8.c | 38 +- tests/validation/negative_f32.c | 27 +- tests/validation/negative_i8.c | 53 +- tests/validation/negative_u8.c | 53 +- tests/validation/non_max_suppression_f32.c | 38 +- tests/validation/not_equal_f32.c | 39 +- tests/validation/not_equal_i8.c | 98 +- tests/validation/not_equal_u8.c | 100 +- tests/validation/not_f32.c | 27 +- tests/validation/not_u32.c | 29 +- tests/validation/or_u32.c | 32 +- tests/validation/pad_f32.c | 49 +- tests/validation/pad_nchw_f32.c | 49 +- tests/validation/pad_nchw_u8.c | 60 +- tests/validation/pad_u8.c | 59 +- tests/validation/pow_f32.c | 47 +- tests/validation/pow_i8.c | 97 +- tests/validation/pow_u8.c | 97 +- tests/validation/prelu_f32.c | 43 +- tests/validation/prelu_i8.c | 79 +- 
tests/validation/prelu_nhwc_f32.c | 44 +- tests/validation/prelu_nhwc_i8.c | 79 +- tests/validation/prelu_nhwc_u8.c | 79 +- tests/validation/prelu_u8.c | 79 +- tests/validation/prod_stride_f32.c | 75 +- tests/validation/prod_stride_u8.c | 78 +- tests/validation/psroipooling_f32.c | 55 +- tests/validation/psroipooling_u8.c | 69 +- tests/validation/reduce_logsumexp_f32.c | 52 +- tests/validation/reduce_logsumexp_i8.c | 73 +- tests/validation/reduce_logsumexp_u8.c | 73 +- tests/validation/reduce_max_f32.c | 53 +- tests/validation/reduce_max_i8.c | 75 +- tests/validation/reduce_max_u8.c | 73 +- tests/validation/reduce_mean_f32.c | 52 +- tests/validation/reduce_mean_i8.c | 76 +- tests/validation/reduce_mean_u8.c | 78 +- tests/validation/reduce_min_f32.c | 52 +- tests/validation/reduce_min_i8.c | 75 +- tests/validation/reduce_min_u8.c | 73 +- tests/validation/reduce_prod_f32.c | 50 +- tests/validation/reduce_prod_i8.c | 74 +- tests/validation/reduce_prod_u8.c | 74 +- tests/validation/reduce_sum_f32.c | 52 +- tests/validation/reduce_sum_i8.c | 76 +- tests/validation/reduce_sum_u8.c | 75 +- tests/validation/relu1_f32.c | 35 +- tests/validation/relu1_i8.c | 56 +- tests/validation/relu1_u8.c | 56 +- tests/validation/relu6_f32.c | 35 +- tests/validation/relu6_i8.c | 57 +- tests/validation/relu6_u8.c | 57 +- tests/validation/relu_f32.c | 35 +- tests/validation/relu_i8.c | 58 +- tests/validation/relu_u8.c | 58 +- tests/validation/relun_f32.c | 37 +- tests/validation/relun_i8.c | 60 +- tests/validation/relun_u8.c | 60 +- tests/validation/reshape_f32.c | 44 +- tests/validation/resize_bilinear_f32.c | 52 +- tests/validation/resize_bilinear_i8.c | 76 +- tests/validation/resize_bilinear_u8.c | 76 +- tests/validation/resize_nearestneighbor_f32.c | 52 +- tests/validation/resize_nearestneighbor_i8.c | 74 +- .../resize_nearestneighbor_nchw_f32.c | 52 +- .../resize_nearestneighbor_nchw_i8.c | 75 +- .../resize_nearestneighbor_nchw_u8.c | 75 +- tests/validation/resize_nearestneighbor_u8.c 
| 74 +- tests/validation/reverse_f32.c | 36 +- tests/validation/reverse_i8.c | 61 +- tests/validation/reverse_u8.c | 61 +- tests/validation/riscv_xt9xx/relu_fp16.c | 18 +- tests/validation/roialign_f32.c | 51 +- tests/validation/roipooling_f32.c | 49 +- tests/validation/roipooling_u8.c | 69 +- tests/validation/round_f32.c | 37 +- tests/validation/round_i8.c | 59 +- tests/validation/round_u8.c | 59 +- tests/validation/rsqrt_f32.c | 35 +- tests/validation/rsqrt_i8.c | 58 +- tests/validation/rsqrt_u8.c | 58 +- tests/validation/segment_max_f32.c | 56 +- tests/validation/segment_max_i8.c | 83 +- tests/validation/segment_max_u8.c | 83 +- tests/validation/segment_mean_f32.c | 56 +- tests/validation/segment_mean_i8.c | 81 +- tests/validation/segment_mean_u8.c | 82 +- tests/validation/segment_min_f32.c | 56 +- tests/validation/segment_min_i8.c | 79 +- tests/validation/segment_min_u8.c | 86 +- tests/validation/segment_prod_f32.c | 56 +- tests/validation/segment_prod_i8.c | 81 +- tests/validation/segment_prod_u8.c | 81 +- tests/validation/segment_sum_f32.c | 56 +- tests/validation/segment_sum_i8.c | 83 +- tests/validation/segment_sum_u8.c | 81 +- tests/validation/select_f32.c | 42 +- tests/validation/select_i8.c | 109 +- tests/validation/select_u8.c | 110 +- tests/validation/shuffle_channel_f32.c | 35 +- tests/validation/shuffle_channel_i8.c | 54 +- tests/validation/shuffle_channel_nchw_f32.c | 35 +- tests/validation/shuffle_channel_nchw_i8.c | 54 +- tests/validation/shuffle_channel_nchw_u8.c | 54 +- tests/validation/shuffle_channel_u8.c | 54 +- tests/validation/sigmoid_f32.c | 36 +- tests/validation/sigmoid_i8.c | 58 +- tests/validation/sigmoid_u8.c | 58 +- tests/validation/sign_f32.c | 27 +- tests/validation/sin_f32.c | 27 +- tests/validation/sin_i8.c | 44 +- tests/validation/sin_u8.c | 44 +- tests/validation/sinh_f32.c | 27 +- tests/validation/sinh_i8.c | 53 +- tests/validation/sinh_u8.c | 53 +- tests/validation/slice_f32.c | 59 +- tests/validation/slice_i8.c | 79 +- 
tests/validation/slice_u8.c | 79 +- tests/validation/softmax_f32.c | 40 +- tests/validation/softmax_i8.c | 64 +- tests/validation/softmax_u8.c | 64 +- tests/validation/softplus_f32.c | 37 +- tests/validation/softplus_i8.c | 62 +- tests/validation/softplus_u8.c | 60 +- tests/validation/softrelu_f32.c | 37 +- tests/validation/softrelu_i8.c | 59 +- tests/validation/softrelu_u8.c | 59 +- tests/validation/softsign_f32.c | 37 +- tests/validation/softsign_i8.c | 58 +- tests/validation/softsign_u8.c | 60 +- tests/validation/space_to_batch_f32.c | 46 +- tests/validation/space_to_batch_i8.c | 72 +- tests/validation/space_to_batch_u8.c | 72 +- tests/validation/space_to_depth_f32.c | 39 +- tests/validation/space_to_depth_i8.c | 69 +- tests/validation/space_to_depth_u8.c | 71 +- tests/validation/split_f32.c | 68 +- tests/validation/sqrt_f32.c | 37 +- tests/validation/sqrt_i8.c | 61 +- tests/validation/sqrt_u8.c | 59 +- tests/validation/square_f32.c | 37 +- tests/validation/squeeze_f32.c | 42 +- tests/validation/squeeze_i8.c | 67 +- tests/validation/squeeze_u8.c | 65 +- tests/validation/stack_f32.c | 49 +- tests/validation/stack_i8.c | 62 +- tests/validation/stack_u8.c | 65 +- tests/validation/strided_slice_f32.c | 63 +- tests/validation/strided_slice_i8.c | 85 +- tests/validation/strided_slice_u8.c | 85 +- tests/validation/sub_f32.c | 47 +- tests/validation/sub_i8.c | 95 +- tests/validation/sub_u8.c | 95 +- tests/validation/sum_stride_f32.c | 75 +- tests/validation/sum_stride_u8.c | 78 +- tests/validation/tan_f32.c | 27 +- tests/validation/tan_i8.c | 43 +- tests/validation/tan_u8.c | 43 +- tests/validation/tanh_f32.c | 27 +- tests/validation/tanh_i8.c | 41 +- tests/validation/tanh_u8.c | 43 +- tests/validation/threshold_relu_f32.c | 37 +- tests/validation/threshold_relu_i8.c | 60 +- tests/validation/threshold_relu_u8.c | 60 +- tests/validation/tile_f32.c | 39 +- tests/validation/tile_i8.c | 64 +- tests/validation/tile_u8.c | 66 +- tests/validation/topk_f32.c | 44 +- 
tests/validation/topk_i8.c | 59 +- tests/validation/topk_u8.c | 60 +- tests/validation/transpose_f32.c | 40 +- tests/validation/transpose_i8.c | 64 +- tests/validation/transpose_u8.c | 65 +- tests/validation/trunc_f32.c | 37 +- tests/validation/trunc_i8.c | 59 +- tests/validation/trunc_u8.c | 59 +- tests/validation/unsorted_segment_max_f32.c | 56 +- tests/validation/unsorted_segment_max_i8.c | 85 +- tests/validation/unsorted_segment_max_u8.c | 85 +- tests/validation/unsorted_segment_mean_f32.c | 56 +- tests/validation/unsorted_segment_mean_i8.c | 79 +- tests/validation/unsorted_segment_mean_u8.c | 79 +- tests/validation/unsorted_segment_min_f32.c | 56 +- tests/validation/unsorted_segment_min_i8.c | 83 +- tests/validation/unsorted_segment_min_u8.c | 83 +- tests/validation/unsorted_segment_prod_f32.c | 56 +- tests/validation/unsorted_segment_prod_i8.c | 81 +- tests/validation/unsorted_segment_prod_u8.c | 81 +- tests/validation/unsorted_segment_sum_f32.c | 56 +- tests/validation/unsorted_segment_sum_i8.c | 81 +- tests/validation/unsorted_segment_sum_u8.c | 81 +- tests/validation/unstack_f32.c | 51 +- tests/validation/unstack_i8.c | 74 +- tests/validation/unstack_u8.c | 75 +- tests/validation/xor_u32.c | 31 +- tests/validation/yuv_rgb_scale_f32.c | 37 +- tests/validation/yuv_rgb_scale_i8.c | 59 +- tests/validation/yuv_rgb_scale_u8.c | 59 +- tests/validation_graph/Makefile.anole | 124 + tests/validation_graph/Makefile.pnna | 74 + tests/validation_graph/add.c | 71 +- tests/validation_graph/argmax.c | 85 +- tests/validation_graph/avgpool.c | 106 +- tests/validation_graph/batch_normalization.c | 95 +- tests/validation_graph/batch_to_space_nd.c | 90 +- tests/validation_graph/c906/Makefile | 6 +- tests/validation_graph/c906/add.c | 51 +- tests/validation_graph/c906/avgpool.c | 43 +- tests/validation_graph/c906/concat.c | 47 +- tests/validation_graph/c906/convolution.c | 52 +- tests/validation_graph/c906/deconvolution.c | 54 +- tests/validation_graph/c906/depth_to_space.c | 
45 +- .../c906/depthwise_convolution.c | 52 +- tests/validation_graph/c906/div.c | 51 +- tests/validation_graph/c906/flatten.c | 44 +- tests/validation_graph/c906/fullyconnected.c | 52 +- tests/validation_graph/c906/global_avgpool.c | 44 +- tests/validation_graph/c906/global_maxpool.c | 43 +- .../validation_graph/c906/group_convolution.c | 54 +- tests/validation_graph/c906/leaky_relu.c | 43 +- tests/validation_graph/c906/maximum.c | 53 +- tests/validation_graph/c906/maxpool.c | 43 +- tests/validation_graph/c906/minimum.c | 53 +- tests/validation_graph/c906/pad.c | 43 +- tests/validation_graph/c906/relu.c | 43 +- tests/validation_graph/c906/relu1.c | 43 +- tests/validation_graph/c906/relu6.c | 43 +- tests/validation_graph/c906/reshape.c | 43 +- tests/validation_graph/c906/resize_bilinear.c | 43 +- .../c906/resize_nearest_neighbor.c | 44 +- tests/validation_graph/c906/sigmoid.c | 43 +- tests/validation_graph/c906/space_to_depth.c | 43 +- tests/validation_graph/c906/split.c | 43 +- tests/validation_graph/c906/squeeze.c | 43 +- tests/validation_graph/c906/sub.c | 54 +- tests/validation_graph/c906/tanh.c | 43 +- tests/validation_graph/c906/transpose.c | 43 +- tests/validation_graph/concat.c | 92 +- tests/validation_graph/convolution.c | 89 +- tests/validation_graph/crop.c | 90 +- tests/validation_graph/deconvolution.c | 118 +- tests/validation_graph/depth_to_space.c | 76 +- .../validation_graph/depthwise_convolution.c | 91 +- tests/validation_graph/div.c | 71 +- tests/validation_graph/flatten.c | 68 +- tests/validation_graph/fullyconnected.c | 87 +- tests/validation_graph/global_avgpool.c | 79 +- tests/validation_graph/global_maxpool.c | 79 +- tests/validation_graph/group_convolution.c | 94 +- tests/validation_graph/l2_normalization.c | 91 +- tests/validation_graph/leaky_relu.c | 76 +- tests/validation_graph/lrn.c | 83 +- tests/validation_graph/maximum.c | 85 +- tests/validation_graph/maxpool.c | 97 +- tests/validation_graph/mean.c | 102 +- 
tests/validation_graph/minimum.c | 85 +- tests/validation_graph/mul.c | 95 +- tests/validation_graph/negative.c | 70 +- tests/validation_graph/pad.c | 87 +- tests/validation_graph/prelu.c | 80 +- tests/validation_graph/relu.c | 73 +- tests/validation_graph/relu1.c | 75 +- tests/validation_graph/relu6.c | 75 +- tests/validation_graph/reshape.c | 82 +- tests/validation_graph/resize_bilinear.c | 78 +- .../resize_nearest_neighbor.c | 78 +- tests/validation_graph/sigmoid.c | 74 +- tests/validation_graph/softmax.c | 68 +- tests/validation_graph/space_to_batch_nd.c | 91 +- tests/validation_graph/space_to_depth.c | 76 +- tests/validation_graph/split.c | 103 +- tests/validation_graph/squeeze.c | 80 +- tests/validation_graph/strided_slice.c | 97 +- tests/validation_graph/sub.c | 94 +- tests/validation_graph/tanh.c | 73 +- tests/validation_graph/transpose.c | 78 +- tests/validation_layer/Makefile.c906 | 2 +- tests/validation_layer/Makefile.c908 | 44 + tests/validation_layer/Makefile.rvv | 2 +- tests/validation_layer/abs.cpp | 22 +- tests/validation_layer/acos.c | 31 +- tests/validation_layer/acosh.c | 32 +- tests/validation_layer/add.cpp | 55 +- tests/validation_layer/and.c | 35 +- tests/validation_layer/arange.c | 28 +- tests/validation_layer/argmax.c | 81 +- tests/validation_layer/argmin.c | 81 +- tests/validation_layer/asin.c | 33 +- tests/validation_layer/asinh.c | 33 +- tests/validation_layer/atan.c | 33 +- tests/validation_layer/atanh.c | 33 +- tests/validation_layer/averagepool.cpp | 69 +- tests/validation_layer/averagepool3d.c | 67 +- tests/validation_layer/batch_norm.c | 65 +- tests/validation_layer/batch_to_space.c | 53 +- tests/validation_layer/broadcast_to.c | 34 +- tests/validation_layer/ceil.c | 41 +- tests/validation_layer/clip.c | 43 +- tests/validation_layer/concat.cpp | 61 +- tests/validation_layer/convolution.cpp | 66 +- tests/validation_layer/convolution3d.c | 56 +- tests/validation_layer/convolution_relu.c | 93 +- 
tests/validation_layer/convolution_relu6.c | 93 +- tests/validation_layer/cos.c | 33 +- tests/validation_layer/cosh.c | 33 +- tests/validation_layer/cumprod.c | 42 +- tests/validation_layer/cumsum.c | 42 +- tests/validation_layer/deconvolution.c | 94 +- tests/validation_layer/deconvolution3d.c | 64 +- tests/validation_layer/depth_to_space.c | 44 +- .../depthwise_convolution.cpp | 59 +- .../depthwise_convolution_relu.c | 102 +- .../depthwise_convolution_relu6.c | 102 +- .../depthwise_deconvolution.c | 95 +- tests/validation_layer/div.c | 43 +- tests/validation_layer/elu.c | 41 +- tests/validation_layer/equal.c | 36 +- tests/validation_layer/erf.c | 33 +- tests/validation_layer/exp.c | 33 +- tests/validation_layer/expand_dims.c | 38 +- tests/validation_layer/expm1.c | 33 +- tests/validation_layer/flatten.c | 32 +- tests/validation_layer/floor.c | 45 +- tests/validation_layer/floor_div.c | 44 +- tests/validation_layer/floor_mod.c | 44 +- tests/validation_layer/fullyconnected.cpp | 48 +- tests/validation_layer/gather.c | 46 +- tests/validation_layer/gather_nd.c | 45 +- tests/validation_layer/global_avgpool.cpp | 41 +- tests/validation_layer/global_maxpool.cpp | 41 +- tests/validation_layer/greater.c | 44 +- tests/validation_layer/greater_equal.c | 44 +- tests/validation_layer/group_convolution.cpp | 58 +- .../validation_layer/group_convolution_relu.c | 95 +- .../group_convolution_relu6.c | 97 +- tests/validation_layer/hard_sigmoid.c | 32 +- tests/validation_layer/im2col.c | 71 +- tests/validation_layer/l2_norm.c | 39 +- tests/validation_layer/layer/common.c | 357 +- tests/validation_layer/layer/common.h | 820 +-- tests/validation_layer/leaky_relu.cpp | 46 +- tests/validation_layer/less.c | 44 +- tests/validation_layer/less_equal.c | 44 +- tests/validation_layer/log.c | 41 +- tests/validation_layer/log1p.c | 41 +- tests/validation_layer/log_softmax.c | 36 +- tests/validation_layer/logical_and.c | 44 +- tests/validation_layer/logical_not.c | 41 +- 
tests/validation_layer/logical_or.c | 44 +- tests/validation_layer/logical_xor.c | 44 +- tests/validation_layer/lrn.c | 45 +- tests/validation_layer/matmul.c | 40 +- tests/validation_layer/max_stride.c | 78 +- tests/validation_layer/maximum.c | 40 +- tests/validation_layer/maxpool.cpp | 53 +- tests/validation_layer/maxpool3d.c | 65 +- tests/validation_layer/mean_stride.c | 78 +- tests/validation_layer/min_stride.c | 78 +- tests/validation_layer/minimum.c | 38 +- tests/validation_layer/mod.c | 46 +- tests/validation_layer/mul.cpp | 39 +- tests/validation_layer/negative.c | 31 +- tests/validation_layer/non_max_suppression.c | 45 +- tests/validation_layer/not.c | 46 +- tests/validation_layer/not_equal.c | 44 +- tests/validation_layer/pad.cpp | 36 +- tests/validation_layer/power.c | 44 +- tests/validation_layer/prelu.c | 43 +- tests/validation_layer/prod_stride.c | 78 +- tests/validation_layer/psroipooling.c | 63 +- tests/validation_layer/reduce_logsumexp.c | 54 +- tests/validation_layer/reduce_max.c | 56 +- tests/validation_layer/reduce_mean.c | 54 +- tests/validation_layer/reduce_min.c | 54 +- tests/validation_layer/reduce_prod.c | 54 +- tests/validation_layer/reduce_sum.c | 54 +- tests/validation_layer/relu.cpp | 30 +- tests/validation_layer/relu1.c | 41 +- tests/validation_layer/relu6.c | 39 +- tests/validation_layer/relun.c | 41 +- tests/validation_layer/reshape.c | 48 +- tests/validation_layer/resize_bilinear.c | 56 +- .../validation_layer/resize_nearestneighbor.c | 56 +- tests/validation_layer/reverse.c | 40 +- tests/validation_layer/roialign.c | 57 +- tests/validation_layer/roipooling.c | 55 +- tests/validation_layer/round.c | 39 +- tests/validation_layer/rsqrt.c | 39 +- tests/validation_layer/segment_max.c | 58 +- tests/validation_layer/segment_mean.c | 62 +- tests/validation_layer/segment_min.c | 62 +- tests/validation_layer/segment_prod.c | 60 +- tests/validation_layer/segment_sum.c | 58 +- tests/validation_layer/select.c | 58 +- 
tests/validation_layer/shuffle_channel.c | 43 +- tests/validation_layer/sigmoid.cpp | 36 +- tests/validation_layer/sign.c | 31 +- tests/validation_layer/sin.c | 31 +- tests/validation_layer/sinh.c | 31 +- tests/validation_layer/slice.c | 59 +- tests/validation_layer/softmax.cpp | 44 +- tests/validation_layer/softplus.c | 39 +- tests/validation_layer/softrelu.c | 31 +- tests/validation_layer/softsign.c | 39 +- tests/validation_layer/space_to_batch.c | 54 +- tests/validation_layer/space_to_depth.c | 51 +- tests/validation_layer/split.c | 67 +- tests/validation_layer/sqrt.c | 39 +- tests/validation_layer/square.c | 39 +- tests/validation_layer/squeeze.c | 49 +- tests/validation_layer/stack.c | 51 +- tests/validation_layer/strided_slice.c | 60 +- tests/validation_layer/sub.c | 44 +- tests/validation_layer/sum_stride.cpp | 42 +- tests/validation_layer/tan.c | 31 +- tests/validation_layer/tanh.c | 31 +- tests/validation_layer/testutil.h | 162 +- tests/validation_layer/threshold_relu.c | 41 +- tests/validation_layer/tile.c | 52 +- tests/validation_layer/topk.c | 63 +- tests/validation_layer/transpose.c | 48 +- tests/validation_layer/trunc.c | 39 +- tests/validation_layer/unsorted_segment_max.c | 60 +- .../validation_layer/unsorted_segment_mean.c | 59 +- tests/validation_layer/unsorted_segment_min.c | 61 +- .../validation_layer/unsorted_segment_prod.c | 58 +- tests/validation_layer/unsorted_segment_sum.c | 58 +- tests/validation_layer/unstack.c | 60 +- tests/validation_layer/xor.c | 36 +- tests/validation_layer/yuv_rgb_scale.c | 39 +- tests/validation_xt800/Makefile.e804 | 4 +- tests/validation_xt800/Makefile.i805 | 4 +- tests/validation_xt800/Makefile.ref_i805 | 4 +- .../validation_xt800/avgpool_nonsquare_q7_1.c | 67 +- .../validation_xt800/avgpool_nonsquare_q7_2.c | 69 +- tests/validation_xt800/avgpool_q7_1.c | 67 +- tests/validation_xt800/avgpool_q7_2.c | 69 +- tests/validation_xt800/convolution_1x1_q7_1.c | 72 +- tests/validation_xt800/convolution_1x1_q7_2.c | 55 +- 
tests/validation_xt800/convolution_RGB_q7.c | 101 +- .../validation_xt800/convolution_basic_q7_1.c | 52 +- .../validation_xt800/convolution_basic_q7_2.c | 60 +- .../validation_xt800/convolution_basic_q7_3.c | 60 +- .../convolution_nonsquare_q7_1.c | 53 +- .../convolution_nonsquare_q7_2.c | 60 +- .../convolution_nonsquare_q7_3.c | 60 +- tests/validation_xt800/convolution_q15.c | 71 +- .../depthwise_convolution_nonsquare_q7.c | 97 +- .../depthwise_convolution_q7.c | 96 +- tests/validation_xt800/fullyconnected_q15.c | 83 +- tests/validation_xt800/fullyconnected_q7.c | 85 +- tests/validation_xt800/maxpool_q7_1.c | 65 +- tests/validation_xt800/maxpool_q7_2.c | 65 +- tests/validation_xt800/relu_q15.c | 40 +- tests/validation_xt800/relu_q7.c | 40 +- tests/validation_xt800/sigmoid_q15.c | 44 +- tests/validation_xt800/sigmoid_q7.c | 44 +- tests/validation_xt800/softmax_q15.c | 41 +- tests/validation_xt800/softmax_q7.c | 41 +- tests/validation_xt800/tanh_q15.c | 43 +- tests/validation_xt800/tanh_q7.c | 43 +- tests/validation_xt800/u8_testcases/add_u8.c | 49 +- tests/validation_xt800/u8_testcases/clip_u8.c | 46 +- .../u8_testcases/convolution_1x1_u8.c | 104 +- .../u8_testcases/convolution_u8.c | 103 +- .../u8_testcases/depthwise_convolution_u8.c | 101 +- .../u8_testcases/fullyconnected_u8.c | 61 +- .../u8_testcases/maxpool_u8.c | 70 +- tests/validation_xt800/u8_testcases/mul_u8.c | 47 +- .../validation_xt800/u8_testcases/relu6_u8.c | 42 +- tests/validation_xt800/u8_testcases/relu_u8.c | 39 +- .../u8_testcases/reshape_u8.c | 41 +- tests/validation_xt800/verify_avgpool_q7.c | 68 +- .../validation_xt800/verify_convolution_q15.c | 88 +- .../validation_xt800/verify_convolution_q7.c | 87 +- .../verify_depthwise_conv2d_q7.c | 89 +- tests/validation_xt800/verify_maxpool_q7.c | 66 +- version | 2 +- 1652 files changed, 120453 insertions(+), 64181 deletions(-) delete mode 100644 include/csi_c906.h delete mode 100644 include/csi_debug.h delete mode 100644 include/csi_e804.h delete mode 
100644 include/csi_gref.h delete mode 100644 include/csi_i805.h delete mode 100644 include/csi_memory.h delete mode 100644 include/csi_ref.h delete mode 100644 include/csi_ref_i805.h delete mode 100644 include/csi_thead_rvv.h delete mode 100644 include/csi_utils.h rename include/{csi_internal.h => csinn_data_structure.h} (72%) create mode 100644 include/csinn_runtime.h delete mode 100644 include/include_xt800/csi_i805_nnfunction.h delete mode 100644 include/include_xt800/csi_instance.h delete mode 100644 include/include_xt800/csi_nn_tables.h delete mode 100644 include/include_xt800/csi_nnsupportfunctions.h delete mode 100644 include/include_xt800/csky_dsp2_nnfunctions.h rename include/{csi_c860.h => shl_c860.h} (77%) create mode 100644 include/shl_c906.h create mode 100644 include/shl_c908.h create mode 100644 include/shl_debug.h create mode 100644 include/shl_e804.h create mode 100644 include/shl_gref.h create mode 100644 include/shl_i805.h create mode 100644 include/shl_memory.h rename include/{csi_node.h => shl_node.h} (50%) create mode 100644 include/shl_ref.h create mode 100644 include/shl_ref_i805.h create mode 100644 include/shl_thead_rvv.h create mode 100644 include/shl_utils.h rename source/c860_opt/{csi_u8_to_f32_c860.S => shl_c860_u8_to_f32.S} (91%) rename source/c906_opt/{convolution_1x1.c => convolution_1x1_fp32.c} (50%) rename source/c906_opt/{convolution_sgemm.c => convolution_sgemm_fp32.c} (54%) delete mode 100644 source/c906_opt/depthwise_convolution_3x3.c create mode 100644 source/c906_opt/depthwise_convolution_3x3_fp32.c delete mode 100644 source/c906_opt/depthwise_convolution_3x3_pack4.c create mode 100644 source/c906_opt/depthwise_convolution_3x3_pack4_fp32.c rename source/c906_opt/{depthwise_convolution_5x5.c => depthwise_convolution_5x5_fp32.c} (56%) rename source/c906_opt/{depthwise_convolution_relu_5x5.c => depthwise_convolution_relu_3x3_fp32.c} (79%) rename source/c906_opt/{depthwise_convolution_relu_3x3_pack4.c => 
depthwise_convolution_relu_3x3_pack4_fp32.c} (77%) rename source/c906_opt/{depthwise_convolution_relu_3x3.c => depthwise_convolution_relu_5x5_fp32.c} (79%) create mode 100644 source/c906_opt/gemm_fp32.c rename source/c906_opt/{sgemv.c => gemv_fp32.c} (92%) delete mode 100644 source/c906_opt/sgemm.c create mode 100644 source/c906_opt/shl_c906_u8_to_f32.S create mode 100644 source/c908_opt/avgpool.c create mode 100644 source/c908_opt/convolution.c create mode 100644 source/c908_opt/convolution_1x1_fp16.c create mode 100644 source/c908_opt/convolution_1x1_fp16_pack1ton.c create mode 100644 source/c908_opt/convolution_1x1_fp16_packn.c create mode 100644 source/c908_opt/convolution_1x1_fp16_packnto1.c create mode 100644 source/c908_opt/convolution_1x1_fp32.c create mode 100644 source/c908_opt/convolution_1x1_fp32_pack1ton.c create mode 100644 source/c908_opt/convolution_1x1_fp32_packn.c create mode 100644 source/c908_opt/convolution_1x1_fp32_packnto1.c create mode 100644 source/c908_opt/convolution_1x1_int8.c create mode 100644 source/c908_opt/convolution_1x1_int8_pack1ton.c create mode 100644 source/c908_opt/convolution_1x1_int8_packn.c create mode 100644 source/c908_opt/convolution_1x1_int8_packnto1.c create mode 100644 source/c908_opt/convolution_3x3_fp16.c create mode 100644 source/c908_opt/convolution_3x3_fp16_packn.c create mode 100644 source/c908_opt/convolution_3x3_fp16_packn_1.c create mode 100644 source/c908_opt/convolution_3x3_fp32.c create mode 100644 source/c908_opt/convolution_3x3_fp32_packn.c create mode 100644 source/c908_opt/convolution_3x3_fp32_packn_1.c create mode 100644 source/c908_opt/convolution_3x3_int8.c create mode 100644 source/c908_opt/convolution_3x3_int8_packn.c create mode 100644 source/c908_opt/convolution_3x3_int8_packn_1.c create mode 100644 source/c908_opt/convolution_gemm_fp16.c create mode 100644 source/c908_opt/convolution_gemm_fp16_pack1ton.c create mode 100644 source/c908_opt/convolution_gemm_fp16_packn.c create mode 100644 
source/c908_opt/convolution_gemm_fp16_packnto1.c create mode 100644 source/c908_opt/convolution_gemm_fp32.c create mode 100644 source/c908_opt/convolution_gemm_fp32_pack1ton.c create mode 100644 source/c908_opt/convolution_gemm_fp32_packn.c create mode 100644 source/c908_opt/convolution_gemm_fp32_packnto1.c create mode 100644 source/c908_opt/convolution_gemm_int8.c create mode 100644 source/c908_opt/convolution_gemm_int8_pack1ton.c create mode 100644 source/c908_opt/convolution_gemm_int8_packn.c create mode 100644 source/c908_opt/convolution_gemm_int8_packnto1.c create mode 100644 source/c908_opt/depthwise_convolution.c create mode 100644 source/c908_opt/fullyconnected.c create mode 100644 source/c908_opt/gemm_fp16.c create mode 100644 source/c908_opt/gemm_fp16_packn.c create mode 100644 source/c908_opt/gemm_fp16_v256.c create mode 100644 source/c908_opt/gemm_fp32.c create mode 100644 source/c908_opt/gemm_fp32_packn.c create mode 100644 source/c908_opt/gemm_fp32_v256.c create mode 100644 source/c908_opt/gemm_int16_packn.c create mode 100644 source/c908_opt/gemm_int8.c create mode 100644 source/c908_opt/gemm_int8_packn.c create mode 100644 source/c908_opt/gemm_int8_v256.c create mode 100644 source/c908_opt/gemm_kernel/gemm_fp16_ncxhwx.S create mode 100644 source/c908_opt/gemm_kernel/gemm_fp32_ncxhwx.S create mode 100644 source/c908_opt/gemm_kernel/gemm_int16_ncxhwx.S create mode 100644 source/c908_opt/gemm_kernel/gemm_int4_ncxhwx.S create mode 100644 source/c908_opt/gemm_kernel/gemm_int8_ncxhwx.S create mode 100644 source/c908_opt/maxpool.c create mode 100644 source/c908_opt/reorder.c create mode 100644 source/c908_opt/setup.c rename source/e804_opt/activation/{csi_xt800p_nn_activations_q15.S => shl_xt800p_nn_activations_q15.S} (86%) rename source/e804_opt/activation/{csi_xt800p_nn_activations_q7.S => shl_xt800p_nn_activations_q7.S} (82%) rename source/e804_opt/activation/{csi_xt800p_relu_q15.S => shl_xt800p_relu_q15.S} (78%) rename 
source/e804_opt/activation/{csi_xt800p_relu_q7.S => shl_xt800p_relu_q7.S} (81%) rename source/e804_opt/convolution/{csi_xt800p_convolve_1x1_HWC_q7_fast.S => shl_xt800p_convolve_1x1_HWC_q7_fast.S} (93%) rename source/e804_opt/convolution/{csi_xt800p_convolve_HWC_q15_basic.S => shl_xt800p_convolve_HWC_q15_basic.S} (94%) rename source/e804_opt/convolution/{csi_xt800p_convolve_HWC_q7_RGB.S => shl_xt800p_convolve_HWC_q7_RGB.S} (94%) rename source/e804_opt/convolution/{csi_xt800p_convolve_HWC_q7_basic.S => shl_xt800p_convolve_HWC_q7_basic.S} (94%) rename source/e804_opt/convolution/{csi_xt800p_depthwise_separable_conv_HWC_q7.S => shl_xt800p_depthwise_separable_conv_HWC_q7.S} (92%) rename include/include_xt800/csky_vdsp2_nnfunctions.h => source/e804_opt/e804_function.h (56%) rename source/e804_opt/fully-connect/{csi_xt800p_fully_connected_mat_q7_vec_q15.S => shl_xt800p_fully_connected_mat_q7_vec_q15.S} (88%) rename source/e804_opt/fully-connect/{csi_xt800p_fully_connected_q15.S => shl_xt800p_fully_connected_q15.S} (90%) rename source/e804_opt/fully-connect/{csi_xt800p_fully_connected_q7.S => shl_xt800p_fully_connected_q7.S} (90%) delete mode 100644 source/e804_opt/nn-support/csi_xt800p_nntables.c create mode 100644 source/e804_opt/nn-support/shl_xt800p_nntables.c rename source/e804_opt/pooling/{csi_xt800p_pool_q7_HWC.S => shl_xt800p_pool_q7_HWC.S} (93%) rename source/e804_opt/softmax/{csi_xt800p_softmax_q15.S => shl_xt800p_softmax_q15.S} (91%) rename source/e804_opt/softmax/{csi_xt800p_softmax_q7.S => shl_xt800p_softmax_q7.S} (91%) create mode 100644 source/graph_ref/data_convert.c rename source/i805_opt/activation/{csi_i805_clip_8.S => shl_i805_clip_8.S} (91%) rename source/i805_opt/activation/{csi_i805_relu6_8.S => shl_i805_relu6_8.S} (89%) rename source/i805_opt/activation/{csi_i805_relu_8.S => shl_i805_relu_8.S} (89%) rename source/i805_opt/activation/{csi_xt800v_nn_activations_q15.S => shl_xt800v_nn_activations_q15.S} (92%) rename 
source/i805_opt/activation/{csi_xt800v_nn_activations_q15_fast.S => shl_xt800v_nn_activations_q15_fast.S} (84%) rename source/i805_opt/activation/{csi_xt800v_nn_activations_q7.S => shl_xt800v_nn_activations_q7.S} (90%) rename source/i805_opt/activation/{csi_xt800v_nn_activations_q7_fast.S => shl_xt800v_nn_activations_q7_fast.S} (75%) rename source/i805_opt/activation/{csi_xt800v_relu_q15.S => shl_xt800v_relu_q15.S} (78%) rename source/i805_opt/activation/{csi_xt800v_relu_q7.S => shl_xt800v_relu_q7.S} (79%) rename source/i805_opt/basic_math/{csi_i805_elementwise_add_8.S => shl_i805_elementwise_add_8.S} (92%) rename source/i805_opt/basic_math/{csi_i805_elementwise_mul_8.S => shl_i805_elementwise_mul_8.S} (90%) rename source/i805_opt/convolution/{csi_i805_convolution_1x1_8.S => shl_i805_convolution_1x1_8.S} (95%) rename source/i805_opt/convolution/{csi_i805_convolution_8.S => shl_i805_convolution_8.S} (97%) rename source/i805_opt/convolution/{csi_i805_depthwise_convolution_8.S => shl_i805_depthwise_convolution_8.S} (98%) rename source/i805_opt/convolution/{csi_xt800v_convolve_1x1_HWC_q7_fast.S => shl_xt800v_convolve_1x1_HWC_q7_fast.S} (94%) rename source/i805_opt/convolution/{csi_xt800v_convolve_HWC_q15_basic.S => shl_xt800v_convolve_HWC_q15_basic.S} (94%) rename source/i805_opt/convolution/{csi_xt800v_convolve_HWC_q7_RGB.S => shl_xt800v_convolve_HWC_q7_RGB.S} (94%) rename source/i805_opt/convolution/{csi_xt800v_convolve_HWC_q7_basic.S => shl_xt800v_convolve_HWC_q7_basic.S} (94%) rename source/i805_opt/convolution/{csi_xt800v_convolve_HWC_q7_fast_nonsquare.S => shl_xt800v_convolve_HWC_q7_fast_nonsquare.S} (98%) rename source/i805_opt/convolution/{csi_xt800v_depthwise_separable_conv_HWC_q7.S => shl_xt800v_depthwise_separable_conv_HWC_q7.S} (93%) rename source/i805_opt/convolution/{csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S => shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S} (93%) rename source/i805_opt/fully-connect/{csi_i805_fullyconnected_8.S => 
shl_i805_fullyconnected_8.S} (94%) rename source/i805_opt/fully-connect/{csi_xt800v_fully_connected_mat_q7_vec_q15.S => shl_xt800v_fully_connected_mat_q7_vec_q15.S} (93%) rename source/i805_opt/fully-connect/{csi_xt800v_fully_connected_q15.S => shl_xt800v_fully_connected_q15.S} (90%) rename source/i805_opt/fully-connect/{csi_xt800v_fully_connected_q7x4.S => shl_xt800v_fully_connected_q7x4.S} (89%) rename source/i805_opt/gemm/{csi_i805_mat_mult_nt_t_8.S => shl_i805_mat_mult_nt_t_8.S} (95%) rename source/i805_opt/gemm/{csi_i805_vec_mat_mult_8.S => shl_i805_vec_mat_mult_8.S} (94%) create mode 100644 source/i805_opt/i805_function.h delete mode 100644 source/i805_opt/nn-support/csi_xt800v_nntables.c create mode 100644 source/i805_opt/nn-support/shl_xt800v_nntables.c rename source/i805_opt/pooling/{csi_i805_maxpool_8.S => shl_i805_maxpool_8.S} (94%) rename source/i805_opt/pooling/{csi_xt800v_avepool_q7_HWC_nonsquare.S => shl_xt800v_avepool_q7_HWC_nonsquare.S} (95%) rename source/i805_opt/pooling/{csi_xt800v_pool_q7_HWC.S => shl_xt800v_pool_q7_HWC.S} (93%) rename source/i805_opt/reshape/{csi_i805_reshape_8.S => shl_i805_reshape_8.S} (82%) rename source/i805_opt/softmax/{csi_xt800v_softmax_q15.S => shl_xt800v_softmax_q15.S} (93%) rename source/i805_opt/softmax/{csi_xt800v_softmax_q7.S => shl_xt800v_softmax_q7.S} (92%) delete mode 100644 source/i805_ref/activation/csi_nn_activations_q15.c delete mode 100644 source/i805_ref/activation/csi_nn_activations_q7.c delete mode 100644 source/i805_ref/activation/csi_relu_q15.c delete mode 100644 source/i805_ref/activation/csi_relu_q7.c create mode 100644 source/i805_ref/activation/shl_activations_q15.c create mode 100644 source/i805_ref/activation/shl_activations_q7.c create mode 100644 source/i805_ref/activation/shl_relu_q15.c create mode 100644 source/i805_ref/activation/shl_relu_q7.c delete mode 100644 source/i805_ref/convolution/csi_convolve_1x1_HWC_q7_fast.c delete mode 100644 
source/i805_ref/convolution/csi_convolve_HWC_q15_basic.c delete mode 100644 source/i805_ref/convolution/csi_convolve_HWC_q15_fast.c delete mode 100644 source/i805_ref/convolution/csi_convolve_HWC_q7_RGB.c delete mode 100644 source/i805_ref/convolution/csi_convolve_HWC_q7_basic.c delete mode 100644 source/i805_ref/convolution/csi_convolve_HWC_q7_fast.c delete mode 100644 source/i805_ref/convolution/csi_convolve_HWC_q7_fast_nonsquare.c delete mode 100644 source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7.c delete mode 100644 source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7_nonsquare.c delete mode 100644 source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15.c delete mode 100644 source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15_reordered.c create mode 100644 source/i805_ref/convolution/shl_convolve_1x1_HWC_q7_fast.c create mode 100644 source/i805_ref/convolution/shl_convolve_HWC_q15_basic.c create mode 100644 source/i805_ref/convolution/shl_convolve_HWC_q15_fast.c create mode 100644 source/i805_ref/convolution/shl_convolve_HWC_q7_RGB.c create mode 100644 source/i805_ref/convolution/shl_convolve_HWC_q7_basic.c create mode 100644 source/i805_ref/convolution/shl_convolve_HWC_q7_fast.c create mode 100644 source/i805_ref/convolution/shl_convolve_HWC_q7_fast_nonsquare.c create mode 100644 source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7.c create mode 100644 source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7_nonsquare.c delete mode 100644 source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15.c delete mode 100644 source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15_opt.c delete mode 100644 source/i805_ref/fully-connect/csi_fully_connected_q15.c delete mode 100644 source/i805_ref/fully-connect/csi_fully_connected_q15_opt.c delete mode 100644 source/i805_ref/fully-connect/csi_fully_connected_q7.c delete mode 100644 source/i805_ref/fully-connect/csi_fully_connected_q7_opt.c create mode 
100644 source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15.c create mode 100644 source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15_opt.c create mode 100644 source/i805_ref/fully-connect/shl_fully_connected_q15.c create mode 100644 source/i805_ref/fully-connect/shl_fully_connected_q15_opt.c create mode 100644 source/i805_ref/fully-connect/shl_fully_connected_q7.c create mode 100644 source/i805_ref/fully-connect/shl_fully_connected_q7_opt.c rename include/include_xt800/csi_nnfunctions.h => source/i805_ref/i805_ref_function.h (92%) delete mode 100644 source/i805_ref/nn-support/csi_nntables.c delete mode 100644 source/i805_ref/nn-support/csi_q7_to_q15_no_shift.c delete mode 100644 source/i805_ref/nn-support/csi_q7_to_q15_reordered_no_shift.c create mode 100644 source/i805_ref/nn-support/i805_ref_support.h create mode 100644 source/i805_ref/nn-support/shl_nntables.c delete mode 100644 source/i805_ref/pooling/csi_avepool_q7_HWC_nonsquare.c delete mode 100644 source/i805_ref/pooling/csi_pool_q7_HWC.c create mode 100644 source/i805_ref/pooling/shl_avepool_q7_HWC_nonsquare.c create mode 100644 source/i805_ref/pooling/shl_pool_q7_HWC.c rename source/i805_ref/softmax/{csi_softmax_q15.c => shl_softmax_q15.c} (59%) rename source/i805_ref/softmax/{csi_softmax_q7.c => shl_softmax_q7.c} (57%) create mode 100644 source/nn2/depthwise_conv2d.c create mode 100644 source/nn2/depthwise_conv2d_relu.c create mode 100644 source/nn2/format.c create mode 100644 source/nn2/group_conv2d.c create mode 100644 source/thead_rvv/avgpool_2x2_fp16_packn.c rename source/thead_rvv/{avgpool_2x2.c => avgpool_2x2_fp32.c} (96%) create mode 100644 source/thead_rvv/avgpool_2x2_fp32_packn.c create mode 100644 source/thead_rvv/avgpool_3x3_fp16_packn.c rename source/thead_rvv/{avgpool_3x3.c => avgpool_3x3_fp32.c} (97%) create mode 100644 source/thead_rvv/avgpool_3x3_fp32_packn.c create mode 100644 source/thead_rvv/convolution_1x1_fp16_pack1ton.c create mode 100644 
source/thead_rvv/convolution_1x1_fp16_packn.c create mode 100644 source/thead_rvv/convolution_1x1_fp16_packnto1.c rename source/thead_rvv/{convolution_1x1.c => convolution_1x1_fp32.c} (67%) create mode 100644 source/thead_rvv/convolution_1x1_fp32_pack1ton.c create mode 100644 source/thead_rvv/convolution_1x1_fp32_packn.c create mode 100644 source/thead_rvv/convolution_1x1_fp32_packnto1.c create mode 100644 source/thead_rvv/convolution_1x1_int4_packn.c create mode 100644 source/thead_rvv/convolution_1x1_int8_pack1ton.c create mode 100644 source/thead_rvv/convolution_1x1_int8_packn.c create mode 100644 source/thead_rvv/convolution_1x1_int8_packnto1.c delete mode 100644 source/thead_rvv/convolution_3x3.c create mode 100644 source/thead_rvv/convolution_3x3_fp32.c create mode 100644 source/thead_rvv/convolution_3x3_int8.c create mode 100644 source/thead_rvv/convolution_gemm_fp16_pack1ton.c create mode 100644 source/thead_rvv/convolution_gemm_fp16_packn.c create mode 100644 source/thead_rvv/convolution_gemm_fp16_packnto1.c rename source/thead_rvv/{convolution_gemm.c => convolution_gemm_fp32.c} (73%) create mode 100644 source/thead_rvv/convolution_gemm_fp32_pack1ton.c create mode 100644 source/thead_rvv/convolution_gemm_fp32_packn.c create mode 100644 source/thead_rvv/convolution_gemm_fp32_packnto1.c create mode 100644 source/thead_rvv/convolution_gemm_int4_packn.c create mode 100644 source/thead_rvv/convolution_gemm_int8_pack1ton.c create mode 100644 source/thead_rvv/convolution_gemm_int8_packn.c create mode 100644 source/thead_rvv/convolution_gemm_int8_packnto1.c create mode 100644 source/thead_rvv/data_convert.c create mode 100644 source/thead_rvv/depthwise_convolution.c create mode 100644 source/thead_rvv/depthwise_convolution_3x3_fp16_packn.c rename source/thead_rvv/{depthwise_convolution_3x3.c => depthwise_convolution_3x3_fp32.c} (95%) create mode 100644 source/thead_rvv/depthwise_convolution_3x3_fp32_packn.c create mode 100644 
source/thead_rvv/depthwise_convolution_3x3_int8_dot_packn.c create mode 100644 source/thead_rvv/depthwise_convolution_3x3_int8_packn.c create mode 100644 source/thead_rvv/fullyconnected_fp32.c create mode 100644 source/thead_rvv/fullyconnected_int4.c create mode 100644 source/thead_rvv/gemm_fp16_packn.c rename source/thead_rvv/{sgemm.c => gemm_fp32.c} (86%) create mode 100644 source/thead_rvv/gemm_fp32_packn.c create mode 100644 source/thead_rvv/gemm_int4_packn.c create mode 100644 source/thead_rvv/gemm_int8_packn.c create mode 100644 source/thead_rvv/global_avgpool_packn.c create mode 100644 source/thead_rvv/global_maxpool_packn.c create mode 100644 source/thead_rvv/maxpool_2x2_fp16_packn.c rename source/thead_rvv/{maxpool_2x2.c => maxpool_2x2_fp32.c} (95%) create mode 100644 source/thead_rvv/maxpool_2x2_fp32_packn.c create mode 100644 source/thead_rvv/maxpool_2x2_int8_packn.c create mode 100644 source/thead_rvv/maxpool_3x3_fp16_packn.c rename source/thead_rvv/{maxpool_3x3.c => maxpool_3x3_fp32.c} (97%) create mode 100644 source/thead_rvv/maxpool_3x3_fp32_packn.c create mode 100644 source/thead_rvv/maxpool_3x3_int8_packn.c create mode 100644 source/thead_rvv/pad.c create mode 100644 source/thead_rvv/relu6.c create mode 100644 source/thead_rvv/reorder.c create mode 100644 tests/python_ref/averagepool_vlen.py create mode 100644 tests/python_ref/convolution_vlen.py create mode 100644 tests/python_ref/depthwise_convolution_vlen.py create mode 100644 tests/python_ref/global_avgpool_vlen.py create mode 100644 tests/python_ref/global_maxpool_vlen.py create mode 100644 tests/python_ref/l2_norm_anole.py create mode 100644 tests/python_ref/maxpool_vlen.py create mode 100644 tests/validation_graph/Makefile.anole create mode 100644 tests/validation_graph/Makefile.pnna create mode 100644 tests/validation_layer/Makefile.c908 diff --git a/CMakeLists.txt b/CMakeLists.txt index 86ced272..8786692b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project(CSI-NN2) 
enable_language(ASM) -option(USE_CSI_NN2_DEBUG "option for debug" ON) +option(USE_SHL_DEBUG "option for debug" ON) option(BUILD_X86 "build x86" OFF) option(BUILD_RISCV "build riscv" OFF) @@ -11,20 +11,18 @@ option(BUILD_RISCV_ELF "build riscv elf" OFF) option(BUILD_CSKY "build csky" OFF) option(BUILD_CSKY_ELF "build csky elf" OFF) +if (NOT USE_COMPILER_PATH) + # riscv linux compiler if (BUILD_RISCV) - if(IS_DIRECTORY $ENV{RISCV_GNU_GCC_PATH}) - set(RISCV_GNU_GCC $ENV{RISCV_GNU_GCC_PATH}) - else() - set(RISCV_GNU_GCC "${PROJECT_SOURCE_DIR}/tools/gcc-toolchain/bin") - endif() - - set(CMAKE_C_COMPILER ${RISCV_GNU_GCC}/riscv64-unknown-linux-gnu-gcc) - set(CMAKE_CXX_COMPILER ${RISCV_GNU_GCC}/riscv64-unknown-linux-gnu-g++) + set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc) + set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++) + set(CMAKE_ASM_COMPILER riscv64-unknown-linux-gnu-gcc) endif() # riscv elf compiler if (BUILD_RISCV_ELF) + set(CMAKE_ASM_COMPILER riscv64-unknown-elf-gcc) set(CMAKE_C_COMPILER riscv64-unknown-elf-gcc) endif() @@ -40,9 +38,11 @@ if (BUILD_CSKY_ELF) set(CMAKE_ASM_COMPILER csky-abiv2-elf-gcc) endif() -# CSI-NN2 debug module -if(USE_CSI_NN2_DEBUG) - add_definitions(-D CSI_DEBUG) +endif() + +# SHL debug module +if(USE_SHL_DEBUG) + add_definitions(-D SHL_DEBUG) endif() # reduce elf size @@ -55,42 +55,65 @@ file(GLOB_RECURSE NN2_SRCS source/nn2/*.c source/utils/*.c) file(GLOB_RECURSE REF_SRCS source/reference/*.c) file(GLOB_RECURSE GREF_SRCS source/graph_ref/*.c) file(GLOB_RECURSE OPENVX_SRCS source/openvx/*.c) -file(GLOB_RECURSE C906_SRCS source/c906_opt/*.c) -file(GLOB_RECURSE C908_SRCS source/c908/*.c) +file(GLOB_RECURSE PNNA_SRCS source/pnna/*.c source/pnna/*.cpp) +file(GLOB_RECURSE C906_SRCS source/c906_opt/*.c source/c906_opt/*.S) +file(GLOB_RECURSE C908_SRCS source/c908_opt/*.c source/c908_opt/gemm_kernel/*.S) file(GLOB_RECURSE THEAD_RVV_SRCS source/thead_rvv/*.c) file(GLOB_RECURSE C860_SRCS source/c860_opt/*.S) file(GLOB_RECURSE I805_REF_SRCS 
source/i805_ref/*.c) file(GLOB_RECURSE I805_SRCS source/i805_opt/*.c source/i805_opt/*.S) file(GLOB_RECURSE E804_SRCS source/e804_opt/*.c source/e804_opt/*.S) +file(GLOB_RECURSE ASP_SRCS source/asp/*.c) include_directories(include) -option(CSINN_LAYER_BENCHMARK "Layer information and performance" OFF) -if(CSINN_LAYER_BENCHMARK) - add_definitions(-DCSINN_LAYER_BENCHMARK) +option(SHL_LAYER_BENCHMARK "Layer information and performance" OFF) +if(SHL_LAYER_BENCHMARK) + add_definitions(-DSHL_LAYER_BENCHMARK) message(STATUS "Print the execution time of each layer - ON") endif() -set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}/install") - if(BUILD_X86) # build x86_ref so LIST(APPEND X86_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS}) - add_library(x86_share SHARED ${X86_LST}) - SET_TARGET_PROPERTIES(x86_share PROPERTIES OUTPUT_NAME "csi_nn2_ref_x86") - set(X86_BUILD_FLAGS -DCSI_AVX_OPT -DCSI_BUILD_REF -DCSI_BUILD_GREF -mavx -mfma -fopenmp) - target_compile_options(x86_share PRIVATE ${X86_BUILD_FLAGS}) - - install(TARGETS x86_share DESTINATION lib) - + add_library(x86_static STATIC ${X86_LST}) + SET_TARGET_PROPERTIES(x86_static PROPERTIES OUTPUT_NAME "shl_ref_x86") + set(X86_BUILD_FLAGS -DSHL_AVX_OPT -DSHL_BUILD_REF -DSHL_BUILD_GREF -fPIC -mavx -mfma -fopenmp) + target_compile_options(x86_static PRIVATE ${X86_BUILD_FLAGS}) + + install(TARGETS x86_static DESTINATION lib) + + # build pnna x86 simulate so + LIST(APPEND PNNA_LST ${NN2_SRCS} ${REF_SRCS} ${PNNA_SRCS}) + add_library(pnna_share SHARED ${PNNA_LST}) + SET_TARGET_PROPERTIES(pnna_share PROPERTIES OUTPUT_NAME "shl_pnna_x86") + set(PNNA_BUILD_FLAGS -DSHL_BUILD_PNNA) + target_compile_options(pnna_share PRIVATE ${PNNA_BUILD_FLAGS}) + target_include_directories(pnna_share PRIVATE module/nna_ddk_install/include/) + set(PNNA_LINK_DIR ${CMAKE_CURRENT_SOURCE_DIR}/module/nna_ddk_install/x86/) + target_link_libraries(pnna_share PRIVATE -L${PNNA_LINK_DIR} -limgdnn_csim -lnnasession_csim) + + install(TARGETS pnna_share DESTINATION lib) + 
+ # build heterogeneous pnna x86 simulate so + LIST(APPEND HLIGHT_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${PNNA_SRCS}) + add_library(hlight_share SHARED ${HLIGHT_LST}) + SET_TARGET_PROPERTIES(hlight_share PROPERTIES OUTPUT_NAME "shl_hlight_x86") + set(HLIGHT_BUILD_FLAGS -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_PNNA) + target_compile_options(hlight_share PRIVATE ${HLIGHT_BUILD_FLAGS}) + target_include_directories(hlight_share PRIVATE module/nna_ddk_install/include/) + set(PNNA_LINK_DIR ${CMAKE_CURRENT_SOURCE_DIR}/module/nna_ddk_install/x86/) + target_link_libraries(hlight_share PRIVATE -L${PNNA_LINK_DIR} -limgdnn_csim -lnnasession_csim) + + install(TARGETS hlight_share DESTINATION lib) endif() if(BUILD_RISCV) # build rvv a LIST(APPEND RVV_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS}) add_library(rvv_static STATIC ${RVV_LST}) - SET_TARGET_PROPERTIES(rvv_static PROPERTIES OUTPUT_NAME "csi_nn2_rvv") - set(RVV_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DCSI_BUILD_RVV -DCSI_BUILD_REF -DCSI_BUILD_GREF) + SET_TARGET_PROPERTIES(rvv_static PROPERTIES OUTPUT_NAME "shl_rvv") + set(RVV_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_RVV -DSHL_BUILD_REF -DSHL_BUILD_GREF) target_compile_options(rvv_static PRIVATE ${RVV_BUILD_FLAGS}) install(TARGETS rvv_static DESTINATION lib) @@ -98,26 +121,58 @@ if(BUILD_RISCV) # build c906 a LIST(APPEND C906_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C906_SRCS}) add_library(c906_static STATIC ${C906_LST}) - SET_TARGET_PROPERTIES(c906_static PROPERTIES OUTPUT_NAME "csi_nn2_c906") - set(C906_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DCSI_BUILD_C906 -DCSI_BUILD_REF -DCSI_BUILD_GREF) + SET_TARGET_PROPERTIES(c906_static PROPERTIES OUTPUT_NAME "shl_c906") + set(C906_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_C906 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RVV) target_compile_options(c906_static PRIVATE ${C906_BUILD_FLAGS}) install(TARGETS 
c906_static DESTINATION lib) add_library(c906_share SHARED ${C906_LST}) - SET_TARGET_PROPERTIES(c906_share PROPERTIES OUTPUT_NAME "csi_nn2_c906") + SET_TARGET_PROPERTIES(c906_share PROPERTIES OUTPUT_NAME "shl_c906") target_compile_options(c906_share PRIVATE ${C906_BUILD_FLAGS}) install(TARGETS c906_share DESTINATION lib) + # build c908 a + LIST(APPEND C908_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C908_SRCS}) + add_library(c908_static STATIC ${C908_LST}) + SET_TARGET_PROPERTIES(c908_static PROPERTIES OUTPUT_NAME "shl_c908") + set(C908_BUILD_FLAGS -march=rv64gcv_zfh_xtheadc_xtheadv -mabi=lp64d -DSHL_BUILD_C908 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RVV) + target_compile_options(c908_static PRIVATE ${C908_BUILD_FLAGS}) + + install(TARGETS c908_static DESTINATION lib) + + # build pnna so + LIST(APPEND PNNA_LST ${NN2_SRCS} ${REF_SRCS} ${PNNA_SRCS}) + add_library(pnna_share SHARED ${PNNA_LST}) + SET_TARGET_PROPERTIES(pnna_share PROPERTIES OUTPUT_NAME "shl_pnna") + set(PNNA_BUILD_FLAGS -DSHL_BUILD_PNNA) + target_compile_options(pnna_share PRIVATE ${PNNA_BUILD_FLAGS}) + target_include_directories(pnna_share PRIVATE module/nna_ddk_install/include/) + set(PNNA_LINK_DIR ${CMAKE_CURRENT_SOURCE_DIR}/module/nna_ddk_install/light/) + target_link_libraries(pnna_share PRIVATE -L${PNNA_LINK_DIR} -limgdnn -lnnasession) + + install(TARGETS pnna_share DESTINATION lib) + + # build heterogeneous pnna so + LIST(APPEND HLIGHT_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${PNNA_SRCS}) + add_library(hlight_share SHARED ${HLIGHT_LST}) + SET_TARGET_PROPERTIES(hlight_share PROPERTIES OUTPUT_NAME "shl_hlight") + set(HLIGHT_BUILD_FLAGS -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_PNNA) + target_compile_options(hlight_share PRIVATE ${HLIGHT_BUILD_FLAGS}) + target_include_directories(hlight_share PRIVATE module/nna_ddk_install/include/) + set(PNNA_LINK_DIR ${CMAKE_CURRENT_SOURCE_DIR}/module/nna_ddk_install/light/) + target_link_libraries(hlight_share PRIVATE -L${PNNA_LINK_DIR} 
-limgdnn -lnnasession) + + install(TARGETS hlight_share DESTINATION lib) endif() if(BUILD_RISCV_ELF) # build c906 elf a LIST(APPEND C906_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C906_SRCS}) add_library(c906_elf_static STATIC ${C906_LST}) - SET_TARGET_PROPERTIES(c906_elf_static PROPERTIES OUTPUT_NAME "csi_nn2_c906_rtos") - set(C906_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DCSI_BUILD_C906 -DCSI_BUILD_REF -DCSI_BUILD_GREF -DCSI_BUILD_RTOS) + SET_TARGET_PROPERTIES(c906_elf_static PROPERTIES OUTPUT_NAME "shl_c906_rtos") + set(C906_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mcmodel=medany -DSHL_BUILD_C906 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RTOS) target_compile_options(c906_elf_static PRIVATE ${C906_BUILD_FLAGS}) install(TARGETS c906_elf_static DESTINATION lib) @@ -125,8 +180,8 @@ if(BUILD_RISCV_ELF) # build ASP elf a LIST(APPEND ASP_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${ASP_SRCS}) add_library(asp_elf_static STATIC ${ASP_LST}) - SET_TARGET_PROPERTIES(asp_elf_static PROPERTIES OUTPUT_NAME "csi_nn2_asp") - set(ASP_BUILD_FLAGS -march=rv32imafdcp -mabi=ilp32d -DCSI_BUILD_ASP -DCSI_BUILD_REF -DCSI_BUILD_GREF -DCSI_BUILD_RTOS) + SET_TARGET_PROPERTIES(asp_elf_static PROPERTIES OUTPUT_NAME "shl_asp") + set(ASP_BUILD_FLAGS -march=rv32imafdcp -mabi=ilp32d -DSHL_BUILD_ASP -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_USE_ATAT_MALLOC -DSHL_BUILD_RTOS) target_compile_options(asp_elf_static PRIVATE ${ASP_BUILD_FLAGS}) install(TARGETS asp_elf_static DESTINATION lib) @@ -136,8 +191,8 @@ if(BUILD_CSKY) # build openvx so LIST(APPEND OPENVX_LST ${NN2_SRCS} ${OPENVX_SRCS}) add_library(openvx_share SHARED ${OPENVX_LST}) - SET_TARGET_PROPERTIES(openvx_share PROPERTIES OUTPUT_NAME "csi_nn2_openvx") - set(OPENVX_BUILD_FLAGS -mcpu=c860v -fPIC -DCSI_BUILD_OPENVX -mhard-float) + SET_TARGET_PROPERTIES(openvx_share PROPERTIES OUTPUT_NAME "shl_openvx") + set(OPENVX_BUILD_FLAGS -mcpu=c860v -fPIC -DSHL_BUILD_OPENVX -mhard-float) 
target_compile_options(openvx_share PRIVATE ${OPENVX_BUILD_FLAGS}) set(OPENVX_LINK_DIR ${CMAKE_CURRENT_SOURCE_DIR}/module/acuity-driver/driver/build/sdk/drivers) target_link_libraries(openvx_share PRIVATE -mcpu=c860v -fPIC -mhard-float -L${OPENVX_LINK_DIR} -lArchModelSw -lNNArchPerf -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lNNGPUBinary -lovxlib -lOvx12VXCBinary) @@ -149,8 +204,8 @@ if(BUILD_CSKY) # build c860 a LIST(APPEND C860_LST ${NN2_SRCS} ${REF_SRCS} ${C860_SRCS}) add_library(c860_static STATIC ${C860_LST}) - SET_TARGET_PROPERTIES(c860_static PROPERTIES OUTPUT_NAME "csi_nn2_c860") - set(C860_BUILD_FLAGS -mcpu=c860v -DCSI_BUILD_REF) + SET_TARGET_PROPERTIES(c860_static PROPERTIES OUTPUT_NAME "shl_c860") + set(C860_BUILD_FLAGS -mcpu=c860v -DSHL_BUILD_REF) target_compile_options(c860_static PRIVATE ${C860_BUILD_FLAGS}) install(TARGETS c860_static DESTINATION lib) @@ -160,32 +215,41 @@ if(BUILD_CSKY_ELF) # build i805 ref a LIST(APPEND I805_REF_LST ${NN2_SRCS} ${REF_SRCS} ${I805_REF_SRCS}) add_library(i805_ref_static STATIC ${I805_REF_LST}) - SET_TARGET_PROPERTIES(i805_ref_static PROPERTIES OUTPUT_NAME "csi_nn2_ref_i805") - set(I805_REF_BUILD_FLAGS -DCSI_BUILD_REF_I805 -DCSI_MATH_DSP -DCSI_BUILD_RTOS -mcpu=i805) + SET_TARGET_PROPERTIES(i805_ref_static PROPERTIES OUTPUT_NAME "shl_ref_i805") + set(I805_REF_BUILD_FLAGS -DSHL_BUILD_REF_I805 -DSHL_BUILD_RTOS -mcpu=i805) target_compile_options(i805_ref_static PRIVATE ${I805_REF_BUILD_FLAGS}) - target_include_directories(i805_ref_static PRIVATE include/include_xt800) + target_include_directories(i805_ref_static PRIVATE source/i805_ref) install(TARGETS i805_ref_static DESTINATION lib) # build i805 a LIST(APPEND I805_LST ${NN2_SRCS} ${REF_SRCS} ${I805_SRCS}) add_library(i805_static STATIC ${I805_LST}) - SET_TARGET_PROPERTIES(i805_static PROPERTIES OUTPUT_NAME "csi_nn2_i805") - set(I805_BUILD_FLAGS -DCSI_BUILD_I805 -DCSI_MATH_DSP -DCSI_BUILD_RTOS -mcpu=ck805ef -mhard-float) + SET_TARGET_PROPERTIES(i805_static PROPERTIES 
OUTPUT_NAME "shl_i805") + set(I805_BUILD_FLAGS -DSHL_BUILD_I805 -DSHL_BUILD_RTOS -mcpu=ck805ef -mhard-float) target_compile_options(i805_static PRIVATE ${I805_BUILD_FLAGS}) - target_include_directories(i805_static PRIVATE include/include_xt800) + target_include_directories(i805_static PRIVATE source/i805_opt) install(TARGETS i805_static DESTINATION lib) # build e804 a LIST(APPEND E804_LST ${NN2_SRCS} ${REF_SRCS} ${E804_SRCS}) add_library(e804_static STATIC ${E804_LST}) - SET_TARGET_PROPERTIES(e804_static PROPERTIES OUTPUT_NAME "csi_nn2_e804") - set(E804_BUILD_FLAGS -DCSI_BUILD_E804 -mcpu=e804d -DCSI_BUILD_RTOS -mno-required-attr-fpu-abi) + SET_TARGET_PROPERTIES(e804_static PROPERTIES OUTPUT_NAME "shl_e804") + set(E804_BUILD_FLAGS -DSHL_BUILD_E804 -mcpu=e804d -DSHL_BUILD_RTOS -mno-required-attr-fpu-abi) target_compile_options(e804_static PRIVATE ${E804_BUILD_FLAGS}) - target_include_directories(e804_static PRIVATE include/include_xt800) + target_include_directories(e804_static PRIVATE source/e804_opt) install(TARGETS e804_static DESTINATION lib) endif() +# coverage options +OPTION(ENABLE_GCOV "Enable gcov" OFF) +if(ENABLE_GCOV) + SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage") + SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage") + SET(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -lgcov") +endif() + + install(DIRECTORY "include/." 
DESTINATION "include" FILES_MATCHING PATTERN "*.h") diff --git a/Makefile b/Makefile index 6c829c49..002d5280 100644 --- a/Makefile +++ b/Makefile @@ -15,19 +15,37 @@ nn2_c906_so: nn2_c906_elf: mkdir -p riscv_elf_build; cd riscv_elf_build; cmake ../ -DBUILD_RISCV_ELF=ON -DCMAKE_BUILD_TYPE=Release; make c906_elf_static -j8; cd - +nn2_asp_elf: + mkdir -p riscv_elf_build; cd riscv_elf_build; cmake ../ -DBUILD_RISCV_ELF=ON -DCMAKE_BUILD_TYPE=Release; make asp_elf_static -j8; cd - + nn2_c908: mkdir -p riscv_build; cd riscv_build; cmake ../ -DBUILD_RISCV=ON -DCMAKE_BUILD_TYPE=Release; make c908_static -j8; cd - nn2_ref_x86: - mkdir -p x86_build; cd x86_build; cmake ../ -DBUILD_X86=ON -DCMAKE_BUILD_TYPE=Release; make x86_share -j8; cd - + mkdir -p x86_build; cd x86_build; cmake ../ -DBUILD_X86=ON -DCMAKE_BUILD_TYPE=Release; make x86_static -j8; cd - + +nn2_openvx: + mkdir -p csky_build; cd csky_build; cmake ../ -DBUILD_CSKY=ON -DCMAKE_BUILD_TYPE=Release; make openvx_share -j8; cd - + +nn2_pnna: + mkdir -p riscv_build; cd riscv_build; cmake ../ -DBUILD_RISCV=ON -DCMAKE_BUILD_TYPE=Release; make pnna_share -j8; cd - + +nn2_pnna_x86: + mkdir -p x86_build; cd x86_build; cmake ../ -DBUILD_X86=ON -DCMAKE_BUILD_TYPE=Release; make pnna_share -j8; cd - + +nn2_hlight_x86: + mkdir -p x86_build; cd x86_build; cmake ../ -DBUILD_X86=ON -DCMAKE_BUILD_TYPE=Release; make hlight_share -j8; cd - + +nn2_hlight: + mkdir -p riscv_build; cd riscv_build; cmake ../ -DBUILD_RISCV=ON -DCMAKE_BUILD_TYPE=Release; make hlight_share -j8; cd - .PHONY: install_nn2 install_nn2: include mkdir -p install_nn2/lib cp include install_nn2 -r - -cp riscv_build/libcsi_nn2_* install_nn2/lib -rf - -cp csky_build/libcsi_nn2_* install_nn2/lib -rf - -cp x86_build/libcsi_nn2_* install_nn2/lib -rf + -cp riscv_build/libshl_* install_nn2/lib -rf + -cp csky_build/libshl_* install_nn2/lib -rf + -cp x86_build/libshl_* install_nn2/lib -rf cp version install_nn2/ -rf clint: diff --git a/README.md b/README.md index 
7da4a7cc..bd023d51 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,19 @@ ## 简介 -CSI-NN2 是 T-HEAD 提供的一组针对无剑 SoC 平台的神经网络库 API。抽象了各种常用的网络层的接口,并且提供一系列已优化的二进制库。 +SHL(曾用名CSI-NN2) 是 T-HEAD 提供的一组针对玄铁 CPU 平台的神经网络库 API。抽象了各种常用的网络层的接口,并且提供一系列已优化的二进制库。 -CSI-NN2 的特性: +SHL 的特性: - C 代码版本的参考实现。 - 提供玄铁系列 CPU 的汇编优化实现。 - 支持对称量化和非对称量化。 - 支持8位定点,16位定点和16位浮点等数据类型。 - 兼容 NCHW 和 NHWC 格式。 -- 搭配 [HHB](https://occ.t-head.cn/development/series/index?spm=a2cl5.14300690.0.0.4aca475a4yHCxV&id=3865005559921381376&type=kind) 实现代码自动调用。 -- 覆盖 CPU,NPU 架构。 -- 附加一些辅助接口,参考使用。 +- 搭配 [HHB](https://www.yuque.com/za4k4z/oxlbxl) 实现代码自动调用。 +- 覆盖 CPU,NPU 等不同体系结构。 +- 附加异构参考实现。 -CSI-NN2 提供了完成的接口声明和接口的参考实现,各个设备提供商可以依此针对性的完成各个接口的优化工作。 +SHL 提供了完成的接口声明和接口的参考实现,各个设备提供商可以依此针对性的完成各个接口的优化工作。 ## 文档说明 @@ -21,7 +21,7 @@ CSI-NN2 提供了完成的接口声明和接口的参考实现,各个设备提 ## 致谢 -CSI-NN2 参考、借鉴了下列项目: +SHL 参考、借鉴了下列项目: - [Caffe](https://github.com/BVLC/caffe) - [Tensorflow](https://github.com/tensorflow/tensorflow) - [ncnn](https://github.com/Tencent/ncnn) diff --git a/include/csi_c906.h b/include/csi_c906.h deleted file mode 100644 index ba17fb04..00000000 --- a/include/csi_c906.h +++ /dev/null @@ -1,520 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_C906_H_ -#define INCLUDE_CSI_C906_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" - -/************************** f32 func declaration ***************************/ -int csi_c906_abs_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_c906_add_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_sub_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_mul_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_minimum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_broadcast_to_f32(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); - -int csi_c906_clip_f32(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_c906_concat_f32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_c906_split_f32(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_c906_fullyconnected_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_pad_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params); - -int csi_c906_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, - struct csi_tensor *output, 
struct prelu_params *params); - -int csi_c906_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_relu1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_relu6_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_leaky_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params); - -int csi_c906_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv2d_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_depthwise_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_depthwise_conv2d_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_global_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_global_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_div_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -/* pack */ -void 
csi_c906_reorder_kernel(float *a, float *sa, int m, int k, int ldx); - -void csi_c906_reorder_input(float *b, float *sb, int k, int n, int ldx); - -void csi_c906_reorder_input_1(float *b, float *sb, int k, int n, int ldx); - -/* gemm */ -void csi_c906_sgemm_kernel_f32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias, bool fuse_relu); - -/* kernel transform */ -void csi_c906_conv1x1s1_sgemm_transform_kernel(struct csi_tensor *kernel, - struct conv2d_params *params); - -void csi_c906_conv_im2col_sgemm_transform_kernel(struct csi_tensor *kernel, - struct conv2d_params *params); - -void csi_c906_conv3x3s1_winograd23_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd43_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd64_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -/* convolution optimization */ -int csi_c906_conv1x1s1_sgemm(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv1x1s1_sgemm_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv_im2col_sgemm(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv_im2col_sgemm_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct 
csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_c906_conv3x3s1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_c906_conv3x3s2(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -/* depthwise convolution optimization */ -int csi_c906_dwconv3x3s1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv5x5s1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor 
*bias, - struct conv2d_params *params); - -int csi_c906_dwconv5x5s2(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s1_pack4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_pack4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -/* depthwise convolution fuse relu */ -int csi_c906_dwconv3x3s1_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv5x5s1_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv5x5s2_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s1_pack4_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_pack4_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv2d_s1_pad0_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -/************************** fp16 func declaration ***************************/ -int csi_c906_add_fp16(struct csi_tensor *input0, struct 
csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_sub_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_mul_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_minimum_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_global_avgpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_global_maxpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_pad_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params); - -int csi_c906_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_relu1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_relu6_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_prelu_fp16(struct csi_tensor *input, struct csi_tensor *alpha, - struct csi_tensor *output, struct prelu_params *params); - -int csi_c906_leaky_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_abs_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_c906_clip_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_c906_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_c906_split_fp16(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_c906_fullyconnected_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, 
struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_pack16_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_pack16_output16_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, struct fc_params *params); - -void csi_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx); - -void csi_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx); - -/* pack fp16 */ -void csi_c906_reorder_kernel_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); -void csi_c906_reorder_input_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); - -void csi_c906_reorder_input_fp16_1(__fp16 *b, __fp16 *sb, int k, int n, int ldx); - -void csi_c906_reorder_matrix_z8_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx); -void csi_c906_reorder_matrix_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx); - -/* gemm fp16 */ -void csi_c906_sgemm_kernel_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, - int n, int ldc, __fp16 *bias); -void csi_c906_sgemm_kernel_fp16_1(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, - int n, int ldc, __fp16 *bias); - -/* gemv fp16 */ -void csi_c906_gemv_pack8_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, - int ldc, __fp16 *bias); -void csi_c906_gemv_pack16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, - int ldc, __fp16 *bias); - -void csi_c906_gemv_trans_pack8_fp16(__fp16 
*dst, const __fp16 *sa, const __fp16 *sb, int k, int n, - int ldc, __fp16 *bias); -void csi_c906_gemv_trans_pack16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, - int ldc, __fp16 *bias); - -/* kernel transform fp16 */ -void csi_c906_conv1x1s1_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params); -void csi_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params); - -void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -/* convolution optimization fp16 */ -int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_c906_conv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_c906_conv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct 
conv2d_params *params); - -/* depthwise convolution optimization for fp16*/ -int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s1_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -/* utils */ -void csi_c906_memcpy(void *dst, const void *src, size_t n); - -void csi_c906_pad_input(const float *input, float *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left); - -void csi_c906_crop_output(float *output_trans, float *output, int out_c, int out_h, int out_w, - int wino_h, int wino_w); - -void csi_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left); - -void csi_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, int out_h, - int out_w, int wino_h, int wino_w); - -/*asr related fuctions*/ -int csi_c906_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_c906_cache_matmul_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_c906_matmul_fp16(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params); - -int csi_c906_layer_norm_fp16(struct csi_tensor 
*input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params); - -int csi_c906_reshape_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_c906_transpose_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -int csi_c906_gather_fp16(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params); - -int csi_c906_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_c906_cache_conv1d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_c906_lrn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params); - -void asr_buffer_init_c906(struct asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth); - -void *asr_buffer_insert_c906_front(struct asr_buffer_t *buffer, void *input, size_t len); - -void *asr_buffer_insert_c906_back(struct asr_buffer_t *buffer, void *input, size_t len); - -void *asr_buffer_get_buffer_c906(struct asr_buffer_t *buffer); - -void asr_buffer_reset_c906(struct asr_buffer_t *buffer); - -void csi_c906_reset_fcsr(); -int csi_c906_get_fcsr(); - -/* hardware performance */ -struct csi_c906_hpm { - size_t inst; - size_t cycle; - size_t l1_icache_access; - size_t l1_icache_miss; - size_t store_inst; - size_t l1_dcache_raccess; - size_t l1_dcache_rmiss; - size_t l1_dcache_waccess; - size_t l1_dcache_wmiss; -}; - -uint64_t csi_c906_get_inst(); -uint64_t csi_c906_get_cycle(); -uint64_t csi_c906_get_l1_icache_access(); -uint64_t csi_c906_get_l1_icache_miss(); -uint64_t csi_c906_get_cb_miss(); -uint64_t csi_c906_get_cb_inst(); -uint64_t csi_c906_get_store_inst(); -uint64_t 
csi_c906_get_l1_dcache_raccess(); -uint64_t csi_c906_get_l1_dcache_rmiss(); -uint64_t csi_c906_get_l1_dcache_waccess(); -uint64_t csi_c906_get_l1_dcache_wmiss(); - -struct csi_c906_hpm csi_c906_get_hw_perf(); - -int csi_c906_sum_stride_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_nn_c906_register_op_init(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *bc); -int csi_nn_c906_register_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *bc); - -void csi_nn_c906_bc_init_reg(); -void csi_nn_c906_bc_reg(); - -#endif // INCLUDE_CSI_C906_H_ diff --git a/include/csi_debug.h b/include/csi_debug.h deleted file mode 100644 index 8fc25c60..00000000 --- a/include/csi_debug.h +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ -#ifndef INCLUDE_CSI_DEBUG_H_ -#define INCLUDE_CSI_DEBUG_H_ -#include "csi_internal.h" -#include "csi_node.h" - -enum csinn_debug_enum { - CSI_DEBUG_LEVEL_DEBUG = -2, - CSI_DEBUG_LEVEL_INFO, - CSI_DEBUG_LEVEL_WARNING, - CSI_DEBUG_LEVEL_ERROR, - CSI_DEBUG_LEVEL_FATAL, -}; - -#ifdef CSI_DEBUG -#define CSI_DEBUG_CALL(func) func -void csi_debug_debug(const char *format, ...); -void csi_debug_info(const char *format, ...); -void csi_debug_warning(const char *format, ...); -void csi_debug_error(const char *format, ...); -void csi_debug_fatal(const char *format, ...); -int csi_debug_callback_unset(); -#else -#define CSI_DEBUG_CALL(func) -inline void csi_debug_debug(const char *format, ...) {} -inline void csi_debug_info(const char *format, ...) {} -inline void csi_debug_warning(const char *format, ...) {} -inline void csi_debug_error(const char *format, ...) {} -inline void csi_debug_fatal(const char *format, ...) {} -inline int csi_debug_callback_unset() { return CSINN_CALLBACK_UNSET; } -#endif - -int csi_debug_get_level(); -void csi_debug_set_level(int level); -int csi_benchmark_layer(struct csi_node *node, uint64_t start_time, uint64_t end_time, - int layer_idx); - -int csi_conv2d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params, const char *name); - -int csi_conv1d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params, const char *name); - -int csi_conv3d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params, const char *name); - -int csi_fsmn_debug_info(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct 
fsmn_params *params, const char *name); - -int csi_siso_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params, const char *name); - -int csi_diso_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params, const char *name); - -int csi_relu_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params, const char *name); - -int csi_arange_debug_info(struct csi_tensor *output, struct arange_params *params, - const char *name); - -int csi_pool_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, const char *name); - -int csi_pad_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params, const char *name); - -int csi_crop_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct crop_params *params, const char *name); - -int csi_roi_pool_debug_info(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_pool_params *params, - const char *name); - -int csi_bn_debug_info(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, struct bn_params *params, - const char *name); - -int csi_batch_to_space_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params, const char *name); - -int csi_batch_to_space_nd_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_nd_params *params, const char *name); - -int csi_cache_matmul_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params, const char *name); - -int csi_cache_conv1d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct 
cache_conv1d_params *params, const char *name); - -int csi_space_to_depth_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, const char *name); - -int csi_depth_to_space_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params, const char *name); - -int csi_space_to_batch_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params, const char *name); - -int csi_space_to_batch_nd_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_nd_params *params, const char *name); - -int csi_broadcast_to_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params, const char *name); - -int csi_reduce_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params, const char *name); - -int csi_clip_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params, const char *name); - -int csi_col2im_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct col2im_params *params, const char *name); - -int csi_concat_debug_info(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params, const char *name); - -int csi_cumprod_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params, const char *name); - -int csi_cumsum_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params, const char *name); - -int csi_expand_dims_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params, const char *name); - -int csi_flatten_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params, const char *name); - -int csi_fullyconnected_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct 
csi_tensor *bias, - struct fc_params *params, const char *name); - -int csi_gather_nd_debug_info(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params, - const char *name); - -int csi_gather_debug_info(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params, - const char *name); - -int csi_hard_sigmoid_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params, const char *name); - -int csi_im2col_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params, const char *name); - -int csi_l2n_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params, const char *name); - -int csi_layer_norm_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params, const char *name); - -int csi_softmax_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params, const char *name); - -int csi_lrn_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params, const char *name); - -int csi_matmul_debug_info(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params, - const char *name); - -int csi_ndarray_size_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params, const char *name); - -int csi_nms_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct non_max_suppression_params *params, - const char *name); - -int csi_one_hot_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct one_hot_params *params, const char *name); - -int csi_prelu_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct prelu_params 
*params, const char *name); - -int csi_proposal_debug_info(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params, const char *name); - -int csi_psroipooling_debug_info(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params, - const char *name); - -int csi_reorg_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct reorg_params *params, const char *name); - -int csi_reshape_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params, const char *name); - -int csi_resize_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params, const char *name); - -int csi_reverse_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params, const char *name); - -int csi_roi_align_debug_info(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_align_params *params, - const char *name); - -int csi_scatter_nd_debug_info(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params, const char *name); - -int csi_segment_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params, - const char *name); - -int csi_select_debug_info(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params, const char *name); - -int csi_sequence_mask_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct sequence_mask_params *params, - const char *name); - -int csi_shape_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params, const char *name); - -int 
csi_shuffle_channel_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params, const char *name); - -int csi_sigmoid_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params, const char *name); - -int csi_slice_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params, const char *name); - -int csi_split_debug_info(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params, const char *name); - -int csi_squeeze_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params, const char *name); - -int csi_stack_debug_info(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params, const char *name); - -int csi_strided_slice_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params, const char *name); - -int csi_tile_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params, const char *name); - -int csi_topk_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct topk_params *params, const char *name); - -int csi_transpose_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params, const char *name); - -int csi_unpooling_debug_info(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params, - const char *name); - -int csi_unstack_debug_info(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params, const char *name); - -int csi_where_debug_info(struct csi_tensor *condition, struct csi_tensor *x, struct csi_tensor *y, - struct csi_tensor *output, struct where_params *params, const char *name); - -#endif // INCLUDE_CSI_DEBUG_H_ diff --git a/include/csi_e804.h b/include/csi_e804.h deleted file mode 100644 index a4a31413..00000000 
--- a/include/csi_e804.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_E804_H_ -#define INCLUDE_CSI_E804_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_utils.h" -#include "csky_dsp2_nnfunctions.h" - -int csi_e804_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_e804_conv2d_init_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_e804_depthwise_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_e804_avgpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_e804_maxpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_e804_fullyconnected_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_e804_fullyconnected_q15(struct csi_tensor *input, struct csi_tensor 
*output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_e804_softmax_q7(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_e804_softmax_q15(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_e804_relu_q7(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_e804_relu_q15(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_e804_sigmoid_q7(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_e804_sigmoid_q15(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_e804_tanh_q7(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_e804_tanh_q15(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -#endif // INCLUDE_CSI_E804_H_ diff --git a/include/csi_gref.h b/include/csi_gref.h deleted file mode 100644 index 18c68ac6..00000000 --- a/include/csi_gref.h +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_GREF_H_ -#define INCLUDE_CSI_GREF_H_ -#include "csi_nn.h" -#include "csi_node.h" -#include "csi_utils.h" - -int csi_gref_acos(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_acosh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_cos(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_cosh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_asin(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_asinh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_tan(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_atan(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_atanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_threshold_relu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_gref_trunc(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_topk(struct csi_tensor *input, struct csi_tensor *output1, struct csi_tensor *output2, - struct topk_params *params); - -int csi_gref_cumprod(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params); - -int csi_gref_cumsum(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params); - -int csi_gref_conv1d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); - -int csi_gref_conv2d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); - -int 
csi_gref_depthwise_conv2d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_group_conv2d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_conv2d_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_conv2d_relu6(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_conv3d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv3d_params *params); - -int csi_gref_deconv2d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_deconv3d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_gref_depthwise_deconv2d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_depthwise_conv2d_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_depthwise_conv2d_relu6(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_fsmn(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params); - -int csi_gref_fullyconnected(struct csi_tensor *input, struct 
csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_gref_fullyconnected_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_gref_maxpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_maxpool3d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_avgpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_avgpool3d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_global_avgpool3d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_global_avgpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_global_maxpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_l2pool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_pool_with_argmax(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_maxpool2d_locat(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_mod(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_non_max_suppression(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params); - -int csi_gref_unpooling(struct csi_tensor *input, struct csi_tensor *mask, struct csi_tensor *output, - struct unpooling_params *params); - -int csi_gref_negative(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_floor(struct csi_tensor 
*input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_ceil(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_clip(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_abs(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_exp(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_sin(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_sinh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_tanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_sqrt(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_rsqrt(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_square(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_gref_softsign(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_space_to_batch_nd(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_nd_params *params); - -int csi_gref_elu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_relu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_relu1(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_relu6(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_relun(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_roi_align(struct 
csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_align_params *params); - -int csi_gref_roipool(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params); - -int csi_gref_round(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_leaky_relu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_gref_softrelu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_gref_prelu(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params); - -int csi_gref_softplus(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_softmax(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_gref_batch_normalization(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); - -int csi_gref_l2_normalization(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); - -int csi_gref_lrn(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params); - -int csi_gref_matmul(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params); - -int csi_gref_add(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_sub(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_mul(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_div(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - 
struct diso_params *params); - -int csi_gref_floor_divide(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_floor_mod(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_maximum(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_minimum(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_power(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_greater(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_less(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_log_softmax(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_gref_log(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_log1p(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_equal(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_not_equal(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_not(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_reduce_logsumexp(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_reduce_max(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_reduce_mean(struct csi_tensor *input, struct csi_tensor *output, - struct 
reduce_params *params); - -int csi_gref_reduce_min(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_reduce_prod(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_reduce_sum(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_greater_equal(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_less_equal(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_select(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); - -int csi_gref_and(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_or(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_pad(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params); - -int csi_gref_resize(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params); - -int csi_gref_concat(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_gref_proposal(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); - -int csi_gref_psroipooling(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params); - -int csi_gref_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -int csi_gref_reshape(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_gref_shape(struct csi_tensor *input, struct 
csi_tensor *output, - struct shape_params *params); - -int csi_gref_strided_slice(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); - -int csi_gref_expand_dims(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); - -int csi_gref_expm1(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_reverse(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params); - -int csi_gref_flatten(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params); - -int csi_gref_crop(struct csi_tensor *input, struct csi_tensor *output, struct crop_params *params); - -int csi_gref_slice(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params); - -int csi_gref_split(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_gref_stack(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params); - -int csi_gref_tile(struct csi_tensor *inputs, struct csi_tensor *output, struct tile_params *params); - -int csi_gref_arange(struct csi_tensor *output, struct arange_params *params); - -int csi_gref_where(struct csi_tensor *condition, struct csi_tensor *x, struct csi_tensor *y, - struct csi_tensor *output, struct where_params *params); - -int csi_gref_unstack(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params); - -int csi_gref_gather(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *output, - struct gather_params *params); - -int csi_gref_gather_nd(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params); - -int csi_gref_hard_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_gref_isnan_bool(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params 
*params); - -int csi_gref_logical_and(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_logical_not(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_logical_or(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_logical_xor(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params); - -int csi_gref_segment_max(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_segment_mean(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_segment_min(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_segment_prod(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_segment_sum(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_scatter_nd(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params); - -int csi_gref_shuffle_channel(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); - -int csi_gref_sign(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_ndarray_size(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - -int csi_gref_space_to_batch(struct csi_tensor *input, struct csi_tensor *output, - struct 
space_to_batch_params *params); - -int csi_gref_batch_to_space(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); - -int csi_gref_batch_to_space_nd(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_nd_params *params); - -int csi_gref_space_to_depth(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); - -int csi_gref_depth_to_space(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); - -int csi_gref_broadcast_to(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); - -int csi_gref_one_hot(struct csi_tensor *input, struct csi_tensor *output, - struct one_hot_params *params); - -int csi_gref_sequence_mask(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct sequence_mask_params *params); - -int csi_gref_im2col(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params); - -int csi_gref_col2im(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct col2im_params *params); - -int csi_gref_sum(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); - -int csi_gref_mean(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_max(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); - -int csi_gref_min(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); - -int csi_gref_prod(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_argmin(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_argmax(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_all(struct csi_tensor *input, struct csi_tensor *output, struct 
reduce_params *params); - -int csi_gref_any(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); - -int csi_gref_reorg(struct csi_tensor *input, struct csi_tensor *output, - struct reorg_params *params); - -int csi_gref_erf(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_xor(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_yuv_rgb_scale(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_layer_norm(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params); - -int csi_gref_cache_matmul(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_gref_cache_conv1d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -struct csi_ref_graph { - struct csi_node **input; - struct csi_node **output; - int input_num; - int output_num; - struct csi_node **layer; - int layer_size; - int layer_index; -}; - -struct csi_gref_target_data { - struct csi_ref_graph *graph; -}; - -struct csi_ref_graph *csi_gref_get_graph(struct csi_session *sess); -int csi_gref_graph_insert(struct csi_node *node, struct csi_ref_graph *graph); -void csi_gref_post_dfs(struct csi_ref_graph *graph, - void (*fvisit)(struct csi_ref_graph *, struct csi_node *)); -int csi_gref_is_root_node(struct csi_ref_graph *graph, struct csi_node *node); -struct csi_node *csi_gref_get_input_subgraph(struct csi_ref_graph *graph, struct csi_node *node, - int index); -void csi_gref_reset_graph_visit(struct csi_ref_graph *graph); -void csi_gref_update_input_output(struct csi_ref_graph *graph, int index); -int csi_gref_siso_op(struct csi_tensor *input, 
struct csi_tensor *output, int op, void *params); -int csi_gref_diso_op(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, int op, void *params); -int csi_gref_sidcso_op(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *const0, struct csi_tensor *const1, int op, void *params); -void csi_gref_set_tensor(struct csi_tensor *tensor, struct csi_session *sess); -void csi_gref_set_const_tensor(struct csi_tensor *tensor, struct csi_session *sess); -int csi_gref_get_tensor(int index, struct csi_tensor *ret, struct csi_session *sess); -void csi_gref_nbg(struct csi_tensor **input, struct csi_tensor **output, uint32_t inputs_count, - uint32_t outputs_count, const char *url); - -void csi_subgraph_alloc(struct csi_node *node, struct csi_ref_graph *ograph, - struct csi_ref_graph *ggraph); -int csi_subgraph_init(struct csi_node *n); -int csi_subgraph_deinit(struct csi_node *n); -int csi_subgraph_run_init(struct csi_node *n); -int csi_subgraph_run(struct csi_node *n); -int csi_subgraph_run_deinit(struct csi_node *n); - -struct csi_ref_graph *csi_subgraph_generate(struct csi_ref_graph *ograph); -struct csi_ref_graph *csi_subgraph_rebuild(struct csi_ref_graph *subgraph); -struct csi_ref_graph *csi_subgraph_topology_sort(struct csi_ref_graph *graph); -void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node); -void csi_subgraph_fvisit_print(struct csi_ref_graph *graph, struct csi_node *node); -int csi_subgraph_get_device(struct csi_node *node); -#endif // INCLUDE_CSI_GREF_H_ diff --git a/include/csi_i805.h b/include/csi_i805.h deleted file mode 100644 index 1586545e..00000000 --- a/include/csi_i805.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_I805_H_ -#define INCLUDE_CSI_I805_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_utils.h" -#include "csi_i805_nnfunction.h" - -int csi_i805_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_conv2d_init_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_depthwise_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_avgpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_i805_maxpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_i805_fullyconnected_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_i805_fullyconnected_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_i805_softmax_q7(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_i805_softmax_q15(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - 
-int csi_i805_relu_q7(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_relu_q15(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_sigmoid_q7(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_i805_sigmoid_q15(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_i805_tanh_q7(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_i805_tanh_q15(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -/*********************** u8 asym quant opt func *********************************/ - -int csi_i805_add_init_u8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_i805_add_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_i805_clip_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_i805_clip_u8(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_i805_conv2d_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_conv2d_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_depthwise_conv2d_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_depthwise_conv2d_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_fullyconnected_init_u8(struct csi_tensor 
*input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_i805_fullyconnected_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_i805_maxpool2d_u8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_i805_mul_init_u8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_i805_mul_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_i805_relu_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_relu_u8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_relu6_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_relu6_u8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_reshape_u8(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -#endif // INCLUDE_CSI_I805_H_ diff --git a/include/csi_memory.h b/include/csi_memory.h deleted file mode 100644 index 26cae17f..00000000 --- a/include/csi_memory.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ -#ifndef INCLUDE_CSI_MEMORY_H_ -#define INCLUDE_CSI_MEMORY_H_ - -void csi_mem_print_map(); -void *csi_mem_alloc(int64_t size); -void *csi_mem_alloc_aligned(int64_t size, int aligned_bytes); -void *csi_mem_calloc(size_t nmemb, size_t size); -void *csi_mem_realloc(void *ptr, size_t size); -void csi_mem_free(void *ptr); - -#endif // INCLUDE_CSI_MEMORY_H_ diff --git a/include/csi_nn.h b/include/csi_nn.h index ca7de6df..d0d054e4 100644 --- a/include/csi_nn.h +++ b/include/csi_nn.h @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #ifndef INCLUDE_CSI_NN_H_ #define INCLUDE_CSI_NN_H_ @@ -26,890 +26,1047 @@ #include #include -#include "csi_debug.h" -#include "csi_internal.h" -#include "csi_memory.h" -#include "csi_utils.h" +#include "csinn_data_structure.h" +#include "csinn_runtime.h" +#include "shl_debug.h" +#include "shl_memory.h" #ifdef __cplusplus extern "C" { #endif -int csi_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct 
csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); +int csinn_depthwise_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d_relu(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d_relu6_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); +int csinn_group_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d_relu6(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_deconv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); +int csinn_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_deconv2d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_conv2d_relu(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv3d_init(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv3d_params *params); +int csinn_depthwise_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv3d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv3d_params *params); +int csinn_depthwise_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_deconv3d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); +int csinn_conv2d_relu6_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_deconv3d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv3d_params *params); +int csinn_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_fsmn_init(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params); +int csinn_deconv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_fsmn(struct csi_tensor *frame, struct csi_tensor *l_filter, struct csi_tensor 
*r_filter, - struct csi_tensor *frame_sequence, struct csi_tensor *frame_counter, - struct csi_tensor *output, struct fsmn_params *params); +int csinn_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_fullyconnected_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); +int csinn_conv3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); -int csi_fullyconnected(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); +int csinn_conv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); -int csi_fullyconnected_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); +int csinn_deconv3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); -int csi_fullyconnected_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); +int csinn_deconv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); -int csi_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_fsmn_init(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor 
*output, + struct csinn_fsmn_params *params); -int csi_maxpool2d(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_fsmn(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params); -int csi_maxpool3d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); -int csi_maxpool3d(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); -int csi_global_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_fullyconnected_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); -int csi_global_maxpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_fullyconnected_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); -int csi_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_avgpool2d(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params 
*params); -int csi_avgpool3d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_maxpool3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_avgpool3d(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_maxpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_global_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_global_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_global_avgpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_global_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_l2pool_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_l2pool(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_pool_with_argmax_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_avgpool3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_pool_with_argmax(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_maxpool2d_locat_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_global_avgpool2d_init(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_maxpool2d_locat(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_global_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_unpooling_init(struct csi_tensor *input, struct csi_tensor *mask, struct csi_tensor *output, - struct unpooling_params *params); +int csinn_l2pool_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_unpooling(struct csi_tensor *input, struct csi_tensor *mask, struct csi_tensor *output, - struct unpooling_params *params); +int csinn_l2pool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_roi_align_init(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_align_params *params); +int csinn_pool_with_argmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_roi_align(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_align_params *params); +int csinn_pool_with_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_negative_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_maxpool2d_locat_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_negative(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_maxpool2d_locat(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_floor_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_unpooling_init(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct 
csinn_unpooling_params *params); -int csi_floor(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_unpooling(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params); -int csi_ceil_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_roi_align_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params); -int csi_ceil(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_roi_align(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params); -int csi_sign_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_negative_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sign(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_negative(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_trunc_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_floor_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_trunc(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_floor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_round_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_ceil_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_round(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_ceil(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_siso_params *params); -int csi_abs_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sign_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_abs(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_isnan_bool_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_trunc_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_isnan_bool(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_trunc(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_exp_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_round_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_exp(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_round(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_expm1_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_abs_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_expm1(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_abs(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sin_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_isnan_bool_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int 
csi_sin(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_isnan_bool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cos_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_exp_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cos(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_exp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_tanh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_expm1_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_tanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_expm1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_log_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_log(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sqrt_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_cos_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sqrt(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_cos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_rsqrt_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int 
csinn_tanh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_rsqrt(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_square_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_log_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_square(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_log(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sigmoid_init(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); +int csinn_sqrt_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sigmoid(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params); +int csinn_sqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_hard_sigmoid_init(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); +int csinn_rsqrt_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_hard_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); +int csinn_rsqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_elu_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_square_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_elu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_square(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_siso_params *params); -int csi_relu_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_sigmoid_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); -int csi_relu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); -int csi_relu1_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_hard_sigmoid_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); -int csi_relu1(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_hard_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); -int csi_relu6_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_elu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_relu6(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_elu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_relun_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_relun(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_leaky_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); +int csinn_relu1_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params 
*params); -int csi_leaky_relu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softrelu_init(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); +int csinn_relu6_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softrelu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_prelu_init(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params); +int csinn_relun_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_prelu(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params); +int csinn_relun(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softplus_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_leaky_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softplus(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softmax_init(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); +int csinn_softrelu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softmax(struct csi_tensor *input, struct csi_tensor *output, struct softmax_params *params); +int csinn_softrelu(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_relu_params *params); -int csi_log_softmax_init(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); +int csinn_prelu_init(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); -int csi_log_softmax(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); +int csinn_prelu(struct csinn_tensor *input, struct csinn_tensor *alpha, struct csinn_tensor *output, + struct csinn_prelu_params *params); -int csi_batch_normalization_init(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); +int csinn_softplus_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_batch_normalization(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); +int csinn_softplus(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_l2_normalization_init(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); +int csinn_softmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); -int csi_l2_normalization(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); +int csinn_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); -int csi_lrn_init(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params); +int csinn_log_softmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); -int csi_lrn(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params); 
+int csinn_log_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); -int csi_matmul_init(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params); +int csinn_batch_normalization_init(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); -int csi_matmul(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params); +int csinn_batch_normalization(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); -int csi_add_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_l2_normalization_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); -int csi_add(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_l2_normalization(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); -int csi_sub_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_lrn_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); -int csi_sub(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); -int csi_mul_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_matmul_init(struct 
csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); -int csi_mul(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, + struct csinn_matmul_params *params); -int csi_div_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_add_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_div(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_add(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_floor_divide_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_sub_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_floor_divide(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_floor_mod_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_mul_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_floor_mod(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct 
csinn_diso_params *params); -int csi_mod_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_div_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_mod(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_div(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_maximum_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_floor_divide_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_maximum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_floor_divide(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_minimum_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_floor_mod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_minimum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_floor_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_power_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_mod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_power(struct csi_tensor 
*input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_greater_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_maximum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_greater(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_less_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_minimum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_less(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_and_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_power_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_and(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_power(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_or_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor 
*output, struct diso_params *params); +int csinn_greater_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_or(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_greater(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_not_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_less_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_not(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_less(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_xor_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_logical_and_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_xor(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_logical_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_equal_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_logical_or_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_equal(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_logical_or(struct csinn_tensor *input0, struct 
csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_not_equal_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_logical_not_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_not_equal(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_logical_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_greater_equal_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_logical_xor_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_greater_equal(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_logical_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_less_equal_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_less_equal(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_select_init(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); +int csinn_not_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor 
*output, struct csinn_diso_params *params); -int csi_select(struct csi_tensor *condition, struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct select_params *params); +int csinn_not_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_and_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_greater_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_and(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_greater_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_or_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_less_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_or(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_less_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_xor_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_select_init(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params); -int csi_xor(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_select(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor 
*input1, struct csinn_tensor *output, + struct csinn_select_params *params); -int csi_not_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_and_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_not(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_and(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_pad_init(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params); +int csinn_or_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_pad(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params); +int csinn_or(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_resize_init(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params); +int csinn_xor_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params); +int csinn_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_concat_init(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); +int csinn_not_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params); +int csinn_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_proposal_init(struct csi_tensor 
*cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); +int csinn_pad_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); -int csi_proposal(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); +int csinn_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); -int csi_psroipooling_init(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params); +int csinn_resize_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); -int csi_psroipooling(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct psroipooling_params *params); +int csinn_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); -int csi_transpose_init(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); +int csinn_concat_init(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); -int csi_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); +int csinn_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); -int csi_reshape_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); +int csinn_proposal_init(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); -int csi_reshape(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params); +int csinn_proposal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor 
*im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); -int csi_shape_init(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params); +int csinn_psroipooling_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params); -int csi_shape(struct csi_tensor *input, struct csi_tensor *output, struct shape_params *params); +int csinn_psroipooling(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params); -int csi_expand_dims_init(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); +int csinn_transpose_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); -int csi_expand_dims(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); +int csinn_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); -int csi_reverse_init(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params); +int csinn_reshape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); -int csi_reverse(struct csi_tensor *input, struct csi_tensor *output, struct reverse_params *params); +int csinn_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); -int csi_flatten_init(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params); +int csinn_shape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); -int csi_flatten(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params); +int csinn_shape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); -int csi_crop_init(struct csi_tensor *input, 
struct csi_tensor *output, struct crop_params *params); +int csinn_expand_dims_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); -int csi_crop(struct csi_tensor *input, struct csi_tensor *output, struct crop_params *params); +int csinn_expand_dims(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); -int csi_slice_init(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params); +int csinn_reverse_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); -int csi_slice(struct csi_tensor *input, struct csi_tensor *output, struct slice_params *params); +int csinn_reverse(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); -int csi_split_init(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); +int csinn_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); -int csi_split(struct csi_tensor *input, struct csi_tensor **output, struct split_params *params); +int csinn_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); -int csi_stack_init(struct csi_tensor **inputs, struct csi_tensor *output, - struct stack_params *params); +int csinn_crop_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params); -int csi_stack(struct csi_tensor **inputs, struct csi_tensor *output, struct stack_params *params); +int csinn_crop(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params); -int csi_unstack_init(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params); +int csinn_slice_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); -int csi_unstack(struct csi_tensor *input, struct 
csi_tensor **output, - struct unstack_params *params); +int csinn_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); -int csi_tile_init(struct csi_tensor *inputs, struct csi_tensor *output, struct tile_params *params); +int csinn_split_init(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); -int csi_tile(struct csi_tensor *inputs, struct csi_tensor *output, struct tile_params *params); +int csinn_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); -int csi_arange_init(struct csi_tensor *output, struct arange_params *params); +int csinn_stack_init(struct csinn_tensor **inputs, struct csinn_tensor *output, + struct csinn_stack_params *params); -int csi_arange(struct csi_tensor *output, struct arange_params *params); +int csinn_stack(struct csinn_tensor **inputs, struct csinn_tensor *output, + struct csinn_stack_params *params); -int csi_where_init(struct csi_tensor *condition, struct csi_tensor *x, struct csi_tensor *y, - struct csi_tensor *output, struct where_params *params); +int csinn_unstack_init(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params); -int csi_where(struct csi_tensor *condition, struct csi_tensor *x, struct csi_tensor *y, - struct csi_tensor *output, struct where_params *params); +int csinn_unstack(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params); -int csi_gather_init(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *output, - struct gather_params *params); +int csinn_tile_init(struct csinn_tensor *inputs, struct csinn_tensor *output, + struct csinn_tile_params *params); -int csi_gather(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *output, - struct gather_params *params); +int csinn_tile(struct csinn_tensor *inputs, struct csinn_tensor *output, + struct 
csinn_tile_params *params); -int csi_gather_nd_init(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params); +int csinn_arange_init(struct csinn_tensor *output, struct csinn_arange_params *params); -int csi_gather_nd(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *output, - struct gather_nd_params *params); +int csinn_arange(struct csinn_tensor *output, struct csinn_arange_params *params); -int csi_squeeze_init(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params); +int csinn_where_init(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params); -int csi_squeeze(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params); +int csinn_where(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params); -int csi_ndarray_size_init(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); +int csinn_gather_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); -int csi_ndarray_size(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); +int csinn_gather(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); -int csi_space_to_batch_init(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params); +int csinn_gather_nd_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); -int csi_space_to_batch(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params); +int csinn_gather_nd(struct csinn_tensor *input, struct 
csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); -int csi_space_to_batch_nd_init(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_nd_params *params); +int csinn_squeeze_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params); -int csi_space_to_batch_nd(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_nd_params *params); +int csinn_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params); -int csi_batch_to_space_init(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); +int csinn_ndarray_size_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); -int csi_batch_to_space(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); +int csinn_ndarray_size(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); -int csi_batch_to_space_nd_init(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_nd_params *params); +int csinn_space_to_batch_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); -int csi_batch_to_space_nd(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_nd_params *params); +int csinn_space_to_batch(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); -int csi_space_to_depth_init(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); +int csinn_space_to_batch_nd_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params); -int csi_space_to_depth(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); +int 
csinn_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params); -int csi_depth_to_space_init(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); +int csinn_batch_to_space_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); -int csi_depth_to_space(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); +int csinn_batch_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); -int csi_one_hot_init(struct csi_tensor *input, struct csi_tensor *output, - struct one_hot_params *params); +int csinn_batch_to_space_nd_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params); -int csi_one_hot(struct csi_tensor *input, struct csi_tensor *output, struct one_hot_params *params); +int csinn_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params); -int csi_sequence_mask_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct sequence_mask_params *params); +int csinn_space_to_depth_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); -int csi_sequence_mask(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct sequence_mask_params *params); +int csinn_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); -int csi_im2col_init(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params); +int csinn_depth_to_space_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); -int csi_im2col(struct csi_tensor *input, struct csi_tensor *output, 
struct im2col_params *params); +int csinn_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); -int csi_col2im_init(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct col2im_params *params); +int csinn_one_hot_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params); -int csi_col2im(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct col2im_params *params); +int csinn_one_hot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params); -int csi_sum_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_sequence_mask_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_sequence_mask_params *params); -int csi_sum(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_sequence_mask(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params); -int csi_mean_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_im2col_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); -int csi_mean(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_im2col(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); -int csi_max_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_col2im_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params); -int csi_max(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_col2im(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params); -int csi_min_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_sum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_min(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_prod_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_mean_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_prod(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_argmin_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_max_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_argmin(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_argmax_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_min_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_argmax(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_all_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_prod_init(struct csinn_tensor *input, 
struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_all(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_any_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_argmin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_any(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_argmin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reorg_init(struct csi_tensor *input, struct csi_tensor *output, - struct reorg_params *params); +int csinn_argmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reorg(struct csi_tensor *input, struct csi_tensor *output, struct reorg_params *params); +int csinn_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_yuv_rgb_scale_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_all_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_yuv_rgb_scale(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_all(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_segment_max_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_any_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_segment_max(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct segment_params *params); +int 
csinn_any(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_segment_min_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_reorg_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params); -int csi_segment_min(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct segment_params *params); +int csinn_reorg(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params); -int csi_segment_sum_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_yuv_rgb_scale_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_segment_sum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct segment_params *params); +int csinn_yuv_rgb_scale(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_segment_mean_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_segment_max_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_segment_mean(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_segment_max(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_segment_prod_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_segment_min_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct 
csinn_segment_params *params); -int csi_segment_prod(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_segment_min(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_threshold_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); +int csinn_segment_sum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_threshold_relu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); +int csinn_segment_sum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_acos_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); -int csi_acos(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_segment_mean_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_acosh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_segment_mean(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_acosh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_segment_prod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_asin_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_segment_prod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_asin(struct csi_tensor *input, 
struct csi_tensor *output, struct siso_params *params); +int csinn_threshold_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_asinh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_threshold_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_asinh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_acos_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +int csinn_acos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_atan_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_acosh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_atan(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_acosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_atanh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_asin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_atanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_asin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cosh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_asinh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cosh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_asinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); 
-int csi_sinh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_atan_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sinh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_atan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_tan_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_atanh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_tan(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_atanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_log1p_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_cosh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_log1p(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_cosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_softsign_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_sinh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_softsign(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_erf_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_tan_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_erf(struct csi_tensor *input, struct csi_tensor *output, struct siso_params 
*params); +int csinn_tan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cumsum_init(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params); +int csinn_log1p_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cumsum(struct csi_tensor *input, struct csi_tensor *output, struct cumsum_params *params); +int csinn_log1p(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cumprod_init(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params); +int csinn_softsign_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cumprod(struct csi_tensor *input, struct csi_tensor *output, struct cumprod_params *params); +int csinn_softsign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_reduce_max_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_erf_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_reduce_max(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_erf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_reduce_min_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_cumsum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); -int csi_reduce_min(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_cumsum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); -int csi_reduce_mean_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); 
+int csinn_cumprod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); -int csi_reduce_mean(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_cumprod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); -int csi_reduce_sum_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_max_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_sum(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_prod_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_min_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_prod(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_logsumexp_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_mean_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_logsumexp(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_broadcast_to_init(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); +int csinn_reduce_sum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int 
csi_broadcast_to(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); +int csinn_reduce_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_scatter_nd_init(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params); +int csinn_reduce_prod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_scatter_nd(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *updates, - struct csi_tensor *output, struct scatter_nd_params *params); +int csinn_reduce_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_clip_init(struct csi_tensor *input, struct csi_tensor *output, struct clip_params *params); +int csinn_reduce_logsumexp_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_clip(struct csi_tensor *input, struct csi_tensor *output, struct clip_params *params); +int csinn_reduce_logsumexp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_strided_slice_init(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); +int csinn_broadcast_to_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); -int csi_strided_slice(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); +int csinn_broadcast_to(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); -int csi_topk_init(struct csi_tensor *input, struct csi_tensor *output1, struct csi_tensor *output2, - struct topk_params *params); +int csinn_scatter_nd_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor 
*updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); -int csi_topk(struct csi_tensor *input, struct csi_tensor *output1, struct csi_tensor *output2, - struct topk_params *params); +int csinn_scatter_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); -int csi_non_max_suppression_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params); +int csinn_clip_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); -int csi_non_max_suppression(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct non_max_suppression_params *params); +int csinn_clip(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); -int csi_shuffle_channel_init(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); +int csinn_strided_slice_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); -int csi_shuffle_channel(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); +int csinn_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); -int csi_roipool_init(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params); +int csinn_topk_init(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); -int csi_roipool(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params); +int csinn_topk(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); -int 
csi_layer_norm_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params); +int csinn_non_max_suppression_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params); -int csi_layer_norm(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *gamma, - struct csi_tensor *beta, struct layer_norm_params *params); +int csinn_non_max_suppression(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params); -int csi_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); +int csinn_shuffle_channel_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); -int csi_cache_matmul(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *weight, - struct csi_tensor *bias, struct cache_matmul_params *params); +int csinn_shuffle_channel(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); -int csi_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); +int csinn_roipool_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params); -int csi_cache_conv1d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *weight, - struct csi_tensor *bias, struct cache_conv1d_params *params); +int csinn_roipool(struct csinn_tensor *data, struct csinn_tensor *rois, struct csinn_tensor *output, + struct csinn_roi_pool_params *params); -int csi_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, struct 
csi_tensor *kernel, - struct csi_tensor *bias, struct conv1d_params *params); +int csinn_layer_norm_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); -int csi_conv1d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv1d_params *params); +int csinn_layer_norm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); -int csi_data_convert_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); -int csi_data_convert(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int csinn_cache_matmul(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int csinn_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int csinn_cache_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int csinn_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int csinn_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int csinn_data_convert_init(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_siso_params *params); +int csinn_data_convert(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); #ifdef __cplusplus } diff --git a/include/csi_ref.h b/include/csi_ref.h deleted file mode 100644 index 0c76a8ff..00000000 --- a/include/csi_ref.h +++ /dev/null @@ -1,1195 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_REF_H_ -#define INCLUDE_CSI_REF_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_nn.h" -#include "csi_utils.h" - -#ifdef __cplusplus -extern "C" { -#endif - -int csi_ref_abs_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_abs_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_acos_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_acos_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_acosh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_acosh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_add_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_add_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_add_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_add_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_and_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_and_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_and_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_arange_f32(struct csi_tensor *output, struct arange_params *params); - -int csi_ref_arange_quant(struct csi_tensor *output, struct 
arange_params *params); - -int csi_ref_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_argmax_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_argmin_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_argmin_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_asin_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_asin_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_asinh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_asinh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_atan_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_atan_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_atanh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_atanh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_avgpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_avgpool3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_batch_normalization_f32(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct 
csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); - -int csi_ref_batch_normalization_quant(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); - -int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); - -int csi_ref_batch_to_space_quant(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); - -int csi_ref_broadcast_to_f32(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); - -int csi_ref_broadcast_to_quant(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); - -int csi_ref_ceil_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_ceil_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_clip_f32(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_ref_clip_quant(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_ref_col2im_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct col2im_params *params); - -int csi_ref_concat_f32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_ref_concat_quant(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_ref_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params); - -int csi_ref_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params); - -int csi_ref_conv2d_f32(struct csi_tensor 
*input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv2d_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv2d_relu_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_conv2d_relu_quant(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_ref_cache_matmul_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_ref_cache_matmul_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_ref_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_ref_cache_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_ref_cache_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_ref_conv2d_channel_relu_quant(struct csi_tensor *o_input, struct csi_tensor *o_output, - 
struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv2d_channel_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_relu_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_relu_quant(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_channel_relu_quant(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_channel_relu6_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_f32(struct csi_tensor 
*input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_channel_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_ref_conv3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_ref_cos_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_cos_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_cosh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_cosh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_cumprod_f32(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params); - -int csi_ref_cumprod_quant(struct csi_tensor *input, struct csi_tensor *output, - struct 
cumprod_params *params); - -int csi_ref_cumsum_f32(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params); - -int csi_ref_cumsum_quant(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params); - -int csi_ref_data_convert_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); -int csi_ref_data_convert_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_deconv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_deconv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_deconv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_deconv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_ref_deconv3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_ref_depth_to_space_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); - -int csi_ref_depth_to_space_quant(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); - -int csi_ref_div_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_div_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct 
csi_tensor *output, struct diso_params *params); - -int csi_ref_elu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_elu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_fsmn_f32(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params); - -int csi_ref_fsmn_quant(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params); - -int csi_ref_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_erf_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_erf_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_exp_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_exp_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_expand_dims_f32(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); - -int csi_ref_expand_dims_quant(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); - -int csi_ref_expm1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_expm1_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_flatten(struct csi_tensor *input, struct csi_tensor *output, - 
struct flatten_params *params); - -int csi_ref_flatten_quant(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params); - -int csi_ref_floor_divide_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_floor_divide_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_floor_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_floor_mod_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_floor_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_floor_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_fullyconnected_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_ref_fullyconnected_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_ref_gather_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params); - -int csi_ref_gather_nd_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params); - -int csi_ref_gather_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params); - -int csi_ref_gather_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params); - -int csi_ref_global_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int 
csi_ref_global_avgpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_global_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_global_maxpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_greater_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_greater_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_greater_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_greater_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_hard_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_hard_sigmoid_quant(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_im2col_f32(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params); - -int csi_ref_im2col_quant(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params); - -int csi_ref_isnan_bool_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_l2_normalization_f32(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); - -int csi_ref_l2_normalization_quant(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); - -int csi_ref_l2pool_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_layer_norm_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - 
struct layer_norm_params *params); - -int csi_ref_layer_norm_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params); - -int csi_ref_leaky_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_leaky_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_less_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_less_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_less_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_less_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_log_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_log_softmax_quant(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_log_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_log_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_log1p_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_log1p_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_logical_and_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_and_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_not_f32(struct 
csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_logical_not_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_logical_or_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_or_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_xor_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_xor_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_lrn_f32(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params); - -int csi_ref_lrn_quant(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params); - -int csi_ref_matmul_f32(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params); - -int csi_ref_matmul_quant(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params); - -int csi_ref_max_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_max_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_maximum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_maximum_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_maxpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - 
-int csi_ref_maxpool2d_locat_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_maxpool2d_locat_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_maxpool3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_mean_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_mean_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_mean_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_min_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_min_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_minimum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_minimum_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_mod_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_mul_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_mul_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_ndarray_size_f32(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - 
-int csi_ref_ndarray_size_u8(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - -int csi_ref_ndarray_size_i8(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - -int csi_ref_ndarray_size_i32(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - -int csi_ref_negative_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_negative_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_non_max_suppression_std(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params); - -int csi_ref_not_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_not_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_not_u32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_not_u8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_ref_not_i8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_ref_or_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_or_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_or_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_pad_f32(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params); - -int csi_ref_pad_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params); - -int 
csi_ref_power_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_power_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params); - -int csi_ref_prelu_quant(struct csi_tensor *input, struct csi_tensor *alpha, - struct csi_tensor *output, struct prelu_params *params); - -int csi_ref_prod_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_prod_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); - -int csi_ref_proposal_quant(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); - -int csi_ref_psroipooling_f32(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params); - -int csi_ref_psroipooling_quant(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params); - -int csi_ref_reduce_logsumexp_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_logsumexp_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_max_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_max_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_mean_f32(struct csi_tensor *input, struct csi_tensor 
*output, - struct reduce_params *params); - -int csi_ref_reduce_mean_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_min_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_min_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_prod_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_prod_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_sum_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_sum_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu1_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu6_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relun_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relun_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_reshape(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_ref_reshape_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int 
csi_ref_resize_f32(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params); - -int csi_ref_resize_quant(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params); - -int csi_ref_reverse_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params); - -int csi_ref_reverse_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params); - -int csi_ref_roi_align_f32(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_align_params *params); - -int csi_ref_roipool_f32(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params); - -int csi_ref_roipool_quant(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_pool_params *params); - -int csi_ref_round_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_round_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_rsqrt_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_rsqrt_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_scatter_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params); - -int csi_ref_scatter_nd_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params); - -int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int 
csi_ref_unsorted_segment_max_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_max_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_mean_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_mean_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_min_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_min_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_prod_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, 
struct segment_params *params); - -int csi_ref_segment_prod_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_sum_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_sum_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_select_f32(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); - -int csi_ref_select_u8(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); - -int csi_ref_select_i8(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); - -int csi_ref_shape_i32(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params); - -int csi_ref_shape_u8(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params); - -int csi_ref_shape_i8(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params); - -int csi_ref_shuffle_channel_f32(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); - -int csi_ref_shuffle_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); - -int csi_ref_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, - struct 
sigmoid_params *params); - -int csi_ref_sigmoid_quant(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_sign_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sign_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sin_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sin_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sinh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sinh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params); - -int csi_ref_slice_quant(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params); - -int csi_ref_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_softmax_quant(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_softplus_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_softplus_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_softrelu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_softrelu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_softsign_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_softsign_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct 
csi_tensor *output, - struct space_to_batch_params *params); - -int csi_ref_space_to_batch_quant(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params); - -int csi_ref_space_to_depth_f32(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); - -int csi_ref_space_to_depth_quant(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); - -int csi_ref_split_f32(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_ref_split_quant(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_ref_sqrt_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sqrt_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_square_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params); - -int csi_ref_stack_f32(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params); - -int csi_ref_stack_quant(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params); - -int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); - -int csi_ref_strided_slice_quant(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); - -int csi_ref_sub_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_sub_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_sum_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int 
csi_ref_sum_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_tan_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_tan_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_tanh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_tanh_f64(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_tanh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_threshold_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_threshold_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_tile_f32(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params); - -int csi_ref_tile_quant(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params); - -int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, - struct csi_tensor *output2, struct topk_params *params); - -int csi_ref_topk_quant(struct csi_tensor *input, struct csi_tensor *output1, - struct csi_tensor *output2, struct topk_params *params); - -int csi_ref_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -int csi_ref_transpose_quant(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -int csi_ref_trunc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_trunc_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_unpooling_f32(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params); - -int 
csi_ref_unpooling_quant(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params); - -int csi_ref_unstack_f32(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params); - -int csi_ref_unstack_qunat(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params); - -int csi_ref_xor_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_xor_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_xor_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_yuv_rgb_scale_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_yuv_rgb_scale_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int32_t csi_ref_max_internal_s32(int32_t a, int32_t b); -int32_t csi_ref_min_internal_s32(int32_t a, int32_t b); -int32_t csi_ref_get_index(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, - int32_t index3); -int32_t csi_ref_get_index_5(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, - int32_t index3, int32_t index4); -int32_t csi_ref_get_index_iter(int32_t *dim, int dim_count, int32_t *index); -float csi_ref_get_scale(int32_t multiplier, int32_t shift); -float csi_ref_dequantize_u8_to_f32(uint8_t input, struct csi_quant_info *qinfo); -float csi_ref_dequantize_i8_to_f32(int8_t input, struct csi_quant_info *qinfo); -uint8_t csi_ref_quantize_f32_to_u8(float input, struct csi_quant_info *qinfo); -int8_t csi_ref_quantize_f32_to_i8(float input, struct csi_quant_info *qinfo); -uint8_t csi_ref_quantize_channel_u8(int32_t data, struct csi_tensor *input, - struct csi_tensor *output, float wscale); -int8_t csi_ref_quantize_channel_i8(int32_t data, struct 
csi_tensor *input, - struct csi_tensor *output, float wscale); -float csi_ref_uint8_to_float(uint8_t i, struct csi_tensor *t); -float csi_ref_int8_to_float(int8_t i, struct csi_tensor *t); -int16_t csi_ref_float32_to_float16(float value); -float csi_ref_float16_to_float32(int16_t value); -int16_t csi_ref_float32_to_bfloat16(float value); -float csi_ref_bfloat16_to_float32(int16_t value); -struct csi_tensor *csi_ref_nchw_to_nhwc_8(struct csi_tensor *t); -void csi_ref_nhwc_to_nchw_8(struct csi_tensor *nt, struct csi_tensor *t); -struct csi_tensor *csi_ref_deconv_kernel_nchw_to_nhwc_f32(struct csi_tensor *t, int32_t permute[4]); -struct csi_tensor *csi_ref_nchw_to_nhwc_f32(struct csi_tensor *t); -void csi_ref_nhwc_to_nchw_f32(struct csi_tensor *nt, struct csi_tensor *t); -int32_t csi_ref_get_reduction_index(int32_t k, const int32_t *strides, const int32_t *extents, - int32_t n); -struct csi_tensor *csi_ref_alloc_float_tensor(struct csi_tensor *src); -void csi_ref_free_float_tensor(struct csi_tensor *src); -struct csi_tensor *csi_ref_convert_float_tensor(struct csi_tensor *src); -void csi_ref_conv_free_float_tensor(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias); -struct csi_tensor *csi_ref_tensor_transform_f32(struct csi_tensor *input); -int csi_ref_tensor_transform_free_f32(struct csi_tensor *input); -uint8_t *csi_ref_f32_to_input_dtype(uint32_t index, float *data, struct csi_session *sess); - -struct csi_ref_diso_callback { - void (*bc)(); - struct csi_tensor *input0; - struct csi_tensor *input1; - struct csi_tensor *output; - int32_t *input_dim; -}; - -void *csi_init_map_ref(int op, int dtype); - -int csi_ref_diso_broadcast_base(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params, - struct csi_ref_diso_callback *cb); -int csi_ref_broadcast_to_shape(struct csi_tensor *input, struct csi_tensor *output, int32_t *shape, - int32_t shape_count); -int 
csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor *output, - int32_t *shape, int32_t shape_count); -int csi_ref_broadcast_to_shape_quant(struct csi_tensor *input, struct csi_tensor *output, - int32_t *shape, int32_t shape_count); - -int csi_ref_siso_callback_base(struct csi_tensor *input, struct csi_tensor *output, void *params, - void *cb); -int csi_ref_diso_callback_base(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, void *params, void *cb); -int csi_ref_conv_callback_base(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, void *params, - void *cb); - -void csi_ref_nn_init(struct csi_tensor *input, struct csi_tensor *output); - -void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output); - -int csi_ref_flatten_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_ref_reshape_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_ref_transpose_init(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -void asr_buffer_init(struct asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth); - -void *asr_buffer_insert_front(struct asr_buffer_t *buffer, void *input, size_t len); - -void *asr_buffer_insert_back(struct asr_buffer_t *buffer, void *input, size_t len); - -void *asr_buffer_get_buffer(struct asr_buffer_t *buffer); - -void asr_buffer_reset(struct asr_buffer_t *buffer); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_CSI_REF_H_ diff --git a/include/csi_ref_i805.h b/include/csi_ref_i805.h deleted file mode 100644 index 5bc64166..00000000 --- a/include/csi_ref_i805.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_REF_I805_H_ -#define INCLUDE_CSI_REF_I805_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_nnfunctions.h" -#include "csi_ref.h" -#include "csi_utils.h" - -int csi_ref_i805_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_i805_conv2d_init_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_i805_depthwise_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_i805_avgpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_i805_maxpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_i805_fullyconnected_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_ref_i805_fullyconnected_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int 
csi_ref_i805_softmax_q7(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_i805_softmax_q15(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_i805_relu_q7(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_i805_relu_q15(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_i805_sigmoid_q7(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_i805_sigmoid_q15(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_i805_tanh_q7(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_i805_tanh_q15(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -#endif // INCLUDE_CSI_REF_I805_H_ diff --git a/include/csi_thead_rvv.h b/include/csi_thead_rvv.h deleted file mode 100644 index 2f4a8da8..00000000 --- a/include/csi_thead_rvv.h +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_THEAD_RVV_H_ -#define INCLUDE_CSI_THEAD_RVV_H_ - -#include -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_utils.h" - -#ifdef __cplusplus -extern "C" { -#endif - -int csi_nn_rvv_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_depthwise_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_fullyconnected_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -/************************************ convolution *********************************/ -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv_im2col_gemm_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv_im2col_gemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv_im2col_gemm_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct 
conv2d_params *params); - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int4(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv1x1s1_gemm_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv1x1s1_gemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int8(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv1x1s1_gemm_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int4(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv1x1s1_gemm_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp32(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -int csi_nn_rvv_conv3x3s1_winograd64_packn_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -int 
csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s1_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s2_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx); -void csi_nn_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx); -void csi_nn_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias); - -void csi_nn_rvv256_reorder_input_z16_fp32(float *b, float *sb, int k, int n, int ldx); -void 
csi_nn_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias); - -void csi_nn_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); -void csi_nn_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); -void csi_nn_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, int n, - int ldc, __fp16 *bias); - -void csi_nn_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); -void csi_nn_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); -void csi_nn_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, - int n, int ldc, __fp16 *bias); - -void csi_nn_rvv_reorder_kernel_n8_int8(int8_t *a, int8_t *sa, int m, int k, int ldx); -void csi_nn_rvv_reorder_input_z8_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); -void csi_nn_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, - int n, int ldc, int32_t *bias); -void csi_nn_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, - int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, - int32_t *shift); - -void csi_nn_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); -void csi_nn_rvv256_gemm_8x16_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, - int n, int ldc, int32_t *bias); - -void csi_nn_rvv_reorder_input_n8_int4(int8_t *a, int8_t *sa, int m, int k, int ldx); -void csi_nn_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb, int n, int k, int ldx); -void csi_nn_rvv_gemm_8x8_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, - int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, - int32_t *shift); - -/************************************ pooling *********************************/ -int csi_nn_rvv_avgpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params 
*params); - -int csi_nn_rvv_avgpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool2x2s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool2x2s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s1_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s1_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_fp16(struct 
csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s1_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s1_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s1_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_global_avgpool2d_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_global_avgpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_global_maxpool2d_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_global_maxpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -/************************************ fullyconnected *********************************/ -void csi_nn_rvv_fc_gemv_transform_weight_fp32(struct csi_tensor *weights); - -int csi_nn_rvv_fullyconnected_packn_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -void csi_nn_rvv_fc_gemv_transform_weight_fp16(struct csi_tensor *weights); - -int csi_nn_rvv_fullyconnected_packn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct 
csi_tensor *bias, - struct fc_params *params); - -void csi_nn_rvv_fc_gemv_transform_weight_int8(struct csi_tensor *weights); - -int csi_nn_rvv_fullyconnected_packn_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -/************************************ activation *********************************/ -int csi_nn_rvv_relu_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_relu_int8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_leaky_relu_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_leaky_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_leaky_relu_int8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_sigmoid_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_nn_rvv_softmax_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -/************************************ layout/memory transform *********************************/ -int csi_nn_rvv_concat_fp32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_nn_rvv_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_nn_rvv_concat_int8(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -/************************************ basic math *********************************/ -int csi_nn_rvv_add_fp32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int 
csi_nn_rvv_add_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_add_int8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_mul_fp32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_mul_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_mul_int8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_sum_stride_int8(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -/************************************ utils *********************************/ -void csi_nn_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left); - -void csi_nn_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left); - -void csi_nn_rvv_pad_input_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left, - int8_t pad_value); - -void csi_nn_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size); - -void csi_nn_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size); - -void csi_nn_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_padded, int inc, - int inh, int inw, int padded_h, int padded_w, int pad_top, - int pad_left, int8_t pad_value); -void csi_nn_rvv_int4_to_int8(int8_t *src, int8_t *dst, int size); -void csi_nn_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size); -void csi_nn_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size); - -int csrr_vl(); -int 
csrr_vlenb(); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_CSI_THEAD_RVV_H_ diff --git a/include/csi_utils.h b/include/csi_utils.h deleted file mode 100644 index cb275726..00000000 --- a/include/csi_utils.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_UTILS_H_ -#define INCLUDE_CSI_UTILS_H_ - -#include -#include -#include -#include -#include -#include -#include -#if (!defined CSI_BUILD_RTOS) -#include -#endif -#include "csi_internal.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* misc */ -void csi_get_top5(float *buf, uint32_t size, float *prob, uint32_t *cls); -void csi_show_top5(struct csi_tensor *output, struct csi_session *sess); -uint64_t csi_get_timespec(); -void csi_print_time_interval(uint64_t start, uint64_t end, const char *msg); -void csi_statistical_mean_std(float *data, int sz); -void csi_quantize_multiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift); - -/* tensor */ -int csi_tensor_size(struct csi_tensor *tensor); -int csi_tensor_byte_size(struct csi_tensor *tensor); -struct csi_tensor *csi_alloc_tensor(struct csi_session *session); -void csi_free_tensor(struct csi_tensor *tensor); -void csi_realloc_quant_info(struct csi_tensor *tensor, int quant_info_num); -void csi_tensor_copy(struct csi_tensor 
*dest, struct csi_tensor *src); -int csi_tensor_data_convert(struct csi_tensor *dest, struct csi_tensor *src); - -/* op parameters */ -void *csi_alloc_params(int params_size, struct csi_session *session); -void csi_free_params(void *params); - -/* session */ -struct csi_session *csi_alloc_session(); -void csi_free_session(struct csi_session *session); -void csi_session_init(struct csi_session *session); -void csi_session_deinit(struct csi_session *session); -int csi_session_setup(struct csi_session *session); -int csi_session_run(struct csi_session *session); -int csi_load_binary_model(char *path, struct csi_session *session); - -/* input/output */ -void csi_set_input_number(int number, struct csi_session *sess); -void csi_set_output_number(int number, struct csi_session *sess); -int csi_get_input_number(struct csi_session *sess); -int csi_get_output_number(struct csi_session *sess); -int csi_set_input(int index, struct csi_tensor *input, struct csi_session *sess); -int csi_set_output(int index, struct csi_tensor *output, struct csi_session *sess); -int csi_get_input(int index, struct csi_tensor *input, struct csi_session *sess); -int csi_get_output(int index, struct csi_tensor *output, struct csi_session *sess); -int csi_update_input(int index, struct csi_tensor *input, struct csi_session *sess); -int csi_update_output(int index, struct csi_tensor *output, struct csi_session *sess); -int csi_set_tensor_entry(struct csi_tensor *tensor, struct csi_session *sess); - -/* - * model setup and run - */ -void csi_nn_init(struct csi_tensor *input, struct csi_tensor *output); - -void csi_nn_setup(void *td); - -void csi_nn_run(void *td); - -void csi_nn_postprocess(void *td); - -void csi_nn_deinit(struct csi_tensor *input, struct csi_tensor *output); - -void *csi_nn_presetup(int input, int output); -void *csi_bc_map(int api, int rmode, int op, int dtype); -void *csi_init_map(int api, int op, int dtype); - -struct csi_bc_op_list *csi_bc_list_end(struct csi_bc_op_list *list); 
-void *csi_bc_list_match(struct csi_bc_op_list *list, enum csinn_dtype_enum dtype, - enum csinn_op_enum op_name); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_CSI_UTILS_H_ diff --git a/include/csi_internal.h b/include/csinn_data_structure.h similarity index 72% rename from include/csi_internal.h rename to include/csinn_data_structure.h index ddb3be7e..25161ac6 100644 --- a/include/csi_internal.h +++ b/include/csinn_data_structure.h @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #ifndef INCLUDE_CSI_INTERNAL_H_ #define INCLUDE_CSI_INTERNAL_H_ @@ -46,6 +46,8 @@ enum csinn_mem_type_enum { CSINN_MEM_TYPE_CPU_NOT_ALIGNED = 0, CSINN_MEM_TYPE_CPU_ALIGNED, CSINN_MEM_TYPE_DMABUF, + CSINN_MEM_TYPE_ASP42, /* structed sparsity 4:2 */ + CSINN_MEM_TYPE_ASP41, /* structed sparsity 4:1 */ }; /* quant type */ @@ -134,7 +136,6 @@ enum csinn_op_enum { CSINN_OP_CONV2D_CHANNEL, CSINN_OP_CONV2D_CHANNEL_RELU, CSINN_OP_CONV2D_CHANNEL_RELU6, - CSINN_OP_DATA_CONVERT, CSINN_OP_DEPTHWISE_CONV2D, CSINN_OP_DEPTHWISE_CONV2D_RELU, CSINN_OP_DEPTHWISE_CONV2D_RELU6, @@ -147,6 +148,7 @@ enum csinn_op_enum { CSINN_OP_GROUP_CONV2D_CHANNEL, CSINN_OP_GROUP_CONV2D_CHANNEL_RELU, CSINN_OP_CONV3D, + CSINN_OP_DATA_CONVERT, CSINN_OP_COS, CSINN_OP_COSH, CSINN_OP_CROP, @@ -284,7 +286,16 @@ enum csinn_op_enum { CSINN_OP_XOR, CSINN_OP_YUV_RGB_SCALE, - /* utils functions */ + CSINN_OP_SIZE, + + /* graph */ + CSINN_TENSOR, + CSINN_SUBGRAPH, + CSINN_SUBGRAPH_RETURN, + CSINN_OP_AND_UTILS_SIZE, +}; + +enum csinn_runtime_enum { CSINN_SESSION_INIT, CSINN_SESSION_DEINIT, CSINN_SESSION_SETUP, @@ -301,12 +312,7 @@ enum csinn_op_enum { CSINN_GET_OUTPUT, CSINN_TENSOR_ENTRY, CSINN_LOAD_BG, - - /* graph */ - CSINN_TENSOR, - CSINN_SUBGRAPH, - CSINN_SUBGRAPH_RETURN, - CSINN_OP_AND_UTILS_SIZE, + CSINN_RUNTIME_OP_SIZE, }; /* convolution mode */ @@ -354,6 +360,8 @@ enum csinn_layout_enum { // WEIGHT CSINN_LAYOUT_O, CSINN_LAYOUT_OI, + CSINN_LAYOUT_O16I16, + 
CSINN_LAYOUT_O32I32, CSINN_LAYOUT_OIW, CSINN_LAYOUT_OIHW, CSINN_LAYOUT_OIDHW, @@ -367,8 +375,16 @@ enum csinn_layout_enum { // WEIGHT CSINN_LAYOUT_OWI, CSINN_LAYOUT_OHWI, + CSINN_LAYOUT_O16HWI16, + CSINN_LAYOUT_O32HWI32, CSINN_LAYOUT_ODHWI, CSINN_LAYOUT_1HWO, // depthwise kernel + CSINN_LAYOUT_1HW16O16, + CSINN_LAYOUT_1HW32O32, + + // NCXHWX + // ACTIVITION + CSINN_LAYOUT_NC1HWC0, // rvv: c0=4/8/8 for fp32/fp16/int8 when vlen=128 }; enum csinn_status_enum { @@ -384,7 +400,15 @@ enum csinn_profiler_enum { CSI_PROFILER_LEVEL_TIMER, // print time }; -struct csi_quant_info { +enum csinn_debug_enum { + CSINN_DEBUG_LEVEL_DEBUG = -2, + CSINN_DEBUG_LEVEL_INFO, + CSINN_DEBUG_LEVEL_WARNING, + CSINN_DEBUG_LEVEL_ERROR, + CSINN_DEBUG_LEVEL_FATAL, +}; + +struct csinn_quant_info { int32_t zero_point; float scale; int32_t multiplier; @@ -394,7 +418,7 @@ struct csi_quant_info { }; #define MAX_DIM 8 -struct csi_tensor { +struct csinn_tensor { void *data; enum csinn_dtype_enum dtype; enum csinn_mem_type_enum mtype; @@ -404,48 +428,53 @@ struct csi_tensor { char *name; int32_t layout; int32_t quant_channel; - struct csi_quant_info *qinfo; - struct csi_session *sess; + struct csinn_quant_info *qinfo; + struct csinn_session *sess; }; -struct csi_session { +struct csinn_model { + char *bm_path; + void *bm_addr; + size_t bm_size; + int32_t save_mode; + int32_t priority; +}; + +struct csinn_session { int32_t base_dtype; int32_t base_layout; int32_t base_api; int32_t base_run_mode; enum csinn_quant_enum base_quant_type; - char *model_name; - int32_t model_save; + struct csinn_model model; int32_t debug_level; int32_t profiler_level; int32_t input_num; int32_t output_num; - struct csi_tensor **input; - struct csi_tensor **output; + struct csinn_tensor **input; + struct csinn_tensor **output; void *td; }; -struct csi_scale_zp { - float scale; - int32_t zero_point; -}; - -struct csi_min_max { - float min; - float max; +struct csinn_callback { + int (*init)(); // initialization + int (*est)(); 
// establish graph + int (*exec)(); // execute real compute + int (*caps)(); // capabilities + int (*perf)(); // profiling }; -struct csi_params_base { - int (*bc)(); +struct csinn_params_base { + struct csinn_callback *cb; char *name; int32_t layout; int32_t api; - int32_t run_mode; - struct csi_session *sess; + enum csinn_quant_enum quant_type; + struct csinn_session *sess; }; -struct fsmn_params { - struct csi_params_base base; +struct csinn_fsmn_params { + struct csinn_params_base base; int32_t l_order; int32_t r_order; int32_t l_stride; @@ -453,8 +482,8 @@ struct fsmn_params { int32_t unavailable_frames; }; -struct conv2d_params { - struct csi_params_base base; +struct csinn_conv2d_params { + struct csinn_params_base base; int32_t group; int32_t stride_height; int32_t stride_width; @@ -464,15 +493,17 @@ struct conv2d_params { int32_t pad_right; int32_t dilation_height; int32_t dilation_width; + int32_t out_pad_height; + int32_t out_pad_width; struct { - struct csi_tensor *kernel_tm; + struct csinn_tensor *kernel_tm; enum csinn_conv_mode_enum conv_mode; int32_t fuse_zp2bias; } conv_extra; }; -struct conv3d_params { - struct csi_params_base base; +struct csinn_conv3d_params { + struct csinn_params_base base; int32_t group; int32_t stride_depth; int32_t stride_height; @@ -491,16 +522,16 @@ struct conv3d_params { int32_t out_pad_width; }; -struct fc_params { - struct csi_params_base base; +struct csinn_fc_params { + struct csinn_params_base base; int32_t units; struct { int32_t fuse_zp2bias; } fc_extra; }; -struct pool_params { - struct csi_params_base base; +struct csinn_pool_params { + struct csinn_params_base base; int32_t pool_type; int32_t filter_height; int32_t filter_width; @@ -518,16 +549,16 @@ struct pool_params { bool count_include_pad; }; -struct unpooling_params { - struct csi_params_base base; +struct csinn_unpooling_params { + struct csinn_params_base base; int32_t scale_height; int32_t scale_width; int32_t pad_out_height; int32_t pad_out_width; }; 
-struct roi_align_params { - struct csi_params_base base; +struct csinn_roi_align_params { + struct csinn_params_base base; int32_t pooled_size_h; int32_t pooled_size_w; float spatial_scale; @@ -536,8 +567,8 @@ struct roi_align_params { int32_t sample_ratio; }; -struct roi_pool_params { - struct csi_params_base base; +struct csinn_roi_pool_params { + struct csinn_params_base base; int32_t pooled_size_h; int32_t pooled_size_w; float spatial_scale; @@ -545,20 +576,20 @@ struct roi_pool_params { int32_t spatial_scale_shift; }; -struct siso_params { - struct csi_params_base base; +struct csinn_siso_params { + struct csinn_params_base base; }; -struct scatter_nd_params { - struct csi_params_base base; +struct csinn_scatter_nd_params { + struct csinn_params_base base; }; -struct sigmoid_params { - struct csi_params_base base; +struct csinn_sigmoid_params { + struct csinn_params_base base; }; -struct relu_params { - struct csi_params_base base; +struct csinn_relu_params { + struct csinn_params_base base; /* n / alpha / threshold */ float n; @@ -566,25 +597,25 @@ struct relu_params { int32_t n_shift; }; -struct prelu_params { - struct csi_params_base base; +struct csinn_prelu_params { + struct csinn_params_base base; int32_t axis; }; -struct softmax_params { - struct csi_params_base base; +struct csinn_softmax_params { + struct csinn_params_base base; int32_t axis; }; -struct bn_params { - struct csi_params_base base; +struct csinn_bn_params { + struct csinn_params_base base; float epsilon; int32_t epsilon_multiplier; int32_t epsilon_shift; }; -struct l2n_params { - struct csi_params_base base; +struct csinn_l2n_params { + struct csinn_params_base base; float epsilon; int32_t epsilon_multiplier; int32_t epsilon_shift; @@ -592,8 +623,8 @@ struct l2n_params { int32_t n; }; -struct lrn_params { - struct csi_params_base base; +struct csinn_lrn_params { + struct csinn_params_base base; int32_t range; double bias; int32_t bias_multiplier; @@ -607,22 +638,22 @@ struct lrn_params 
{ enum csinn_lrn_enum norm_region; }; -struct matmul_params { - struct csi_params_base base; +struct csinn_matmul_params { + struct csinn_params_base base; bool trans_a; bool trans_b; }; -struct diso_params { - struct csi_params_base base; +struct csinn_diso_params { + struct csinn_params_base base; }; -struct select_params { - struct csi_params_base base; +struct csinn_select_params { + struct csinn_params_base base; }; -struct pad_params { - struct csi_params_base base; +struct csinn_pad_params { + struct csinn_params_base base; int32_t *pad_before; int32_t *pad_after; int32_t pad_num; @@ -630,20 +661,20 @@ struct pad_params { enum csinn_pad_enum pad_mode; }; -struct resize_params { - struct csi_params_base base; +struct csinn_resize_params { + struct csinn_params_base base; enum csinn_resize_enum resize_mode; bool align_corners; }; -struct concat_params { - struct csi_params_base base; +struct csinn_concat_params { + struct csinn_params_base base; int32_t inputs_count; int32_t axis; }; -struct proposal_params { - struct csi_params_base base; +struct csinn_proposal_params { + struct csinn_params_base base; float *scales; int32_t *scale_multipliers; int32_t *scale_shifts; @@ -662,8 +693,8 @@ struct proposal_params { bool iou_loss; }; -struct psroipooling_params { - struct csi_params_base base; +struct csinn_psroipooling_params { + struct csinn_params_base base; int32_t output_dim; int32_t group_size; float spatial_scale; @@ -671,72 +702,72 @@ struct psroipooling_params { int32_t spatial_scale_shift; }; -struct transpose_params { - struct csi_params_base base; +struct csinn_transpose_params { + struct csinn_params_base base; int32_t *permute; int32_t permute_num; }; -struct reshape_params { - struct csi_params_base base; +struct csinn_reshape_params { + struct csinn_params_base base; int32_t *shape; int32_t shape_num; }; -struct shape_params { - struct csi_params_base base; +struct csinn_shape_params { + struct csinn_params_base base; }; -struct expand_dims_params 
{ - struct csi_params_base base; +struct csinn_expand_dims_params { + struct csinn_params_base base; int32_t axis; }; -struct reverse_params { - struct csi_params_base base; +struct csinn_reverse_params { + struct csinn_params_base base; int32_t axis; }; -struct flatten_params { - struct csi_params_base base; +struct csinn_flatten_params { + struct csinn_params_base base; }; -struct crop_params { - struct csi_params_base base; +struct csinn_crop_params { + struct csinn_params_base base; int32_t axis; int32_t *offset; int32_t offset_num; }; -struct slice_params { - struct csi_params_base base; +struct csinn_slice_params { + struct csinn_params_base base; int32_t *begin; int32_t *end; int32_t *strides; int32_t slice_num; }; -struct split_params { - struct csi_params_base base; +struct csinn_split_params { + struct csinn_params_base base; int32_t *split_index; int32_t output_num; int32_t axis; }; -struct stack_params { - struct csi_params_base base; +struct csinn_stack_params { + struct csinn_params_base base; int32_t inputs_count; int32_t axis; }; -struct tile_params { - struct csi_params_base base; +struct csinn_tile_params { + struct csinn_params_base base; int32_t *reps; int32_t reps_num; }; -struct arange_params { - struct csi_params_base base; +struct csinn_arange_params { + struct csinn_params_base base; float start; int32_t start_multiplier; int32_t start_shift; @@ -748,42 +779,36 @@ struct arange_params { int32_t step_shift; }; -struct where_params { - struct csi_params_base base; +struct csinn_where_params { + struct csinn_params_base base; }; -struct unstack_params { - struct csi_params_base base; +struct csinn_unstack_params { + struct csinn_params_base base; int32_t outputs_count; int32_t axis; }; -struct take_params { - struct csi_params_base base; +struct csinn_gather_params { + struct csinn_params_base base; int32_t axis; - const char *mode; }; - -struct gather_params { - struct csi_params_base base; - int32_t axis; -}; -struct gather_nd_params { - 
struct csi_params_base base; +struct csinn_gather_nd_params { + struct csinn_params_base base; }; -struct squeeze_params { - struct csi_params_base base; +struct csinn_squeeze_params { + struct csinn_params_base base; int32_t *axis; int32_t axis_num; }; -struct ndarray_size_params { - struct csi_params_base base; +struct csinn_ndarray_size_params { + struct csinn_params_base base; }; -struct space_to_batch_params { - struct csi_params_base base; +struct csinn_space_to_batch_params { + struct csinn_params_base base; int32_t pad_top; int32_t pad_bottom; int32_t pad_left; @@ -791,15 +816,15 @@ struct space_to_batch_params { int32_t block_size; }; -struct space_to_batch_nd_params { - struct csi_params_base base; +struct csinn_space_to_batch_nd_params { + struct csinn_params_base base; int32_t *paddings; int32_t *block_shape; int32_t spatial_dim_cnt; }; -struct batch_to_space_params { - struct csi_params_base base; +struct csinn_batch_to_space_params { + struct csinn_params_base base; int32_t crop_top; int32_t crop_bottom; int32_t crop_left; @@ -807,26 +832,26 @@ struct batch_to_space_params { int32_t block_size; }; -struct batch_to_space_nd_params { - struct csi_params_base base; +struct csinn_batch_to_space_nd_params { + struct csinn_params_base base; int32_t *crops; int32_t *block_shape; int32_t spatial_dim_cnt; }; -struct space_to_depth_params { - struct csi_params_base base; +struct csinn_space_to_depth_params { + struct csinn_params_base base; int32_t block_size; }; -struct depth_to_space_params { - struct csi_params_base base; +struct csinn_depth_to_space_params { + struct csinn_params_base base; enum csinn_depth2space_enum mode; int32_t block_size; }; -struct one_hot_params { - struct csi_params_base base; +struct csinn_one_hot_params { + struct csinn_params_base base; float f_on_value; float f_off_value; int32_t on_value; @@ -835,16 +860,16 @@ struct one_hot_params { int32_t axis; }; -struct sequence_mask_params { - struct csi_params_base base; +struct 
csinn_sequence_mask_params { + struct csinn_params_base base; float mask_value; int32_t mask_value_multiplier; int32_t mask_value_shift; int32_t axis; }; -struct im2col_params { - struct csi_params_base base; +struct csinn_im2col_params { + struct csinn_params_base base; int32_t pad_top; int32_t pad_down; int32_t pad_left; @@ -855,16 +880,16 @@ struct im2col_params { int32_t kernel_w; }; -struct col2im_params { - struct csi_params_base base; +struct csinn_col2im_params { + struct csinn_params_base base; int32_t pad_h; int32_t pad_w; int32_t stride_h; int32_t stride_w; }; -struct reduce_params { - struct csi_params_base base; +struct csinn_reduce_params { + struct csinn_params_base base; int32_t *out_strides; int32_t *out_extents; int32_t n; @@ -877,76 +902,76 @@ struct reduce_params { bool keepdims; }; -struct reorg_params { - struct csi_params_base base; +struct csinn_reorg_params { + struct csinn_params_base base; int32_t stride; }; -struct segment_params { - struct csi_params_base base; +struct csinn_segment_params { + struct csinn_params_base base; int32_t num_segments; bool unsorted; }; -struct cumsum_params { - struct csi_params_base base; +struct csinn_cumsum_params { + struct csinn_params_base base; int32_t axis; bool exclusive; }; -struct cumprod_params { - struct csi_params_base base; +struct csinn_cumprod_params { + struct csinn_params_base base; int32_t axis; bool exclusive; }; -struct broadcast_to_params { - struct csi_params_base base; +struct csinn_broadcast_to_params { + struct csinn_params_base base; int32_t *shape; int32_t shape_count; }; -struct clip_params { - struct csi_params_base base; +struct csinn_clip_params { + struct csinn_params_base base; float min_value; float max_value; }; -struct strided_slice_params { - struct csi_params_base base; +struct csinn_strided_slice_params { + struct csinn_params_base base; int32_t *begin; int32_t *end; int32_t *stride; int32_t slice_count; }; -struct shuffle_channel_params { - struct csi_params_base 
base; +struct csinn_shuffle_channel_params { + struct csinn_params_base base; int32_t group; }; -struct topk_params { - struct csi_params_base base; +struct csinn_topk_params { + struct csinn_params_base base; int32_t k; }; -struct non_max_suppression_params { - struct csi_params_base base; +struct csinn_non_max_suppression_params { + struct csinn_params_base base; int32_t max_output_size; float iou_threshold; // float score_threshold; }; // modyfied to use asr model -struct layer_norm_params { - struct csi_params_base base; +struct csinn_layer_norm_params { + struct csinn_params_base base; float epsilon; bool center; bool scale; int32_t axis; }; -struct asr_buffer_t { +struct csinn_asr_buffer_t { size_t writer_index; size_t buffer_lenth; // lenth of buffer size_t data_lenth; // lenth of data @@ -954,18 +979,18 @@ struct asr_buffer_t { uint8_t flag; }; -struct cache_matmul_params { - struct csi_params_base base; - struct asr_buffer_t asr_buffer; +struct csinn_cache_matmul_params { + struct csinn_params_base base; + struct csinn_asr_buffer_t asr_buffer; int32_t *cache_shape; int32_t *shape; int32_t *axes; void *data; }; -struct cache_conv1d_params { - struct csi_params_base base; - struct asr_buffer_t asr_buffer; +struct csinn_cache_conv1d_params { + struct csinn_params_base base; + struct csinn_asr_buffer_t asr_buffer; int32_t *cache_shape; int32_t *in_shape; int32_t group; @@ -976,8 +1001,8 @@ struct cache_conv1d_params { void *data; }; -struct conv1d_params { - struct csi_params_base base; +struct csinn_conv1d_params { + struct csinn_params_base base; int32_t group; int32_t stride_width; int32_t dilation_width; @@ -985,11 +1010,4 @@ struct conv1d_params { int32_t pad_right; }; -struct csi_bc_op_list { - struct csi_bc_op_list *next; - enum csinn_dtype_enum dtype; - enum csinn_op_enum op_name; - void *bc; -}; - #endif // INCLUDE_CSI_INTERNAL_H_ diff --git a/include/csinn_runtime.h b/include/csinn_runtime.h new file mode 100644 index 00000000..18c7dd0e --- /dev/null 
+++ b/include/csinn_runtime.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_CSINN_RUNTIME_H_ +#define INCLUDE_CSINN_RUNTIME_H_ + +#include +#include +#include +#include +#include +#include +#include +#if (!defined SHL_BUILD_RTOS) +#include +#endif +#include "csinn_data_structure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VERSION_MAJOR 2 +#define VERSION_MINOR 0 +#define VERSION_PATCH 5 +#define VERSION_SHIFT 8 +int csinn_version(char *vstr); + +/* tensor */ +int csinn_tensor_size(struct csinn_tensor *tensor); +int csinn_tensor_byte_size(struct csinn_tensor *tensor); +struct csinn_tensor *csinn_alloc_tensor(struct csinn_session *session); +void csinn_free_tensor(struct csinn_tensor *tensor); +void csinn_realloc_quant_info(struct csinn_tensor *tensor, int quant_info_num); +void csinn_tensor_copy(struct csinn_tensor *dest, struct csinn_tensor *src); +int csinn_tensor_data_convert(struct csinn_tensor *dest, struct csinn_tensor *src); +int csinn_tensor_layout_convert(struct csinn_tensor *dest, struct csinn_tensor *src); + +/* op parameters */ +void *csinn_alloc_params(int params_size, struct csinn_session *session); +void csinn_free_params(void *params); + +/* session */ +struct csinn_session *csinn_alloc_session(); +void csinn_free_session(struct 
csinn_session *session); +void csinn_session_init(struct csinn_session *session); +void csinn_session_deinit(struct csinn_session *session); +int csinn_session_setup(struct csinn_session *session); +int csinn_session_run(struct csinn_session *session); +int csinn_load_binary_model(struct csinn_session *session); +struct csinn_session *__attribute__((weak)) csinn_import_binary_model(char *bm_addr); + +/* input/output */ +void csinn_set_input_number(int number, struct csinn_session *sess); +void csinn_set_output_number(int number, struct csinn_session *sess); +int csinn_get_input_number(struct csinn_session *sess); +int csinn_get_output_number(struct csinn_session *sess); +int csinn_set_input(int index, struct csinn_tensor *input, struct csinn_session *sess); +int csinn_set_output(int index, struct csinn_tensor *output, struct csinn_session *sess); +int csinn_get_input(int index, struct csinn_tensor *input, struct csinn_session *sess); +int csinn_get_output(int index, struct csinn_tensor *output, struct csinn_session *sess); +int csinn_update_input(int index, struct csinn_tensor *input, struct csinn_session *sess); +int csinn_update_output(int index, struct csinn_tensor *output, struct csinn_session *sess); +int csinn_set_tensor_entry(struct csinn_tensor *tensor, struct csinn_session *sess); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_CSINN_RUNTIME_H_ diff --git a/include/include_xt800/csi_i805_nnfunction.h b/include/include_xt800/csi_i805_nnfunction.h deleted file mode 100644 index 11a47c42..00000000 --- a/include/include_xt800/csi_i805_nnfunction.h +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nnfunctions.h - * Description: Public header file for CSI NN Library - * - * -------------------------------------------------------------------- */ - -#ifndef INCLUDE_INCLUDE_XT800_CSI_I805_NNFUNCTION_H_ -#define INCLUDE_INCLUDE_XT800_CSI_I805_NNFUNCTION_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "csky_vdsp2_nnfunctions.h" - -/** - * @brief u8 asym quant generic convolution optimized function - * @param[in] input_data pointer to input tensor data - * @param[in] kernel_data pointer to kernel tensor data - * @param[in] bias_data pointer to bias tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in,out] bufferA pointer to buffer for input/im2col data - * @param[in] input_h input height - * @param[in] input_w input width - * @param[in] input_ch input channel / output_channel - * @param[in] kernel_h kernel height - * @param[in] kernel_w kernel width - * @param[in] pad_h pad on height - * @param[in] pad_w pad on width - * @param[in] stride_h stride on height - * @param[in] stride_w stride on width - * @param[in] out_h output height - * @param[in] out_w output width - * @param[in] input_zero_point input zero_point - * @param[in] kernel_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right - * @return none. 
- * bufferA size: 2*input_ch*kernel_h*kernel_w - */ -void csi_i805_conv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, - uint8_t *output_data, uint8_t *bufferA, int32_t input_h, - int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, - int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, - int32_t out_h, int32_t out_w, int32_t out_c, int32_t input_zero_point, - int32_t weight_zero_point, int32_t output_zero_point, int32_t out_mult, - int32_t out_shift); - -/** - * @brief u8 asym quant 1x1 kernel_size convolution (pointwise convolution) optimized function - * @param[in] input_data pointer to input tensor data - * @param[in] kernel_data pointer to kernel tensor data - * @param[in] bias_data pointer to bias tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] input_hxw input height mul width - * @param[in] input_ch input channel - * @param[in] output_ch output_channel - * @param[in] input_zero_point input zero_point - * @param[in] kernel_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right - * @return none. 
- * - */ -void csi_i805_pwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, - uint8_t *output_data, int32_t input_hxw, int32_t input_ch, - int32_t output_ch, int32_t input_zero_point, - int32_t weight_zero_point, int32_t output_zero_point, - int32_t out_mult, int32_t out_shift); - -/** - * @brief u8 asym quant depthwise convolution optimized function - * @param[in] input_data pointer to input tensor data - * @param[in] kernel_data pointer to kernel tensor data - * @param[in] bias_data pointer to bias tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in,out] bufferA pointer to buffer for input/im2col data - * @param[in] input_h input height - * @param[in] input_w input width - * @param[in] input_ch input channel / output_channel - * @param[in] kernel_h kernel height - * @param[in] kernel_w kernel width - * @param[in] pad_h pad on height - * @param[in] pad_w pad on width - * @param[in] stride_h stride on height - * @param[in] stride_w stride on width - * @param[in] out_h output height - * @param[in] out_w output width - * @param[in] input_zero_point input zero_point - * @param[in] kernel_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right - * @return none. 
- * bufferA size: 4*input_ch*kernel_h*kernel_w - */ -void csi_i805_dwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, - uint8_t *output_data, uint8_t *bufferA, int32_t input_h, - int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, - int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, - int32_t out_h, int32_t out_w, int32_t input_zero_point, - int32_t weight_zero_point, int32_t output_zero_point, - int32_t out_mult, int32_t out_shift); - -/** - * @brief u8 asym quant depthwise convolution 3x3 kernel_size and 1 stride optimized function - * @param[in] input pointer to input tensor data - * @param[in] kernel pointer to kernel tensor data - * @param[in] bias pointer to bias tensor data - * @param[in,out] output pointer to output tensor data - * @param[in] input_zero_point input zero_point - * @param[in] kernel_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right - * @return none. - * - */ -void csi_i805_dwconv2d_3x3_opt_u8(uint8_t *input, uint8_t *kernel, int32_t *bias, uint8_t *output, - int32_t input_zero_point, int32_t kernel_zero_point, - int32_t output_zero_point, int32_t dst_mult, int32_t dst_shift); - -/** - * @brief u8 asym quant fullyconnected optimized function - * @param[in] input_data pointer to input tensor data - * @param[in] weight_data pointer to weight tensor data - * @param[in] bias_data pointer to bias tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] in_nodes input nodes (weight cols) - * @param[in] out_nodes output nodes (weight rows) - * @param[in] input_zero_point input zero_point - * @param[in] weight_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] output_mult multiplier for s1 * s2 / s3 - * @param[in] output_shift output shift for s1 * s2 / s3. 
shift_right - * @return none. - * - */ -void csi_i805_fullyconnected_opt_u8(uint8_t *input_data, uint8_t *weight_data, int32_t *bias_data, - uint8_t *output_data, int32_t in_nodes, int32_t out_nodes, - int32_t input_zero_point, int32_t weight_zero_point, - int32_t output_zero_point, int32_t output_mult, - int32_t output_shift); - -/** - * @brief u8 asym quant generic maxpool optimized function - * @param[in] input_data pointer to input tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] input_h input height - * @param[in] input_w input width - * @param[in] input_ch input channel / output_channel - * @param[in] kernel_h kernel height - * @param[in] kernel_w kernel width - * @param[in] pad_h pad on height - * @param[in] pad_w pad on width - * @param[in] stride_h stride on height - * @param[in] stride_w stride on width - * @param[in] out_h output height - * @param[in] out_w output width - * @return none. - * bufferA size: 2*input_ch*kernel_h*kernel_w - */ -void csi_i805_maxpool2d_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t input_h, - int32_t input_w, int32_t input_ch, int32_t kernel_h, - int32_t kernel_w, int32_t pad_h, int32_t pad_w, int32_t stride_h, - int32_t stride_w, int32_t output_h, int32_t output_w); - -/** - * @brief u8 asym quant relu optimized function - * @param[in,out] data pointer to input/output tensor data, compute inplace - * @param[in] size input tensor size, tensor length - * @param[in] input_zeropoint input zero_point - * @param[in] out_multiplier multiplier for sacle_in / scale_out - * @param[in] out_shift shift left > 0 - * @return none. 
- * can be fused with conv/fc - */ -void csi_i805_relu_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, - int32_t out_multiplier, int32_t out_shift); - -/** - * @brief u8 asym quant relu6 optimized function - * @param[in,out] data pointer to input/output tensor data, compute inplace - * @param[in] size input tensor size, tensor length - * @param[in] input_zeropoint input zero_point - * @param[in] out_multiplier multiplier for sacle_in / scale_out - * @param[in] out_shift shift left > 0 - * @return none. - * can be fused with conv/fc - */ -void csi_i805_relu6_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, - int32_t out_multiplier, int32_t out_shift); - -/** - * @brief u8 asym quant clip optimized function - * @param[in] input_data pointer to input tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] size input tensor size, tensor length - * @param[in] clip_qmin clip min value(quant) - * @param[in] clip_qmax clip max value(quant) - * @param[in] input_zeropoint input zero_point - * @param[in] output_zeropoint output zero_point - * @param[in] out_multiplier multiplier for sacle_in / scale_out - * @param[in] out_shift shift left > 0 - * @return none. - * can be fused with conv/fc - */ -void csi_i805_clip_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, int32_t clip_min, - int32_t clip_max, int32_t input_zeropoint, int32_t output_zeropoint, - int32_t out_multiplier, int32_t out_shift); - -/** - * @brief u8 asym quant element add optimized function - * @param[in] input_0 pointer to input_0 tensor data - * @param[in] input_1 pointer to input_1 tensor data - * @param[in,out] output pointer to output tensor data - * @param[in] size input tensor size, tensor length, element size - * @param[in] input_0_zeroponit input_0 zero_point. 
Range: Range: -255 to 0 - * @param[in] input_0_mult multiplier for sacle_input_0 - * @param[in] input_0_shift input_0 shift - * @param[in] input_1_zeropoint input_1 zero_point. Range: Range: -255 to 0 - * @param[in] input_1_mult multiplier for sacle_input_1 - * @param[in] input_1_shift input_1 shift - * @param[in] output_zeropoint output zero_point - * @param[in] output_mult multiplier for scale_output - * @param[in] output_shift output shift - * @return none. - * - */ -void csi_i805_elementwise_add_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, - int32_t size, int32_t input_0_zeroponit, int32_t input_0_mult, - int32_t input_0_shift, int32_t input_1_zeropoint, - int32_t input_1_mult, int32_t input_1_shift, - int32_t output_zeropoint, int32_t output_mult, - int32_t output_shift); - -/** - * @brief u8 asym quant element mul optimized function - * @param[in] input_0 pointer to input_0 tensor data - * @param[in] input_1 pointer to input_1 tensor data - * @param[in,out] output pointer to output tensor data - * @param[in] size input tensor size, tensor length, element size - * @param[in] input_0_zeroponit input_0 zero_point - * @param[in] input_1_zeropoint input_1 zero_point - * @param[in] output_zeropoint output zero_point - * @param[in] output_mult multiplier for s1 * s2 / s3 - * @param[in] output_shift output shift for s1 * s2 / s3 - * @return none. - * - */ -void csi_i805_elementwise_mul_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, - int32_t size, int32_t input_0_zeroponit, - int32_t input_1_zeropoint, int32_t output_zeropoint, - int32_t output_mult, int32_t output_shift); - -/** - * @brief u8 asym quant softmax optimized function - * @param[in] input_data pointer to input tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] size tensor size - * @param[in] out_mult multiplier - * @param[in] out_shift output shift - * @return none. 
- * - */ -void csi_i805_softmax_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, - int32_t out_mult, int32_t out_shift); - -/** - * @brief u8 asym quant reshape optimized function - * @param[in] input_data pointer to input tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] size tensor size - * @return none. - * - */ -void csi_i805_reshape_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size); - -/** - * @brief u8 asym quant vec and matrix mul optimized function - * @param[in] lhs pointer to input tensor data - * @param[in] rhs pointer to weight tensor data - * @param[in] bias pointer to bias tensor data - * @param[in,out] dst pointer to output tensor data - * @param[in] rhs_col input nodes (weight cols) - * @param[in] rhs_row output nodes (weight rows) - * @param[in] lhs_zero_point input zero_point - * @param[in] rhs_zero_point weight zero_point - * @param[in] dst_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3 - * @return none. - * - */ -void csi_i805_vec_mat_mult_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, - int32_t rhs_col, int32_t rhs_row, int32_t lhs_zero_point, - int32_t rhs_zero_point, int32_t dst_zero_point, int32_t dst_mult, - int32_t dst_shift); - -/** - * @brief u8 asym quant matrix mul(A * B_trans) optimized function - * @param[in] lhs pointer to input tensor data - * @param[in] rhs pointer to weight tensor data - * @param[in] bias pointer to bias tensor data - * @param[in,out] dst pointer to output tensor data - * @param[in] lhs_row input row / m - * @param[in] lhs_col input col / k - * @param[in] rhs_row weight row / n - * @param[in] lhs_zero_point input zero_point - * @param[in] rhs_zero_point weight zero_point - * @param[in] dst_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3 - * @return none. 
- * - */ -void csi_i805_mat_mult_nt_t_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, - int32_t lhs_row, int32_t lhs_col, int32_t rhs_row, - int32_t lhs_zero_point, int32_t rhs_zero_point, - int32_t dst_zero_point, int32_t dst_mult, int32_t dst_shift); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_INCLUDE_XT800_CSI_I805_NNFUNCTION_H_ diff --git a/include/include_xt800/csi_instance.h b/include/include_xt800/csi_instance.h deleted file mode 100644 index 2fe3adcd..00000000 --- a/include/include_xt800/csi_instance.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (C) 2016-2020 T-head Limited. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/****************************************************************************** - * @file csi_instance.h - * @brief Some common define - * @version V1.0 - * @date Feb. 2020 - ******************************************************************************/ - -#ifndef INCLUDE_INCLUDE_XT800_CSI_INSTANCE_H_ -#define INCLUDE_INCLUDE_XT800_CSI_INSTANCE_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -/** - * @brief 8-bit fractional data type in 1.7 format. - */ -typedef int8_t q7_t; - -/** - * @brief 16-bit fractional data type in 1.15 format. - */ -typedef int16_t q15_t; - -/** - * @brief 32-bit fractional data type in 1.31 format. - */ -typedef int32_t q31_t; - -/** - * @brief 64-bit fractional data type in 1.63 format. 
- */ -typedef int64_t q63_t; - -/** - * @brief 32-bit floating-point type definition. - */ -typedef float float32_t; - -/** - * @brief 64-bit floating-point type definition. - */ -typedef double float64_t; - -/** - @brief definition to read/write two 16 bit values. - @deprecated - */ -#define __SIMD32_TYPE int32_t -#define __SIMD32(addr) (*(__SIMD32_TYPE **)&(addr)) - -/** - * @brief definition to pack two 16 bit values. - */ -#define __PKHBT(ARG1, ARG2, ARG3) \ - ((((int32_t)(ARG1) << 0) & (int32_t)0x0000FFFF) | \ - (((int32_t)(ARG2) << ARG3) & (int32_t)0xFFFF0000)) -#define __PKHTB(ARG1, ARG2, ARG3) \ - ((((int32_t)(ARG1) << 0) & (int32_t)0xFFFF0000) | \ - (((int32_t)(ARG2) >> ARG3) & (int32_t)0x0000FFFF)) - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_INCLUDE_XT800_CSI_INSTANCE_H_ diff --git a/include/include_xt800/csi_nn_tables.h b/include/include_xt800/csi_nn_tables.h deleted file mode 100644 index 77ce9101..00000000 --- a/include/include_xt800/csi_nn_tables.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csky_nn_tables.h - * Description: Extern declaration for NN tables - * -------------------------------------------------------------------- */ - -#ifndef INCLUDE_INCLUDE_XT800_CSI_NN_TABLES_H_ -#define INCLUDE_INCLUDE_XT800_CSI_NN_TABLES_H_ - -#include "csi_instance.h" - -/** -* @brief tables for various activation functions -* -*/ - -extern const q15_t sigmoidTable_q15[256]; -extern const q7_t sigmoidTable_q7[256]; - -extern const q7_t tanhTable_q7[256]; -extern const q15_t tanhTable_q15[256]; - - /** - * @brief 2-way tables for various activation functions - * - * 2-way table, H table for value larger than 1/4 - * L table for value smaller than 1/4, H table for remaining - * We have this only for the q15_t version. It does not make - * sense to have it for q7_t type - */ -extern const q15_t sigmoidHTable_q15[192]; -extern const q15_t sigmoidLTable_q15[128]; - -extern const q15_t sigmoidLTable_q15[128]; -extern const q15_t sigmoidHTable_q15[192]; - -#endif // INCLUDE_INCLUDE_XT800_CSI_NN_TABLES_H_ diff --git a/include/include_xt800/csi_nnsupportfunctions.h b/include/include_xt800/csi_nnsupportfunctions.h deleted file mode 100644 index 38a3b01f..00000000 --- a/include/include_xt800/csi_nnsupportfunctions.h +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nnsupportfunctions.h - * Description: Public header file of support functions for CSI NN Library - * - * -------------------------------------------------------------------- */ - -#ifndef INCLUDE_INCLUDE_XT800_CSI_NNSUPPORTFUNCTIONS_H_ -#define INCLUDE_INCLUDE_XT800_CSI_NNSUPPORTFUNCTIONS_H_ - -#include "csi_instance.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * @brief Union for SIMD access of Q31/Q15/Q7 types - */ -union csi_nnword { - q31_t word; /**< Q31 type */ - q15_t half_words[2]; /**< Q15 type */ - q7_t bytes[4]; /**< Q7 type */ -}; - -/** - * @defgroup nndata_convert Neural Network Data Conversion Functions - * - * Perform data type conversion in-between neural network operations - * - */ - -/** - * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - */ - -void csi_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize); - -/** - * @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. 
- * - */ - -void csi_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize); - -#if defined(CSI_MATH_DSP) - -/* - * @brief C custom defined SXTB16 - */ -uint32_t __SXTB16(uint32_t x) -{ - return ((uint32_t)(((((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF) | - ((((q31_t)x << 8) >> 8) & (q31_t)0xFFFF0000))); -} - -/** - \brief Rotate Right in unsigned value (32 bit) - \details Rotate Right (immediate) provides the value of the contents of a register rotated by a - variable number of bits. \param [in] op1 Value to rotate \param [in] op2 Number of Bits to - rotate \return Rotated value - */ -uint32_t __ROR(uint32_t op1, uint32_t op2) { return (op1 >> op2) | (op1 << (32U - op2)); } - -int32_t __SSAT_8(int32_t x) -{ - int32_t res = x; - if (x > 0x7f) { - res = 0x7f; - } else if (x < -128) { - res = -128; - } - - return res; -} - -/** - \details This function saturates a signed value. - \param [in] x Value to be saturated - \param [in] y Bit position to saturate to [1..32] - \return Saturated value. - */ -int32_t __SSAT(int32_t x, uint32_t y) -{ - int32_t posMax, negMin; - uint32_t i; - - posMax = 1; - - for (i = 0; i < (y - 1); i++) { - posMax = posMax * 2; - } - - if (x > 0) { - posMax = (posMax - 1); - - if (x > posMax) { - x = posMax; - } - - // x &= (posMax * 2 + 1); - } else { - negMin = -posMax; - - if (x < negMin) { - x = negMin; - } - - // x &= (posMax * 2 - 1); - } - - return (x); -} - -/** - \brief Unsigned Saturate - \details Saturates an unsigned value. - \param [in] value Value to be saturated - \param [in] sat Bit position to saturate to (0..31) - \return Saturated value - */ -uint32_t __USAT(uint32_t value, uint32_t sat) -{ - uint32_t result; - - if ((((0xFFFFFFFF >> sat) << sat) & value) != 0) { - result = 0xFFFFFFFF >> (32 - sat); - } else { - result = value; - } - - return (result); -} - -/** - \brief Dual 16-bit saturating subtract. 
- \details This function enables you to perform two 16-bit integer subtractions in parallel, - saturating the results to the 16-bit signed integer range -2^15 <= x <= 2^15 - 1. - \param [in] x first two 16-bit summands. - \param [in] y second two 16-bit summands. - \return the saturated subtraction of the low halfwords, in the low halfword of the return - value.\n the saturated subtraction of the high halfwords, in the high halfword of the return - value.\n The returned results are saturated to the 16-bit signed integer range -2^15 <= x <= 2^15 - - 1. \remark res[15:0] = val1[15:0] - val2[15:0] \n res[31:16] = val1[31:16] - - val2[31:16] - */ -uint32_t __QSUB16(uint32_t x, uint32_t y) -{ - int32_t r, s; - - r = __SSAT(((((int32_t)x << 16) >> 16) - (((int32_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; - s = __SSAT(((((int32_t)x) >> 16) - (((int32_t)y) >> 16)), 16) & (int32_t)0x0000FFFF; - - return ((uint32_t)((s << 16) | (r))); -} - -/** - \brief Quad 8-bit saturating subtract. - \details This function enables you to perform four 8-bit integer subtractions, - saturating the results to the 8-bit signed integer range -2^7 <= x <= 2^7 - 1. - \param [in] x first four 8-bit summands. - \param [in] y second four 8-bit summands. - \return the subtraction of the first byte of each operand in the first byte of the return - value.\n the subtraction of the second byte of each operand in the second byte of the return - value.\n the subtraction of the third byte of each operand in the third byte of the return - value.\n the subtraction of the fourth byte of each operand in the fourth byte of the return - value.\n The returned results are saturated to the 8-bit signed integer range -2^7 <= x <= 2^7 - - 1. 
\remark res[7:0] = val1[7:0] - val2[7:0] \n res[15:8] = val1[15:8] - val2[15:8] - \n res[23:16] = val1[23:16] - val2[23:16] \n res[31:24] = val1[31:24] - val2[31:24] - */ -uint32_t __QSUB8(uint32_t x, uint32_t y) -{ - int32_t r, s, t, u; - - r = __SSAT(((((int32_t)x << 24) >> 24) - (((int32_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF; - s = __SSAT(((((int32_t)x << 16) >> 24) - (((int32_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF; - t = __SSAT(((((int32_t)x << 8) >> 24) - (((int32_t)y << 8) >> 24)), 8) & (int32_t)0x000000FF; - u = __SSAT(((((int32_t)x) >> 24) - (((int32_t)y) >> 24)), 8) & (int32_t)0x000000FF; - - return ((uint32_t)((u << 24) | (t << 16) | (s << 8) | (r))); -} - -/** - \brief Dual 16-bit signed multiply with single 32-bit accumulator. - \details This function enables you to perform two signed 16-bit multiplications, - adding both results to a 32-bit accumulate operand. - \param [in] x first 16-bit operands for each multiplication. - \param [in] y second 16-bit operands for each multiplication. - \param [in] sum accumulate value. - \return the product of each multiplication added to the accumulate value, as a 32-bit - integer. \remark p1 = val1[15:0] * val2[15:0] \n p2 = val1[31:16] * val2[31:16] \n - res[31:0] = p1 + p2 + val3[31:0] - */ - -uint32_t __SMLAD(uint32_t x, uint32_t y, uint32_t sum) -{ - return ((uint32_t)(((((int32_t)x << 16) >> 16) * (((int32_t)y << 16) >> 16)) + - ((((int32_t)x) >> 16) * (((int32_t)y) >> 16)) + (((int32_t)sum)))); -} -/** - \brief Dual 16-bit saturating addition. - \details This function enables you to perform two 16-bit integer arithmetic additions in parallel, - saturating the results to the 16-bit signed integer range -2^15 <= x <= 2^15 - 1. - \param [in] x first two 16-bit summands. - \param [in] y second two 16-bit summands. 
- \return the saturated addition of the low halfwords, in the low halfword of the return - value.\n the saturated addition of the high halfwords, in the high halfword of the return value.\n - The returned results are saturated to the 16-bit signed integer range -2^15 <= x <= - 2^15 - 1. \remark res[15:0] = val1[15:0] + val2[15:0] \n res[31:16] = val1[31:16] + - val2[31:16] - */ -uint32_t __QADD16(uint32_t x, uint32_t y) -{ - int32_t r = 0, s = 0; - - r = __SSAT(((((int32_t)x << 16) >> 16) + (((int32_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; - s = __SSAT(((((int32_t)x) >> 16) + (((int32_t)y) >> 16)), 16) & (int32_t)0x0000FFFF; - - return ((uint32_t)((s << 16) | (r))); -} - -/** - * @brief read and expand one Q7 word into two Q15 words - */ - -void *read_and_pad(void *source, q31_t *out1, q31_t *out2) -{ - q31_t inA = *__SIMD32(source)++; - q31_t inAbuf1 = __SXTB16(__ROR(inA, 8)); - q31_t inAbuf2 = __SXTB16(inA); - -#ifndef CSKY_MATH_BIG_ENDIAN - *out2 = __PKHTB(inAbuf1, inAbuf2, 16); - *out1 = __PKHBT(inAbuf2, inAbuf1, 16); -#else - *out1 = __PKHTB(inAbuf1, inAbuf2, 16); - *out2 = __PKHBT(inAbuf2, inAbuf1, 16); -#endif - - return source; -} - -/** - * @brief read and expand one Q7 word into two Q15 words with reordering - */ - -void *read_and_pad_reordered(void *source, q31_t *out1, q31_t *out2) -{ - q31_t inA = *__SIMD32(source)++; -#ifndef CSKY_MATH_BIG_ENDIAN - *out2 = __SXTB16(__ROR(inA, 8)); - *out1 = __SXTB16(inA); -#else - *out1 = __SXTB16(__ROR(inA, 8)); - *out2 = __SXTB16(inA); -#endif - - return source; -} -#endif - -q7_t *csi_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, const q15_t *pInBuffer, - const uint16_t ch_im_out, const uint16_t numCol_A, - const uint16_t bias_shift, const uint16_t out_shift, - const q7_t *bias, q7_t *pOut); - -q7_t *csi_nn_mat_mult_kernel_q7_q15(const q7_t *pA, const q15_t *pInBuffer, - const uint16_t ch_im_out, const uint16_t numCol_A, - const uint16_t bias_shift, const uint16_t out_shift, - const q7_t *bias, q7_t 
*pOut); - -/** - * @brief A few utility functions used by pooling functions - * - */ - -void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale); - -void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length); - -/** - * @brief defition to adding rouding offset - */ -#ifndef CSKY_NN_TRUNCATE -#define NN_ROUND(out_shift) (0x1 << (out_shift - 1)) -#else -#define NN_ROUND(out_shift) 0 -#endif - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_INCLUDE_XT800_CSI_NNSUPPORTFUNCTIONS_H_ diff --git a/include/include_xt800/csky_dsp2_nnfunctions.h b/include/include_xt800/csky_dsp2_nnfunctions.h deleted file mode 100644 index e45e137f..00000000 --- a/include/include_xt800/csky_dsp2_nnfunctions.h +++ /dev/null @@ -1,745 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csky_dsp2_nnfunctions.h - * Description: Public header file for CSI NN Library - * - * -------------------------------------------------------------------- */ - -#ifndef INCLUDE_INCLUDE_XT800_CSKY_DSP2_NNFUNCTIONS_H_ -#define INCLUDE_INCLUDE_XT800_CSKY_DSP2_NNFUNCTIONS_H_ - -#ifdef __cplusplus -extern "C" -{ -#endif - -#include "csi_instance.h" -/** - * @brief Struct for specifying activation function types - * - */ -typedef enum -{ - CSKY_SIGMOID = 0, /**< Sigmoid activation function */ - CSKY_TANH = 1, /**< Tanh activation function */ -} csky_dsp2_nn_activation_type; - - /** - * @brief Basic Q7 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - */ - -void csky_dsp2_convolve_HWC_q7_basic(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Basic Q15 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - */ - -void csky_dsp2_convolve_HWC_q15_basic(const q15_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q15_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q15_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Fast Q7 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - */ - -void csky_dsp2_convolve_HWC_q7_fast(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Fast Q7 convolution function (non-sqaure shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding size x - * @param[in] padding_y padding size y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - */ - -void csky_dsp2_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel_x, - const uint16_t dim_kernel_y, - const uint16_t padding_x, - const uint16_t padding_y, - const uint16_t stride_x, - const uint16_t stride_y, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA); - - /** - * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding size x - * @param[in] padding_y padding size y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return none. - * - * This function implement convolution with 1x1 kernel size (i.e., dim_kernel_x=1 - * and dim_kernel_y=1). It can be used for - * second half of MobileNets after depthwise separable convolution. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - */ -void csky_dsp2_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA); - - /** - * @brief Q7 version of convolution for RGB image - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. - * - * This kernel is written exclusively for convolution with ch_im_in - * equals 3. This applies on the first layer of CNNs which has input - * image with RGB format. 
- */ - -void csky_dsp2_convolve_HWC_q7_RGB(const q7_t * Im_in, - const uint16_t dim_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Fast Q15 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 2 - * ch_im_out is multiple of 2 - */ - -void csky_dsp2_convolve_HWC_q15_fast(const q15_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q15_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q15_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Q7 depthwise separable convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 2 - * ch_im_out is multiple of 2 - */ - -void csky_dsp2_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Q7 depthwise separable convolution function (non-square shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding sizes x - * @param[in] padding_y padding sizes y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 2 - * ch_im_out is multiple of 2 - */ -void csky_dsp2_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel_x, - const uint16_t dim_kernel_y, - const uint16_t padding_x, - const uint16_t padding_y, - const uint16_t stride_x, - const uint16_t stride_y, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA); - - - /** - * @brief Q7 basic fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. - */ - -void csky_dsp2_fully_connected_q7(const q7_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut); - - /** - * @brief Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @param[in,out] vec_buffer pointer to buffer space for input - * @return none. 
- * - */ - -void csky_dsp2_fully_connected_q7_opt(const q7_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut, - q15_t * vec_buffer); - - /** - * @brief Q15 basic fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. - * - */ - -void csky_dsp2_fully_connected_q15(const q15_t * pV, - const q15_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut); - - /** - * @brief Q15 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. 
- * - */ - -void csky_dsp2_fully_connected_q15_opt(const q15_t * pV, - const q15_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut); - - /** - * @brief Mixed Q15-Q7 fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. - * - */ - -void csky_dsp2_fully_connected_mat_q7_vec_q15(const q15_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut); - - /** - * @brief Mixed Q15-Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. - * - */ - -void csky_dsp2_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut); - -/** - * @brief Matrix-Multiplication Kernels for Convolution - * - * These functions are used within convolution layer functions for - * matrix multiplication. - * - * The implementation is similar to CSI-DSP csky_dsp2_mat_mult functions - * with one Q7 and one Q15 operands. 
The Q15 operand is the im2col - * output which is always with 2 columns. - * - */ - - /** - * @brief Matrix-multiplication function for convolution - * @param[in] pA pointer to operand A - * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - * @param[in] ch_im_out numRow of A - * @param[in] numCol_A numCol of A - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias the bias - * @param[in,out] pOut pointer to output - * @return The function returns the incremented output pointer - */ - -q7_t *csky_dsp2_nn_mat_mult_kernel_q7_q15(const q7_t * pA, - const q15_t * pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut); - - /** - * @brief Matrix-multiplication function for convolution with reordered columns - * @param[in] pA pointer to operand A - * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - * @param[in] ch_im_out numRow of A - * @param[in] numCol_A numCol of A - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias the bias - * @param[in,out] pOut pointer to output - * @return The function returns the incremented output pointer - */ - -q7_t *csky_dsp2_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA, - const q15_t * pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut); - -#ifdef __cplusplus -} -#endif - -/* - * Other functions - * These layers are typically not timing critical - * Basic implementation is supported here - */ - -#ifdef __cplusplus -extern "C" -{ -#endif - - - /** - * @brief Q7 RELU function - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @return none. 
- */ - -void csky_dsp2_relu_q7(q7_t * data, uint16_t size); - - /** - * @brief Q15 RELU function - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @return none. - */ - -void csky_dsp2_relu_q15(q15_t * data, uint16_t size); - - /** - * @brief Q7 neural network activation function using direct table look-up - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 - * @param[in] type type of activation functions - * @return none. - */ - -void csky_dsp2_nn_activations_direct_q7(q7_t * data, uint16_t size, - uint16_t int_width, - csky_dsp2_nn_activation_type type); - - /** - * @brief Q15 neural network activation function using direct table look-up - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 - * @param[in] type type of activation functions - * @return none. - */ - -void csky_dsp2_nn_activations_direct_q15(q15_t * data, uint16_t size, - uint16_t int_width, - csky_dsp2_nn_activation_type type); - - /** - * @brief Q7 max pooling function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @param[in,out] Im_out pointer to output tensor - * @return none. 
- * - */ - -void csky_dsp2_maxpool2d_q7_HWC(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out); - - /** - * @brief Q7 average pooling function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @param[in,out] Im_out pointer to output tensor - * @return none. - * - */ - -void csky_dsp2_avepool_q7_HWC(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out); - - - /** - * @brief Q7 softmax function - * @param[in] vec_in pointer to input vector - * @param[in] dim_vec input vector dimention - * @param[out] p_out pointer to output vector - * @return none. - * - */ - -void csky_dsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); - - /** - * @brief Q15 softmax function - * @param[in] vec_in pointer to input vector - * @param[in] dim_vec input vector dimention - * @param[out] p_out pointer to output vector - * @return none. - * - */ - -void csky_dsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, - q15_t *p_out); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_INCLUDE_XT800_CSKY_DSP2_NNFUNCTIONS_H_ diff --git a/include/csi_c860.h b/include/shl_c860.h similarity index 77% rename from include/csi_c860.h rename to include/shl_c860.h index 87310f63..5a807f99 100644 --- a/include/csi_c860.h +++ b/include/shl_c860.h @@ -16,21 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #ifndef INCLUDE_CSI_C860_H_ #define INCLUDE_CSI_C860_H_ -#include -#include -#include -#include +#include "csi_nn.h" +#include "shl_ref.h" -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_utils.h" - -void csi_dequantize_f32_c860(uint8_t *input, float *output, int32_t offset, int32_t multiplier, +void shl_c860_dequantize_f32(uint8_t *input, float *output, int32_t offset, int32_t multiplier, int32_t shift, int32_t length); #endif // INCLUDE_CSI_C860_H_ diff --git a/include/shl_c906.h b/include/shl_c906.h new file mode 100644 index 00000000..49300088 --- /dev/null +++ b/include/shl_c906.h @@ -0,0 +1,519 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_C906_H_ +#define INCLUDE_SHL_C906_H_ + +#include "csi_nn.h" +#include "shl_gref.h" +#include "shl_ref.h" +#include "shl_thead_rvv.h" + +/************************** f32 func declaration ***************************/ +int shl_c906_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_c906_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_sub_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_mul_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_minimum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); + +int shl_c906_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_c906_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +int shl_c906_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); + +int shl_c906_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_pad_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int 
shl_c906_prelu_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_c906_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_relu1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_relu6_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int shl_c906_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_depthwise_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_depthwise_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_global_avgpool2d_f32(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_div_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +/* pack */ +void shl_c906_reorder_kernel(float *a, float *sa, int m, int k, int ldx); + +void shl_c906_reorder_input(float *b, float *sb, int k, int n, int ldx); + +void shl_c906_reorder_input_1(float *b, float *sb, int k, int n, int ldx); + +/* gemm */ +void shl_c906_sgemm_kernel_f32(float *dst, const float *sa, const float *sb, int m, int k, int n, + int ldc, float *bias, bool fuse_relu); + +/* kernel transform */ +void shl_c906_conv1x1s1_sgemm_transform_kernel(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +void shl_c906_conv_im2col_sgemm_transform_kernel(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s1_winograd23_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd43_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd64_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd64_transform_kernel_1(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +/* convolution optimization */ +int shl_c906_conv1x1s1_sgemm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv1x1s1_sgemm_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor 
*bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv_im2col_sgemm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv_im2col_sgemm_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd23(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd43(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd64(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd64_1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd64_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd43_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/* depthwise convolution optimization */ 
+int shl_c906_dwconv3x3s1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv5x5s1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv5x5s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s1_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/* depthwise convolution fuse relu */ +int shl_c906_dwconv3x3s1_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv5x5s1_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv5x5s2_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s1_pack4_fuse_relu(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_pack4_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv2d_s1_pad0_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/************************** fp16 func declaration ***************************/ +int shl_c906_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_sub_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_minimum_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_pad_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int shl_c906_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_relu1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_prelu_fp16(struct csinn_tensor *input, struct 
csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_c906_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_abs_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_c906_clip_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_c906_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +int shl_c906_split_fp16(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); + +int shl_c906_fullyconnected_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_pack8_fp16_1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_pack16_output16_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *weights, + struct csinn_tensor *bias, + struct csinn_fc_params *params); + +void shl_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx); + +void shl_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx); + +/* pack fp16 */ +void shl_c906_reorder_kernel_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); +void shl_c906_reorder_input_fp16(__fp16 
*b, __fp16 *sb, int k, int n, int ldx); + +void shl_c906_reorder_input_fp16_1(__fp16 *b, __fp16 *sb, int k, int n, int ldx); + +void shl_c906_reorder_matrix_z8_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx); +void shl_c906_reorder_matrix_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx); + +/* gemm fp16 */ +void shl_c906_sgemm_kernel_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, + int n, int ldc, __fp16 *bias); +void shl_c906_sgemm_kernel_fp16_1(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, + int n, int ldc, __fp16 *bias); + +/* gemv fp16 */ +void shl_c906_gemv_pack8_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, + int ldc, __fp16 *bias); +void shl_c906_gemv_pack16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, + int ldc, __fp16 *bias); + +void shl_c906_gemv_trans_pack8_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, + int ldc, __fp16 *bias); +void shl_c906_gemv_trans_pack16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, + int ldc, __fp16 *bias); + +/* kernel transform fp16 */ +void shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +/* convolution optimization fp16 */ +int shl_c906_conv1x1s1_sgemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv1x1s1_batch_gemv_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct 
csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv_im2col_sgemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd43_pack8_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd64_pack8_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/* depthwise convolution optimization for fp16*/ +int shl_c906_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/* utils */ +void shl_c906_memcpy(void *dst, const void *src, size_t n); + +void shl_c906_pad_input(const float *input, float *input_padded, int inc, int 
inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left); + +void shl_c906_crop_output(float *output_trans, float *output, int out_c, int out_h, int out_w, + int wino_h, int wino_w); + +void shl_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left); + +void shl_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, int out_h, + int out_w, int wino_h, int wino_w); + +/* asr related functions */ +int shl_c906_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_c906_cache_matmul_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_c906_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_c906_layer_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); + +int shl_c906_reshape_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_c906_transpose_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +int shl_c906_gather_fp16(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); + +int shl_c906_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_c906_cache_conv1d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, +
struct csinn_cache_conv1d_params *params); + +int shl_c906_lrn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); + +void asr_buffer_init_c906(struct csinn_asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth); /* NOTE(review): `data_lenth` looks like a misspelling of `data_length`; confirm against the definition before renaming */ + +void *asr_buffer_insert_c906_front(struct csinn_asr_buffer_t *buffer, void *input, size_t len); + +void *asr_buffer_insert_c906_back(struct csinn_asr_buffer_t *buffer, void *input, size_t len); + +void *asr_buffer_get_buffer_c906(struct csinn_asr_buffer_t *buffer); + +void asr_buffer_reset_c906(struct csinn_asr_buffer_t *buffer); + +void shl_c906_reset_fcsr(); /* NOTE(review): an empty () leaves the parameters unspecified before C23; (void) is the strict prototype form. The same applies to the other ()-declared functions in this header */ +int shl_c906_get_fcsr(); + +/* hardware performance: shl_c906_hpm is the value type returned by shl_c906_get_hw_perf(). NOTE(review): the fields are size_t while the per-counter getters return uint64_t, and the cb_miss and cb_inst getters have no matching field here -- confirm both are intended */ +struct shl_c906_hpm { + size_t inst; + size_t cycle; + size_t l1_icache_access; + size_t l1_icache_miss; + size_t store_inst; + size_t l1_dcache_raccess; + size_t l1_dcache_rmiss; + size_t l1_dcache_waccess; + size_t l1_dcache_wmiss; +}; + +uint64_t shl_c906_get_inst(); +uint64_t shl_c906_get_cycle(); +uint64_t shl_c906_get_l1_icache_access(); +uint64_t shl_c906_get_l1_icache_miss(); +uint64_t shl_c906_get_cb_miss(); +uint64_t shl_c906_get_cb_inst(); +uint64_t shl_c906_get_store_inst(); +uint64_t shl_c906_get_l1_dcache_raccess(); +uint64_t shl_c906_get_l1_dcache_rmiss(); +uint64_t shl_c906_get_l1_dcache_waccess(); +uint64_t shl_c906_get_l1_dcache_wmiss(); + +struct shl_c906_hpm shl_c906_get_hw_perf(); + +int shl_c906_sum_stride_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +void shl_c906_u8_to_f32(const uint8_t *input, float *output, int32_t offset, float *scale, + uint32_t length); + +struct csinn_callback *shl_cb_map_c906(int op, int dtype); +int shl_c906_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, + void *exec); +int shl_c906_reg_op_est(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *est); +#endif // INCLUDE_SHL_C906_H_ diff --git a/include/shl_c908.h b/include/shl_c908.h new file mode
100644 index 00000000..fe8c2a1c --- /dev/null +++ b/include/shl_c908.h @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_C908_H_ +#define INCLUDE_SHL_C908_H_ + +#include "csi_nn.h" +#include "shl_gref.h" +#include "shl_ref.h" +#include "shl_thead_rvv.h" + +/*********************************** initialization ***********************************/ +int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c908_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int 
shl_c908_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c908_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_avgpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c908_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_maxpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c908_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +/************************************ convolution *********************************/ +/*********************************** im2col + gemm ********************************/ +void 
shl_c908_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct 
csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/******************************** conv2d1x1s1 + gemm ******************************/ +void 
shl_c908_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params 
*params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/*********************************** winograd ***********************************/ +void shl_c908_wg_b6f3s1_trans_kernel_pack8_fp32(struct csinn_tensor 
*src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b6f3s1_trans_kernel_pack8_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b6f3s1_trans_kernel_pack16_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +void shl_c908_wg_b4f3s1_trans_kernel_pack8_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b4f3s1_trans_kernel_pack8_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b4f3s1_trans_kernel_pack16_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b4f3s1_trans_kernel_pack8_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_c908_wg_b6f3s1_pack8_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b6f3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b6f3s1_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c908_wg_b4f3s1_pack8_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b4f3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b4f3s1_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b4f3s1_pack8_int8(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_ncxhwx_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/*********************************** gemm ncxhwx kernel ***********************************/ +void shl_c908_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, + const float *bias, int m, int k, int n, bool fuse_relu); +void shl_c908_ncxhwx_gemm_12xpack2n_fp16(__fp16 
*dst, const __fp16 *sa, const __fp16 *sb, + const __fp16 *bias, int m, int k, int n, bool fuse_relu); + +void shl_c908_ncxhwx_gemm_12xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); + +void shl_c908_ncxhwx_gemm_12xpackn_int16(int32_t *dst, const int16_t *sa, const int16_t *sb, int m, + int k, int n); +/*********************************** gemm kernel ***********************************/ +void shl_c908_reorder_kernel_n8_fp32(float *src, float *dst, int m, int k, int ldc); +void shl_c908_reorder_input_z12_fp32(float *src, float *dst, int k, int n, int ldc); +void shl_c908_gemm_8x12_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc); +void shl_c908_reorder_input_z8_fp32(float *src, float *dst, int k, int n, int ldc); +void shl_c908_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, int k, + int n, int ldc); + +void shl_c908_reorder_kernel_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldc); +void shl_c908_reorder_input_z24_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldc); +void shl_c908_gemm_8x24_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc); +void shl_c908_reorder_input_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldc); +void shl_c908_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc); + +void shl_c908_reorder_kernel_n8_int8(int8_t *src, int8_t *dst, int m, int k, int ldc); +void shl_c908_reorder_input_z8_int8(int8_t *src, int8_t *dst, int k, int n, int ldc); +void shl_c908_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc, int32_t out_zp, int32_t *mult, int32_t *shift); +void shl_c908_reorder_input_z12_int8(int8_t *src, int8_t *dst, int k, int n, int ldc); + +/*********************************** VLEN = 
256 ***********************************/ +/*********************************** VLEN = 256 ***********************************/ +/*********************************** VLEN = 256 ***********************************/ + +void shl_c908_reorder_input_z16_fp32_v256(float *src, float *dst, int k, int n, int ldc); +void shl_c908_gemm_8x16_fp32_v256(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc); + +void shl_c908_reorder_input_z32_fp16_v256(__fp16 *src, __fp16 *dst, int k, int n, int ldc); +void shl_c908_gemm_8x32_fp16_v256(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int m, int k, int n, int ldc); + +void shl_c908_reorder_input_z16_int8_v256(int8_t *src, int8_t *dst, int k, int n, int ldc); +void shl_c908_gemm_8x16_int8_v256(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, + int m, int k, int n, int ldc, int32_t out_zp, int32_t *mult, + int32_t *shift); + +#endif // INCLUDE_SHL_C908_H_ diff --git a/include/shl_debug.h b/include/shl_debug.h new file mode 100644 index 00000000..0d356f7a --- /dev/null +++ b/include/shl_debug.h @@ -0,0 +1,293 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ +#ifndef INCLUDE_SHL_DEBUG_H_ +#define INCLUDE_SHL_DEBUG_H_ +#include "csi_nn.h" +#include "shl_node.h" + +enum shl_debug_enum { /* debug level constants; only the first has an explicit value, the rest count up by one */ + SHL_DEBUG_LEVEL_DEBUG = -2, + SHL_DEBUG_LEVEL_INFO, /* -1 */ + SHL_DEBUG_LEVEL_WARNING, /* 0 */ + SHL_DEBUG_LEVEL_ERROR, /* 1 */ + SHL_DEBUG_LEVEL_FATAL, /* 2 */ +}; + +#ifdef SHL_DEBUG /* SHL_DEBUG defined: SHL_DEBUG_CALL(x) expands to x and the logging functions are only declared here */ +#define SHL_DEBUG_CALL(func) func +void shl_debug_debug(const char *format, ...); +void shl_debug_info(const char *format, ...); +void shl_debug_warning(const char *format, ...); +void shl_debug_error(const char *format, ...); +void shl_debug_fatal(const char *format, ...); +int shl_debug_callback_unset(); +#else /* SHL_DEBUG not defined: SHL_DEBUG_CALL(x) expands to nothing and the logging functions become empty inline stubs */ +#define SHL_DEBUG_CALL(func) +inline void shl_debug_debug(const char *format, ...) {} /* NOTE(review): plain `inline` without `static` in a header; under C99 inline rules this provides no external definition, so a call that is not inlined needs one from another translation unit -- confirm one exists or consider `static inline`. Applies to all six stubs */ +inline void shl_debug_info(const char *format, ...) {} +inline void shl_debug_warning(const char *format, ...) {} +inline void shl_debug_error(const char *format, ...) {} +inline void shl_debug_fatal(const char *format, ...) {} +inline int shl_debug_callback_unset() { return CSINN_CALLBACK_UNSET; } /* the stub always returns CSINN_CALLBACK_UNSET */ +#endif + +int shl_debug_get_level(); +void shl_debug_set_level(int level); +int shl_benchmark_layer(struct shl_node *node, uint64_t start_time, uint64_t end_time, + int layer_idx); + +int shl_conv2d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, const char *name); + +int shl_conv1d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, const char *name); + +int shl_conv3d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params, const char *name); + +int shl_fsmn_debug_info(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor
*frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params, const char *name); + +int shl_siso_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, const char *name); + +int shl_diso_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + const char *name); + +int shl_relu_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, const char *name); + +int shl_arange_debug_info(struct csinn_tensor *output, struct csinn_arange_params *params, + const char *name); + +int shl_pool_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, const char *name); + +int shl_pad_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, const char *name); + +int shl_crop_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params, const char *name); + +int shl_roi_pool_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params, + const char *name); + +int shl_bn_debug_info(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params, const char *name); + +int shl_batch_to_space_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params, const char *name); + +int shl_batch_to_space_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params, + const char *name); + +int shl_cache_matmul_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct 
csinn_cache_matmul_params *params, const char *name); + +int shl_cache_conv1d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params, const char *name); + +int shl_space_to_depth_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, const char *name); + +int shl_depth_to_space_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, const char *name); + +int shl_space_to_batch_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params, const char *name); + +int shl_space_to_batch_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params, + const char *name); + +int shl_broadcast_to_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params, const char *name); + +int shl_reduce_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, const char *name); + +int shl_clip_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, const char *name); + +int shl_col2im_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_col2im_params *params, const char *name); + +int shl_concat_debug_info(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, const char *name); + +int shl_cumprod_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params, const char *name); + +int shl_cumsum_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params, const char *name); + +int shl_expand_dims_debug_info(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_expand_dims_params *params, const char *name); + +int shl_flatten_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, const char *name); + +int shl_fullyconnected_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, const char *name); + +int shl_gather_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params, + const char *name); + +int shl_gather_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params, + const char *name); + +int shl_hard_sigmoid_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, const char *name); + +int shl_im2col_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params, const char *name); + +int shl_l2n_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params, const char *name); + +int shl_layer_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params, const char *name); + +int shl_softmax_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, const char *name); + +int shl_lrn_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, const char *name); + +int shl_matmul_debug_info(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + const char *name); + +int shl_ndarray_size_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_ndarray_size_params *params, const char *name); + +int shl_nms_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_non_max_suppression_params *params, + const char *name); + +int shl_one_hot_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params, const char *name); + +int shl_prelu_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params, + const char *name); + +int shl_proposal_debug_info(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params, const char *name); + +int shl_psroipooling_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, + struct csinn_psroipooling_params *params, const char *name); + +int shl_reorg_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params, const char *name); + +int shl_reshape_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, const char *name); + +int shl_resize_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, const char *name); + +int shl_reverse_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params, const char *name); + +int shl_roi_align_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params, + const char *name); + +int shl_scatter_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params, const char *name); + +int shl_segment_debug_info(struct csinn_tensor *input0, struct csinn_tensor 
*input1, + struct csinn_tensor *output, struct csinn_segment_params *params, + const char *name); + +int shl_select_debug_info(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params, const char *name); + +int shl_sequence_mask_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_sequence_mask_params *params, const char *name); + +int shl_shape_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params, const char *name); + +int shl_shuffle_channel_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params, const char *name); + +int shl_sigmoid_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, const char *name); + +int shl_slice_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params, const char *name); + +int shl_split_debug_info(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, const char *name); + +int shl_squeeze_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, const char *name); + +int shl_stack_debug_info(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params, const char *name); + +int shl_strided_slice_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params, const char *name); + +int shl_tile_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params, const char *name); + +int shl_topk_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_topk_params *params, + const char *name); + +int 
shl_transpose_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, const char *name); + +int shl_unpooling_debug_info(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params, + const char *name); + +int shl_unstack_debug_info(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params, const char *name); + +int shl_where_debug_info(struct csinn_tensor *condition, struct csinn_tensor *x, + struct csinn_tensor *y, struct csinn_tensor *output, + struct csinn_where_params *params, const char *name); + +#endif // INCLUDE_SHL_DEBUG_H_ diff --git a/include/shl_e804.h b/include/shl_e804.h new file mode 100644 index 00000000..624b88d1 --- /dev/null +++ b/include/shl_e804.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_E804_H_ +#define INCLUDE_SHL_E804_H_ + +#include <math.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "csi_nn.h" +#include "shl_ref.h" + +int shl_e804_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_e804_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_e804_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_e804_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_e804_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_e804_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_e804_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_e804_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_e804_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_e804_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_e804_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_e804_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int
shl_e804_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_e804_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_e804_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +#endif // INCLUDE_SHL_E804_H_ diff --git a/include/shl_gref.h b/include/shl_gref.h new file mode 100644 index 00000000..08f1443d --- /dev/null +++ b/include/shl_gref.h @@ -0,0 +1,604 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_GREF_H_ +#define INCLUDE_SHL_GREF_H_ +#include "csi_nn.h" +#include "shl_node.h" +#include "shl_utils.h" + +int shl_gref_acos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_acosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_cos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_cosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_asin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_asinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_tan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_atan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_atanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_threshold_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_trunc(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_topk(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); + +int shl_gref_cumprod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); + +int shl_gref_cumsum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); + +int shl_gref_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params 
*params); + +int shl_gref_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_conv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_gref_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_deconv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_gref_depthwise_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_depthwise_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_depthwise_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + 
struct csinn_conv2d_params *params); + +int shl_gref_fsmn(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params); + +int shl_gref_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_gref_fullyconnected_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_gref_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_maxpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_global_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_global_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_global_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_l2pool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_pool_with_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_maxpool2d_locat(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, 
struct csinn_diso_params *params); + +int shl_gref_non_max_suppression(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params); + +int shl_gref_unpooling(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params); + +int shl_gref_negative(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_floor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_ceil(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_clip(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_abs(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_exp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_sin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_sinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_sqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_rsqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_square(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_gref_softsign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int 
shl_gref_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params); + +int shl_gref_elu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_relun(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_roi_align(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params); + +int shl_gref_roipool(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params); + +int shl_gref_round(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_softrelu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_prelu(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_gref_softplus(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_gref_batch_normalization(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); + 
+int shl_gref_l2_normalization(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); + +int shl_gref_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); + +int shl_gref_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_gref_add(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_div(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_floor_divide(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_floor_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_power(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_greater(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_less(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + 
+int shl_gref_log_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_gref_log(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_log1p(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_not_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_reduce_logsumexp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_greater_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_less_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_select(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params); + 
+int shl_gref_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int shl_gref_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); + +int shl_gref_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +int shl_gref_proposal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); + +int shl_gref_psroipooling(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params); + +int shl_gref_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +int shl_gref_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_gref_shape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); + +int shl_gref_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); + +int shl_gref_expand_dims(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); + +int shl_gref_expm1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_reverse(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); + +int shl_gref_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); + +int shl_gref_crop(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params); + +int shl_gref_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); + +int shl_gref_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); + +int shl_gref_stack(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params); + +int shl_gref_tile(struct csinn_tensor *inputs, struct csinn_tensor *output, + struct csinn_tile_params *params); + +int shl_gref_arange(struct csinn_tensor *output, struct csinn_arange_params *params); + +int shl_gref_where(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params); + +int shl_gref_unstack(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params); + +int shl_gref_gather(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); + +int shl_gref_gather_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); + +int shl_gref_hard_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_gref_isnan_bool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_logical_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_logical_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_logical_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_logical_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + 
struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params); + +int shl_gref_segment_max(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_segment_mean(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_segment_min(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_segment_prod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_segment_sum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_scatter_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); + +int shl_gref_shuffle_channel(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); + +int shl_gref_sign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_ndarray_size(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_gref_space_to_batch(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); + +int shl_gref_batch_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); + +int shl_gref_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params); + +int shl_gref_space_to_depth(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); + +int shl_gref_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); + +int shl_gref_broadcast_to(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); + +int shl_gref_one_hot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params); + +int shl_gref_sequence_mask(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params); + +int shl_gref_im2col(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); + +int shl_gref_col2im(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params); + +int shl_gref_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_argmin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_all(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_any(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reorg(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_reorg_params *params); + +int shl_gref_erf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_yuv_rgb_scale(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_layer_norm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); + +int shl_gref_cache_matmul(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_gref_cache_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_gref_data_convert(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +struct shl_ref_graph { + struct shl_node **input; + struct shl_node **output; + int input_num; + int output_num; + struct shl_node **layer; + int layer_size; + int layer_index; +}; + +struct shl_gref_target_data { + struct shl_ref_graph *graph; +}; + +struct shl_ref_graph *shl_gref_get_graph(struct csinn_session *sess); +int shl_gref_graph_insert(struct shl_node *node, struct shl_ref_graph *graph); +void shl_gref_post_dfs(struct shl_ref_graph *graph, + void (*fvisit)(struct shl_ref_graph *, struct shl_node *)); +int shl_gref_is_root_node(struct shl_ref_graph *graph, struct shl_node *node); +struct shl_node *shl_gref_get_input_subgraph(struct shl_ref_graph *graph, struct shl_node *node, + int index); +void shl_gref_reset_graph_visit(struct shl_ref_graph *graph); +void shl_gref_update_input_output(struct shl_ref_graph *graph, int index); +int 
shl_gref_siso_op(struct csinn_tensor *input, struct csinn_tensor *output, int op, void *params); +int shl_gref_diso_op(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, int op, void *params); +int shl_gref_sidcso_op(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *const0, struct csinn_tensor *const1, int op, + void *params); +void shl_gref_set_tensor(struct csinn_tensor *tensor, struct csinn_session *sess); +void shl_gref_set_const_tensor(struct csinn_tensor *tensor, struct csinn_session *sess); +int shl_gref_get_tensor(int index, struct csinn_tensor *ret, struct csinn_session *sess); +void shl_gref_nbg(struct csinn_tensor **input, struct csinn_tensor **output, uint32_t inputs_count, + uint32_t outputs_count, const char *url); + +void shl_subgraph_alloc(struct shl_node *node, struct shl_ref_graph *ograph, + struct shl_ref_graph *ggraph); +int shl_subgraph_setup(struct shl_node *n); +int shl_subgraph_deinit(struct shl_node *n); +int shl_subgraph_run_init(struct shl_node *n); +int shl_subgraph_run(struct shl_node *n); +int shl_subgraph_run_deinit(struct shl_node *n); + +struct shl_ref_graph *shl_subgraph_generate(struct shl_ref_graph *ograph); +struct shl_ref_graph *shl_subgraph_rebuild(struct shl_ref_graph *subgraph); +struct shl_ref_graph *shl_subgraph_topology_sort(struct shl_ref_graph *graph); +void shl_subgraph_fvisit_fuse(struct shl_ref_graph *graph, struct shl_node *node); +void shl_subgraph_fvisit_print(struct shl_ref_graph *graph, struct shl_node *node); +int shl_subgraph_get_device(struct shl_node *node); +void *shl_gref_runtime_callback(int api); +#endif // INCLUDE_SHL_GREF_H_ diff --git a/include/shl_i805.h b/include/shl_i805.h new file mode 100644 index 00000000..f399fced --- /dev/null +++ b/include/shl_i805.h @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_I805_H_ +#define INCLUDE_SHL_I805_H_ + +#include +#include +#include +#include + +#include "csi_nn.h" +#include "shl_ref.h" + +int shl_i805_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int 
shl_i805_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_i805_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_i805_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_i805_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_i805_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_i805_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +/*********************** u8 asym quant opt func *********************************/ + +int shl_i805_add_init_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_i805_add_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_i805_clip_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_i805_clip_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_i805_conv2d_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_conv2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int 
shl_i805_depthwise_conv2d_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_depthwise_conv2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_fullyconnected_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_fullyconnected_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_maxpool2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_mul_init_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_i805_mul_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_i805_relu_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_relu_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_relu6_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_relu6_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_reshape_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +#endif // INCLUDE_SHL_I805_H_ diff --git a/include/shl_memory.h b/include/shl_memory.h new file mode 100644 index 00000000..c0fee308 --- /dev/null +++ b/include/shl_memory.h @@ -0,0 +1,33 @@ +/* + * Copyright (C) 
2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ +#ifndef INCLUDE_SHL_MEMORY_H_ +#define INCLUDE_SHL_MEMORY_H_ + +#include +#include + +void shl_mem_print_map(); +void *shl_mem_alloc(int64_t size); +void *shl_mem_alloc_aligned(int64_t size, int aligned_bytes); +void *shl_mem_calloc(size_t nmemb, size_t size); +void *shl_mem_realloc(void *ptr, size_t size); +void shl_mem_free(void *ptr); + +#endif // INCLUDE_SHL_MEMORY_H_ diff --git a/include/csi_node.h b/include/shl_node.h similarity index 50% rename from include/csi_node.h rename to include/shl_node.h index f48790ba..11bf0fc6 100644 --- a/include/csi_node.h +++ b/include/shl_node.h @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#ifndef INCLUDE_CSI_NODE_H_ -#define INCLUDE_CSI_NODE_H_ +#ifndef INCLUDE_SHL_NODE_H_ +#define INCLUDE_SHL_NODE_H_ -struct csi_node { +struct shl_node { int type; - struct csi_node **in; - struct csi_node **out; + struct shl_node **in; + struct shl_node **out; int subgraph_idx; int in_num; int out_num; @@ -38,18 +38,18 @@ struct csi_node { }; /* node */ -struct csi_node *csi_node_alloc(int node_type, char *name, int in_num, int out_num, void *data); -struct csi_node *csi_node_var_alloc(char *name, void *data); -struct csi_node *csi_node_const_var_alloc(char *name, void *data); -int csi_node_free(struct csi_node *node); -int csi_node_add_in(struct csi_node *node, struct csi_node *in, int index); -int csi_node_add_out(struct csi_node *node, struct csi_node *out, int index); -int csi_node_get_in_number(struct csi_node *node); -int csi_node_get_out_number(struct csi_node *node); -int csi_node_get_non_const_in_number(struct csi_node *node); -struct csi_node *csi_node_get_in(struct csi_node *node, int index); -struct csi_node *csi_node_get_out(struct csi_node *node, int index); -int csi_node_restrict_map_insert(int value, struct csi_node *node); -int csi_node_find(struct csi_node **list, int len, struct csi_node *node); +struct shl_node *shl_node_alloc(int node_type, char *name, int in_num, int out_num, void *data); +struct shl_node *shl_node_var_alloc(char *name, void *data); +struct shl_node *shl_node_const_var_alloc(char *name, void *data); +int shl_node_free(struct shl_node *node); +int shl_node_add_in(struct shl_node *node, struct shl_node *in, int index); +int shl_node_add_out(struct shl_node *node, struct shl_node *out, int index); +int shl_node_get_in_number(struct shl_node *node); +int shl_node_get_out_number(struct shl_node *node); +int shl_node_get_non_const_in_number(struct shl_node *node); +struct shl_node *shl_node_get_in(struct shl_node *node, int index); +struct shl_node 
*shl_node_get_out(struct shl_node *node, int index); +int shl_node_restrict_map_insert(int value, struct shl_node *node); +int shl_node_find(struct shl_node **list, int len, struct shl_node *node); -#endif // INCLUDE_CSI_NODE_H_ +#endif // INCLUDE_SHL_NODE_H_ diff --git a/include/shl_ref.h b/include/shl_ref.h new file mode 100644 index 00000000..2ce6ef6c --- /dev/null +++ b/include/shl_ref.h @@ -0,0 +1,1206 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_REF_H_ +#define INCLUDE_SHL_REF_H_ + +#include "csi_nn.h" +#include "shl_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int shl_ref_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_abs_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_acos_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_acos_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_acosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_acosh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_add_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_add_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_and_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_and_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_and_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_arange_f32(struct csinn_tensor *output, struct 
csinn_arange_params *params); + +int shl_ref_arange_quant(struct csinn_tensor *output, struct csinn_arange_params *params); + +int shl_ref_argmax_stride_i32_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_argmax_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_argmin_stride_i32_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_argmin_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_asin_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_asin_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_asinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_asinh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_atan_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_atan_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_atanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_atanh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_avgpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_avgpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int 
shl_ref_avgpool3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_batch_normalization_f32(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); + +int shl_ref_batch_normalization_quant(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); + +int shl_ref_batch_to_space_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); + +int shl_ref_batch_to_space_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); + +int shl_ref_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); + +int shl_ref_broadcast_to_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); + +int shl_ref_ceil_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_ceil_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_ref_clip_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_ref_col2im_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params); + +int shl_ref_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +int shl_ref_concat_quant(struct csinn_tensor **input, struct csinn_tensor *output, 
+ struct csinn_concat_params *params); + +int shl_ref_conv1d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int shl_ref_conv1d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int shl_ref_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_relu_f32(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_relu_quant(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_ref_cache_matmul_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_ref_cache_matmul_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_ref_cache_conv1d_init(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_ref_cache_conv1d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_ref_cache_conv1d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_ref_conv2d_channel_relu_quant(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_channel_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_relu_f32(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_relu_quant(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct 
csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_channel_relu_quant(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_channel_relu6_quant(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_channel_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct 
csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_ref_conv3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_ref_cos_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_cos_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_cosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_cosh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_cumprod_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); + +int shl_ref_cumprod_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); + +int shl_ref_cumsum_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); + +int shl_ref_cumsum_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); + +int shl_ref_data_convert_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +int shl_ref_data_convert_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_deconv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_deconv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_deconv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct 
csinn_conv2d_params *params); + +int shl_ref_depthwise_deconv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_deconv3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_ref_deconv3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_ref_depth_to_space_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); + +int shl_ref_depth_to_space_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); + +int shl_ref_div_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_div_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_elu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_elu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_fsmn_f32(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params); + +int shl_ref_fsmn_quant(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params); + +int shl_ref_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, 
+ struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_erf_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_erf_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_exp_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_exp_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_expand_dims_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); + +int shl_ref_expand_dims_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); + +int shl_ref_expm1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_expm1_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); + +int shl_ref_flatten_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); + +int shl_ref_floor_divide_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_floor_divide_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_floor_mod_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_floor_mod_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + 
struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_floor_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_floor_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_fullyconnected_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_ref_fullyconnected_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_ref_gather_nd_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); + +int shl_ref_gather_nd_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); + +int shl_ref_gather_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); + +int shl_ref_gather_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); + +int shl_ref_global_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_global_avgpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_global_maxpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_greater_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int 
shl_ref_greater_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_greater_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_greater_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_hard_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_ref_hard_sigmoid_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_ref_im2col_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); + +int shl_ref_im2col_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); + +int shl_ref_isnan_bool_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_l2_normalization_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); + +int shl_ref_l2_normalization_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); + +int shl_ref_l2pool_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_layer_norm_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); + +int shl_ref_layer_norm_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); + +int shl_ref_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int 
shl_ref_leaky_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_less_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_less_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_less_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_less_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_log_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_ref_log_softmax_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_ref_log_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_log_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_log1p_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_log1p_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_logical_and_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_and_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_not_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_logical_not_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_siso_params *params); + +int shl_ref_logical_or_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_or_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_xor_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_xor_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_lrn_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); + +int shl_ref_lrn_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); + +int shl_ref_matmul_f32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_ref_matmul_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_ref_max_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_max_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_maximum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_maximum_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_maxpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int 
shl_ref_maxpool2d_locat_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_maxpool2d_locat_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_maxpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_maxpool3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_mean_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_mean_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_mean_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_min_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_min_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_minimum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_minimum_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_mod_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_mod_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_mul_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_mul_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, 
struct csinn_diso_params *params); + +int shl_ref_ndarray_size_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_ref_ndarray_size_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_ref_ndarray_size_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_ref_ndarray_size_i32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_ref_negative_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_negative_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_non_max_suppression_std(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params); + +int shl_ref_not_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_not_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_not_u32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_not_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_not_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_or_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_or_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_or_i8(struct csinn_tensor *input0, struct 
csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_pad_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int shl_ref_pad_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int shl_ref_power_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_power_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_prelu_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_ref_prelu_quant(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_ref_prod_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_prod_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_proposal_f32(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); + +int shl_ref_proposal_quant(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); + +int shl_ref_psroipooling_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params); + +int shl_ref_psroipooling_quant(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, + struct csinn_psroipooling_params *params); + +int shl_ref_reduce_logsumexp_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_reduce_params *params); + +int shl_ref_reduce_logsumexp_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_max_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_max_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_mean_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_mean_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_min_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_min_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_prod_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_prod_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_sum_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_sum_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relu1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relu1_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relu6_f32(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_relu_params *params); + +int shl_ref_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relun_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relun_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_ref_reshape_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_ref_resize_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); + +int shl_ref_resize_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); + +int shl_ref_reverse_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); + +int shl_ref_reverse_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); + +int shl_ref_roi_align_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params); + +int shl_ref_roipool_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params); + +int shl_ref_roipool_quant(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params); + +int shl_ref_round_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_round_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_rsqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_rsqrt_quant(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_scatter_nd_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); + +int shl_ref_scatter_nd_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); + +int shl_ref_unsorted_segment_max_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_max_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_max_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_max_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_mean_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_mean_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_mean_quant(struct csinn_tensor *input, + struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_mean_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_min_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); 
+ +int shl_ref_segment_min_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_min_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_min_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_prod_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_prod_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_prod_quant(struct csinn_tensor *input, + struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_prod_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_sum_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_sum_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_sum_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_sum_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_select_f32(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor 
*output, + struct csinn_select_params *params); + +int shl_ref_select_u8(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params); + +int shl_ref_select_i8(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params); + +int shl_ref_shape_i32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); + +int shl_ref_shape_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); + +int shl_ref_shape_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); + +int shl_ref_shuffle_channel_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); + +int shl_ref_shuffle_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); + +int shl_ref_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_ref_sigmoid_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_ref_sign_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sign_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sin_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sin_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sinh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_siso_params *params); + +int shl_ref_slice_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); + +int shl_ref_slice_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); + +int shl_ref_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_ref_softmax_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_ref_softplus_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_softplus_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_softrelu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_softrelu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_softsign_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_softsign_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_space_to_batch_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); + +int shl_ref_space_to_batch_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); + +int shl_ref_space_to_depth_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); + +int shl_ref_space_to_depth_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); + +int shl_ref_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); + +int shl_ref_split_quant(struct csinn_tensor *input, struct 
csinn_tensor **output, + struct csinn_split_params *params); + +int shl_ref_sqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sqrt_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_square_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params); + +int shl_ref_stack_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params); + +int shl_ref_stack_quant(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params); + +int shl_ref_strided_slice_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); + +int shl_ref_strided_slice_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); + +int shl_ref_sub_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_sub_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_sum_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_sum_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_tan_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_tan_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_tanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_tanh_f64(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_tanh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_threshold_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_threshold_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_tile_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params); + +int shl_ref_tile_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params); + +int shl_ref_topk_f32(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); + +int shl_ref_topk_quant(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); + +int shl_ref_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +int shl_ref_transpose_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +int shl_ref_trunc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_trunc_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_unpooling_f32(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params); + +int shl_ref_unpooling_quant(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params); + +int shl_ref_unstack_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params); + +int shl_ref_unstack_qunat(struct csinn_tensor *input, struct 
csinn_tensor **output, + struct csinn_unstack_params *params); + +int shl_ref_xor_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_xor_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_xor_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_yuv_rgb_scale_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_yuv_rgb_scale_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int32_t shl_ref_max_internal_s32(int32_t a, int32_t b); +int32_t shl_ref_min_internal_s32(int32_t a, int32_t b); +int32_t shl_ref_get_index(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, + int32_t index3); +int32_t shl_ref_get_index_5(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, + int32_t index3, int32_t index4); +int32_t shl_ref_get_index_iter(int32_t *dim, int dim_count, int32_t *index); +float shl_ref_get_scale(int32_t multiplier, int32_t shift); +float shl_ref_dequantize_u8_to_f32(uint8_t input, struct csinn_quant_info *qinfo); +float shl_ref_dequantize_i8_to_f32(int8_t input, struct csinn_quant_info *qinfo); +uint8_t shl_ref_quantize_f32_to_u8(float input, struct csinn_quant_info *qinfo); +int8_t shl_ref_quantize_f32_to_i8(float input, struct csinn_quant_info *qinfo); +uint8_t shl_ref_quantize_channel_u8(int32_t data, struct csinn_tensor *input, + struct csinn_tensor *output, float wscale); +int8_t shl_ref_quantize_channel_i8(int32_t data, struct csinn_tensor *input, + struct csinn_tensor *output, float wscale); +float shl_ref_uint8_to_float(uint8_t i, struct csinn_tensor *t); +float shl_ref_int8_to_float(int8_t i, struct csinn_tensor *t); +int16_t shl_ref_float32_to_float16(float 
value); +float shl_ref_float16_to_float32(int16_t value); +int16_t shl_ref_float32_to_bfloat16(float value); +float shl_ref_bfloat16_to_float32(int16_t value); +struct csinn_tensor *shl_ref_nchw_to_nhwc_8(struct csinn_tensor *t); +void shl_ref_nhwc_to_nchw_8(struct csinn_tensor *nt, struct csinn_tensor *t); +struct csinn_tensor *shl_ref_deconv_kernel_nchw_to_nhwc_f32(struct csinn_tensor *t, + int32_t permute[4]); +struct csinn_tensor *shl_ref_nchw_to_nhwc_f32(struct csinn_tensor *t); +void shl_ref_nhwc_to_nchw_f32(struct csinn_tensor *nt, struct csinn_tensor *t); +int32_t shl_ref_get_reduction_index(int32_t k, const int32_t *strides, const int32_t *extents, + int32_t n); +struct csinn_tensor *shl_ref_alloc_float_tensor(struct csinn_tensor *src); +void shl_ref_free_float_tensor(struct csinn_tensor *src); +struct csinn_tensor *shl_ref_convert_float_tensor(struct csinn_tensor *src); +void shl_ref_conv_free_float_tensor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias); +struct csinn_tensor *shl_ref_tensor_transform_f32(struct csinn_tensor *input); +int shl_ref_tensor_transform_free_f32(struct csinn_tensor *input); +uint8_t *shl_ref_f32_to_input_dtype(uint32_t index, float *data, struct csinn_session *sess); + +struct shl_ref_diso_callback { + void (*bc)(); + struct csinn_tensor *input0; + struct csinn_tensor *input1; + struct csinn_tensor *output; + int32_t *input_dim; +}; + +int shl_ref_diso_broadcast_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct shl_ref_diso_callback *cb); +int shl_ref_broadcast_to_shape(struct csinn_tensor *input, struct csinn_tensor *output, + int32_t *shape, int32_t shape_count); +int shl_ref_broadcast_to_shape_f32(struct csinn_tensor *input, struct csinn_tensor *output, + int32_t *shape, int32_t shape_count); +int shl_ref_broadcast_to_shape_quant(struct csinn_tensor *input, struct 
csinn_tensor *output, + int32_t *shape, int32_t shape_count); + +int shl_ref_siso_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + void *params, void *cb); +int shl_ref_diso_callback_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *params, void *cb); +int shl_ref_conv_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, void *params, + void *cb); + +void shl_ref_nn_init(struct csinn_tensor *input, struct csinn_tensor *output); + +void shl_ref_nn_deinit(struct csinn_tensor *input, struct csinn_tensor *output); + +int shl_ref_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_ref_reshape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_ref_transpose_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +void asr_buffer_init(struct csinn_asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth); + +void *asr_buffer_insert_front(struct csinn_asr_buffer_t *buffer, void *input, size_t len); + +void *asr_buffer_insert_back(struct csinn_asr_buffer_t *buffer, void *input, size_t len); + +void *asr_buffer_get_buffer(struct csinn_asr_buffer_t *buffer); + +void asr_buffer_reset(struct csinn_asr_buffer_t *buffer); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_REF_H_ diff --git a/include/shl_ref_i805.h b/include/shl_ref_i805.h new file mode 100644 index 00000000..a222c038 --- /dev/null +++ b/include/shl_ref_i805.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_I805_REF_H_ +#define INCLUDE_SHL_I805_REF_H_ + +#include "csi_nn.h" +#include "shl_ref.h" + +int shl_i805_ref_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_ref_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_ref_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_ref_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_ref_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_ref_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_ref_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_ref_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_i805_ref_softmax_q15(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_softmax_params *params); + +int shl_i805_ref_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_ref_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_ref_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_i805_ref_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_i805_ref_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_i805_ref_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +#endif // INCLUDE_SHL_I805_REF_H_ diff --git a/include/shl_thead_rvv.h b/include/shl_thead_rvv.h new file mode 100644 index 00000000..09c54cde --- /dev/null +++ b/include/shl_thead_rvv.h @@ -0,0 +1,668 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_RVV_H_ +#define INCLUDE_SHL_RVV_H_ + +#if __riscv_vector +#include <riscv_vector.h> + +#if (__riscv_v == 1000000) +#define RVV_1_0_0 +#elif (__riscv_v == 7000) +#define RVV_0_7_1 +#endif + +#ifdef __riscv_xtheadv +#define XTHEADV +#endif // __riscv_xtheadv + +#endif // __riscv_vector + +#include "csi_nn.h" +#include "shl_gref.h" +#include "shl_ref.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/********************************** initialization ******************************/ +int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_rvv_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias,
+ struct csinn_conv2d_params *params); + +int shl_rvv_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_rvv_data_convert_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +/************************************ convolution *********************************/ +/*********************************** im2col + gemm ********************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor 
*kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/******************************** conv2d1x1s1 + gemm ******************************/ +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void 
shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void 
shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/************************************* winograd ***********************************/ +void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_rvv_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_rvv_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/******************************* depthwise convolution ****************************/ +int shl_rvv_dwconv3x3s1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct 
csinn_conv2d_params *params); + +void shl_rvv_dwconv_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_dwconv_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_dwconv_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_dwconv3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/*************************************** gemm *************************************/ +void shl_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx); +void shl_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, int k, + int n, int ldc); + +void shl_rvv256_reorder_input_z16_fp32(float *b, float *sb, int k, int n, int ldx); +void 
shl_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc); + +void shl_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); +void shl_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc); + +void shl_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); +void shl_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); +void shl_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int m, int k, int n, int ldc); + +void shl_rvv_reorder_kernel_n8_int8(int8_t *a, int8_t *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z8_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc); +void shl_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc, int32_t out_zp, int32_t *mult, int32_t *shift); + +void shl_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv256_gemm_8x16_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, + int m, int k, int n, int ldc); + +void shl_rvv_reorder_input_n8_int4(int8_t *a, int8_t *sa, int m, int k, int ldx); +void shl_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb, int n, int k, int ldx); +void shl_rvv_gemm_8x8_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, + int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift); + +/************************************ gemm ncxhwx *********************************/ +void shl_rvv_reorder_kernel_packn_fp32(float *a, float *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z8_packn_fp32(float *b, float *sb, int k, int n, 
int ldx); +void shl_rvv_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, int ldc); +void shl_rvv_reorder_input_z12_packn_fp32(float *b, float *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, int ldc); + +void shl_rvv_reorder_kernel_packn_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z8_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, int ldc); +void shl_rvv_reorder_input_z12_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, int ldc); + +void shl_rvv_reorder_input_z8_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_8xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift); +void shl_rvv_reorder_input_z12_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_12xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift); + +void shl_rvv_reorder_input_z8_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_8xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift); + +void shl_rvv_reorder_input_z12_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_12xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, 
int32_t *shift); + +void shl_rvv_reorder_input_z12_pack1ton_fp32(float *b, float *sb, int inc, int maxk, int n, + int ldx); +void shl_rvv_reorder_input_z12_pack1ton_fp16(__fp16 *b, __fp16 *sb, int inc, int maxk, int n, + int ldx); +void shl_rvv_reorder_input_z12_pack1ton_int8(int8_t *b, int8_t *sb, int inc, int maxk, int n, + int ldx); + +/************************************ pooling *********************************/ +int shl_rvv_avgpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_maxpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_int8(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_global_avgpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_global_maxpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int 
shl_rvv_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_maxpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_avgpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int 
shl_rvv_avgpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_global_maxpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_maxpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_maxpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +/************************************ fullyconnected *********************************/ +void shl_rvv_fc_gemv_transform_weight_fp32(struct csinn_tensor *weights); +void shl_rvv_fc_gemv_transform_weight_fp16(struct csinn_tensor *weights); +void shl_rvv_fc_gemv_transform_weight_int8(struct csinn_tensor *weights); + +int shl_rvv_fullyconnected_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); +int shl_rvv_fullyconnected_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); +int shl_rvv_fullyconnected_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +void shl_rvv_fc_gemv_transform_weight_int8_dot(struct csinn_tensor *weights); +void shl_rvv_fc_gemv_transform_weight_int4_dot(struct csinn_tensor *weights); + +int 
shl_rvv_fullyconnected_packn_int8_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); +int shl_rvv_fullyconnected_packn_int4_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +/************************************ activation *********************************/ +int shl_rvv_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_rvv_relu6_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_relu6_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_rvv_leaky_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_leaky_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_rvv_sigmoid_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_rvv_softmax_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +/************************************ layout/memory transform *********************************/ +int shl_rvv_concat_fp32(struct csinn_tensor **input, struct csinn_tensor *output, + struct 
csinn_concat_params *params); +int shl_rvv_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); +int shl_rvv_concat_int8(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +/************************************ basic math *********************************/ +int shl_rvv_add_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_add_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_rvv_mul_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_mul_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_rvv_sum_stride_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +/************************************ utils *********************************/ +void shl_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left); +void shl_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left); +void shl_rvv_pad_input_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left, + int8_t pad_value); + +void shl_rvv_pad_input_packn_fp32(const float *input, float 
*input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left); +void shl_rvv_pad_input_packn_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left); +void shl_rvv_pad_input_packn_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left, + int8_t pad_value); + +void shl_rvv_pad_input_pack1ton_fp32(const float *input, float *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left); +void shl_rvv_pad_input_pack1ton_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left); +void shl_rvv_pad_input_pack1ton_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left, + int8_t pad_value); + +void shl_rvv_reorder_input_pack1ton_fp32(const float *src, float *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_pack1ton_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_pack1ton_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_packnto1_fp32(const float *src, float *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_packnto1_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_packnto1_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw); + +void shl_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size); + +void shl_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size); + +void shl_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left, int8_t pad_value); +void shl_rvv_int4_to_int8(int8_t *src, int8_t 
*dst, int size); +void shl_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size); +void shl_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size); +void shl_rvv_saturated_int4(int32_t *src, int8_t *dst, int32_t out_zp, int size); + +int shl_rvv_data_convert_int8_to_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +int shl_rvv_data_convert_int4_to_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int csrr_vl(); +int csrr_vlenb(); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_RVV_H_ diff --git a/include/shl_utils.h b/include/shl_utils.h new file mode 100644 index 00000000..706708a2 --- /dev/null +++ b/include/shl_utils.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_UTILS_H_ +#define INCLUDE_SHL_UTILS_H_ + +#include <assert.h> +#include <math.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#if (!defined SHL_BUILD_RTOS) +#include <omp.h> +#endif +#include "csinn_data_structure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void shl_get_top5(float *buf, uint32_t size, float *prob, uint32_t *cls); +void shl_show_top5(struct csinn_tensor *output, struct csinn_session *sess); +uint64_t shl_get_timespec(); +void shl_print_time_interval(uint64_t start, uint64_t end, const char *msg); +void shl_statistical_mean_std(float *data, int sz); +void shl_quantize_multiplier(double double_multiplier, int32_t *quantized_multiplier, + int32_t *shift); + +void shl_register_runtime_callback(int api, void *cb); +void shl_register_op_callback(int api, void *cb); +int shl_op_callback_map(struct csinn_params_base *base, int op, int dtype); + +void *shl_get_p0_cb(struct csinn_params_base *base); +void *shl_get_init_cb(struct csinn_params_base *base); + +enum csinn_rmode_enum shl_get_run_mode(struct csinn_params_base *base); + +struct shl_cb_op_list { + struct shl_cb_op_list *next; + enum csinn_dtype_enum dtype; + enum csinn_op_enum op_name; + struct csinn_callback *cb; +}; + +struct shl_cb_op_list *shl_cb_list_end(struct shl_cb_op_list *list); +struct csinn_callback *shl_cb_list_match(struct shl_cb_op_list *list, enum csinn_dtype_enum dtype, + enum csinn_op_enum op_name); + +struct shl_bm_sections { + int32_t graph_offset; + int32_t graph_size; + int32_t params_offset; + int32_t params_size; + int32_t info_offset; + int32_t info_size; + int32_t debug_offset; + int32_t debug_size; +}; + +struct shl_binary_model_section_info { + int32_t section_num; + int32_t section_info_size; + int32_t reserve[6]; + struct shl_bm_sections sections[127]; +}; + +char *shl_bm_header_str(); + +void shl_dump_bm_header(FILE *f); +void shl_dump_bm_section_info(FILE *f, struct shl_binary_model_section_info *info); +void
shl_dump_bm_graph_info_section(FILE *f, struct csinn_session *sess); +void shl_bm_session_load(struct csinn_session *dest, struct csinn_session *src); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_UTILS_H_ diff --git a/source/c860_opt/csi_u8_to_f32_c860.S b/source/c860_opt/shl_c860_u8_to_f32.S similarity index 91% rename from source/c860_opt/csi_u8_to_f32_c860.S rename to source/c860_opt/shl_c860_u8_to_f32.S index c4f013b9..33a431fd 100644 --- a/source/c860_opt/csi_u8_to_f32_c860.S +++ b/source/c860_opt/shl_c860_u8_to_f32.S @@ -16,11 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /** * - * void csi_u8_to_f32_c860( + * void shl_c860_u8_to_f32( * uint8_t *input, * float *output, * int32_t offset, @@ -30,11 +30,11 @@ **/ .file "utils.S" - .section .text.csi_u8_to_f32_c860,"ax",@progbits + .section .text.shl_c860_u8_to_f32,"ax",@progbits .align 2 - .global csi_u8_to_f32_c860 + .global shl_c860_u8_to_f32 -csi_u8_to_f32_c860: +shl_c860_u8_to_f32: ld.w t0, (sp, 0x0) // length vdupg.32 vr7, a2 // offset ld.w a3, (a3, 0) @@ -92,5 +92,5 @@ csi_u8_to_f32_c860: .L4: rts - .size csi_u8_to_f32_c860, .-csi_u8_to_f32_c860 + .size shl_c860_u8_to_f32, .-shl_c860_u8_to_f32 diff --git a/source/c860_opt/utils.S b/source/c860_opt/utils.S index 56e8cbf8..929a10f0 100644 --- a/source/c860_opt/utils.S +++ b/source/c860_opt/utils.S @@ -16,11 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /** * - * void csi_dequantize_f32_c860( + * void shl_c860_dequantize_f32( * uint8_t *input, * float *output, * int32_t offset, @@ -31,11 +31,11 @@ **/ .file "utils.S" - .section .text.csi_dequantize_f32_c860,"ax",@progbits + .section .text.shl_c860_dequantize_f32,"ax",@progbits .align 2 - .global csi_dequantize_f32_c860 + .global shl_c860_dequantize_f32 -csi_dequantize_f32_c860: +shl_c860_dequantize_f32: ld.w t0, (sp, 0x4) // length ld.w t3, (sp, 0x0) // shift vdupg.32 vr0, a3 @@ -98,5 +98,5 @@ csi_dequantize_f32_c860: .L4: rts - .size csi_dequantize_f32_c860, .-csi_dequantize_f32_c860 + .size shl_c860_dequantize_f32, .-shl_c860_dequantize_f32 diff --git a/source/c906_opt/abs.c b/source/c906_opt/abs.c index 6fe47754..bc48de8c 100644 --- a/source/c906_opt/abs.c +++ b/source/c906_opt/abs.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_abs_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_c906_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -56,10 +55,8 @@ int csi_c906_abs_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_abs_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_c906_abs_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/add.c b/source/c906_opt/add.c index 27247832..6d39b21f 100644 --- a/source/c906_opt/add.c +++ b/source/c906_opt/add.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" static void element_add_f32(float *input0, float *input1, float *output, int size) { @@ -49,18 +48,16 @@ static void element_add_f32(float *input0, float *input1, float *output, int siz ); } -int csi_c906_add_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // HACK: special case: tensorflow densenet121 // example: [1, 64, 55, 55] + [1, 64, 1, 1] = [1, 64, 55, 55] @@ -135,29 +132,28 @@ int csi_c906_add_f32(struct csi_tensor *input0, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); - - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); - 
csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_add_f32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] else { @@ -202,19 +198,16 @@ static void element_add_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int ); } - -int csi_c906_add_fp16(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); if ((input1->dim[2] == 1) && (input1->dim[3] == 1) && (input1->dim[1] == input0->dim[1])) { int inner_size = input0->dim[2] * input0->dim[3]; @@ -281,29 +274,28 @@ int csi_c906_add_fp16(struct csi_tensor *input0, } } if (!flag) { + __fp16 *in0_data_b = shl_mem_alloc(out_size * 2); + __fp16 *in1_data_b = shl_mem_alloc(out_size * 2); - __fp16 *in0_data_b = csi_mem_alloc(out_size * 2); - __fp16 *in1_data_b = csi_mem_alloc(out_size * 2); - - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - 
struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_add_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } else { int inner_size = in_size1; int outer_size = out_size / in_size1; diff --git a/source/c906_opt/avgpool.c b/source/c906_opt/avgpool.c index 6a82a177..7f26a8ad 100644 --- a/source/c906_opt/avgpool.c +++ b/source/c906_opt/avgpool.c @@ -16,18 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* pad_left = pad_top = 0 pad_right = 0 or 1 pad_down = 0 or 1 */ -static int avgpool2x2s2(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool2x2s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -201,10 +200,8 @@ static int avgpool2x2s2(struct csi_tensor *input, return CSINN_TRUE; } - -static int avgpool2x2s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -382,9 +379,8 @@ static int avgpool2x2s2_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int avgpool2x2s2_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool2x2s2_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -631,9 +627,8 @@ static int avgpool2x2s2_p1(struct csi_tensor *input, return CSINN_TRUE; } -static int avgpool2x2s2_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -892,9 +887,8 @@ static int avgpool2x2s2_p1_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int avgpool3x3s2(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params 
*params) +static int avgpool3x3s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1129,9 +1123,8 @@ static int avgpool3x3s2(struct csi_tensor *input, return CSINN_TRUE; } -static int avgpool3x3s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -1373,9 +1366,8 @@ static int avgpool3x3s2_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int avgpool3x3s2_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s2_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1725,9 +1717,8 @@ static int avgpool3x3s2_p1(struct csi_tensor *input, return CSINN_TRUE; } -static int avgpool3x3s2_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -2082,14 +2073,12 @@ static int avgpool3x3s2_p1_fp16(struct csi_tensor *input, return CSINN_TRUE; } - /* pad_left = pad_right = pad_top = pad_down = 1 in_w = out_w in_h = out_h */ -static int avgpool3x3s1_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s1_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float 
*)output->data; @@ -2397,9 +2386,8 @@ static int avgpool3x3s1_p1(struct csi_tensor *input, return CSINN_TRUE; } -static int avgpool3x3s1_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -2731,10 +2719,8 @@ static int avgpool3x3s1_p1_fp16(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_avgpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int32_t input_h = input->dim[2]; int32_t input_w = input->dim[3]; @@ -2749,14 +2735,15 @@ int csi_c906_avgpool2d_init(struct csi_tensor *input, int32_t pad_top = params->pad_top; int32_t pad_down = params->pad_down; - params->base.bc = NULL; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; // global avgpool2d if (input_h == kernel_h && input_w == kernel_w) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_global_avgpool2d_f32; + cb->exec = shl_c906_global_avgpool2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_c906_global_avgpool2d_fp16; + cb->exec = shl_c906_global_avgpool2d_fp16; } return CSINN_TRUE; } @@ -2774,15 +2761,15 @@ int csi_c906_avgpool2d_init(struct csi_tensor *input, // end consider ceil_mode 2x2s2p0 if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool2x2s2; + cb->exec = avgpool2x2s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool2x2s2_fp16; + cb->exec = avgpool2x2s2_fp16; } } else if (pad_left == 1 && pad_top == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool2x2s2_p1; + cb->exec = avgpool2x2s2_p1; } else if (input->dtype == 
CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool2x2s2_p1_fp16; + cb->exec = avgpool2x2s2_p1_fp16; } } } else if (kernel_h == 3 && kernel_w == 3) { @@ -2797,15 +2784,15 @@ int csi_c906_avgpool2d_init(struct csi_tensor *input, // end consider ceil_mode 3x3s2p0 if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool3x3s2; + cb->exec = avgpool3x3s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool3x3s2_fp16; + cb->exec = avgpool3x3s2_fp16; } } else if (pad_left == 1 && pad_top == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool3x3s2_p1; + cb->exec = avgpool3x3s2_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool3x3s2_p1_fp16; + cb->exec = avgpool3x3s2_p1_fp16; } } } @@ -2813,20 +2800,22 @@ int csi_c906_avgpool2d_init(struct csi_tensor *input, if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool3x3s1_p1; + cb->exec = avgpool3x3s1_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool3x3s1_p1_fp16; + cb->exec = avgpool3x3s1_p1_fp16; } } } } - if (params->base.bc == NULL) { - csi_debug_warning("avgpool is not optimized to achieve under this condition on C906, call reference func replaced.\n"); + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on C906, call reference func " + "replaced.\n"); if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_avgpool2d_f32; + cb->exec = shl_ref_avgpool2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_avgpool2d_quant; + cb->exec = shl_ref_avgpool2d_quant; } } return CSINN_TRUE; diff --git a/source/c906_opt/broadcast_to.c b/source/c906_opt/broadcast_to.c index 0563179d..a53fcaf9 100644 --- a/source/c906_opt/broadcast_to.c +++ b/source/c906_opt/broadcast_to.c @@ -16,13 +16,12 @@ 
* limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_broadcast_to_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int shl_c906_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; diff --git a/source/c906_opt/cache_conv1d.c b/source/c906_opt/cache_conv1d.c index 12c692d1..efe62845 100644 --- a/source/c906_opt/cache_conv1d.c +++ b/source/c906_opt/cache_conv1d.c @@ -16,42 +16,43 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_c906_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { size_t data_size = output->dim[0] * output->dim[1] * output->dim[2] * sizeof(__fp16); // 512*13*2 asr_buffer_init_c906(&params->asr_buffer, 2 * data_size, data_size); + struct csinn_callback *cb = params->base.cb; if (input->dtype == CSINN_DTYPE_FLOAT16) { __fp16 *weight_data = (__fp16 *)weight->data; int n = weight->dim[0]; // out_nodes int k = weight->dim[1]; // in_nodes if (k % 16 != 0) { - csi_debug_error("out_nodes num should be multiple of 16\n"); + shl_debug_error("out_nodes num should be multiple of 16\n"); } - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(n * k * sizeof(__fp16)); - csi_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(n * k * sizeof(__fp16)); + shl_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k,
k); - csi_c906_memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); + shl_c906_memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); params->data = weight_data; - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); - params->base.bc = csi_c906_cache_conv1d_fp16; + cb->exec = shl_c906_cache_conv1d_fp16; } return CSINN_TRUE; } -int csi_c906_cache_conv1d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_c906_cache_conv1d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { __fp16 *input_data = input->data; __fp16 *output_data = output->data; diff --git a/source/c906_opt/cache_matmul.c b/source/c906_opt/cache_matmul.c index 6810ce48..c670be37 100644 --- a/source/c906_opt/cache_matmul.c +++ b/source/c906_opt/cache_matmul.c @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" -#include "csi_memory.h" +#include "shl_c906.h" +#include "shl_memory.h" // asr data buffer -void asr_buffer_init_c906(struct asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth) +void asr_buffer_init_c906(struct csinn_asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth) { - buffer->buffer = csi_mem_alloc(buffer_size); + buffer->buffer = shl_mem_alloc(buffer_size); buffer->buffer_lenth = buffer_size; buffer->data_lenth = data_lenth; buffer->writer_index = buffer_size - data_lenth; @@ -32,7 +32,7 @@ void asr_buffer_init_c906(struct asr_buffer_t *buffer, size_t buffer_size, size_ } // insert front -void *asr_buffer_insert_c906_front(struct asr_buffer_t *buffer, void *input, size_t len) +void *asr_buffer_insert_c906_front(struct csinn_asr_buffer_t *buffer, void *input, size_t len) { int start_position = buffer->writer_index - len; uint8_t *p = NULL; @@ -60,7 +60,7 @@ void *asr_buffer_insert_c906_front(struct asr_buffer_t *buffer, void *input, siz } } -void *asr_buffer_insert_c906_back(struct asr_buffer_t *buffer, void *input, size_t len) +void *asr_buffer_insert_c906_back(struct csinn_asr_buffer_t *buffer, void *input, size_t len) { int end_position = buffer->writer_index + len; uint8_t *p = NULL; @@ -80,15 +80,15 @@ void *asr_buffer_insert_c906_back(struct asr_buffer_t *buffer, void *input, size } // get buffer -void *asr_buffer_get_buffer_c906(struct asr_buffer_t *buffer) +void *asr_buffer_get_buffer_c906(struct csinn_asr_buffer_t *buffer) { return asr_buffer_insert_c906_back(buffer, NULL, 0); } // reset buffer -void asr_buffer_reset_c906(struct asr_buffer_t *buffer) +void asr_buffer_reset_c906(struct csinn_asr_buffer_t *buffer) { - csi_mem_free(buffer->buffer); + shl_mem_free(buffer->buffer); buffer->writer_index = 0; buffer->buffer = NULL; buffer->buffer_lenth = 0; @@ -96,9 +96,9 @@ void asr_buffer_reset_c906(struct asr_buffer_t *buffer) buffer->flag = 0; } -int 
csi_c906_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) +int shl_c906_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { size_t data_size = params->shape[0] * params->shape[1] * params->shape[2] * params->shape[3] * sizeof(__fp16); @@ -107,28 +107,29 @@ int csi_c906_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *outp int accum_depth = weight->dim[0]; int output_depth = weight->dim[1]; + struct csinn_callback *cb = params->base.cb; if (input->dtype == CSINN_DTYPE_FLOAT16) { __fp16 *weight_data = (__fp16 *)weight->data; int n = weight->dim[0]; // out_nodes int k = weight->dim[1]; // in_nodes if (k % 16 != 0) { - csi_debug_error("out_nodes num should be multiple of 16\n"); + shl_debug_error("out_nodes num should be multiple of 16\n"); } - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(n * k * sizeof(__fp16)); - csi_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(n * k * sizeof(__fp16)); + shl_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); - csi_c906_memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); + shl_c906_memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); params->data = weight_data; - csi_mem_free(pa_reorder); - params->base.bc = csi_c906_cache_matmul_fp16; + shl_mem_free(pa_reorder); + cb->exec = shl_c906_cache_matmul_fp16; } return CSINN_TRUE; } -int csi_c906_cache_matmul_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) +int shl_c906_cache_matmul_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { int 
accum_depth = weight->dim[0]; int output_depth = weight->dim[1]; diff --git a/source/c906_opt/clip.c b/source/c906_opt/clip.c index db9fc59c..5e34b9dd 100644 --- a/source/c906_opt/clip.c +++ b/source/c906_opt/clip.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_clip_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int shl_c906_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -58,10 +57,8 @@ int csi_c906_clip_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_clip_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int shl_c906_clip_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/concat.c b/source/c906_opt/concat.c index 9c1c0d15..790f5105 100644 --- a/source/c906_opt/concat.c +++ b/source/c906_opt/concat.c @@ -18,12 +18,10 @@ /* CSI-NN2 version 1.9.x */ -#include "csi_c906.h" +#include "shl_c906.h" - -int csi_c906_concat_f32(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int shl_c906_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -38,21 +36,19 @@ int csi_c906_concat_f32(struct csi_tensor **input, float *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; float *input_item_data = input_item->data; const int copy_size 
= input_item->dim[params->axis] * base_inner_size; const float *input_ptr = input_item_data + k * copy_size; - csi_c906_memcpy(output_ptr, input_ptr, copy_size * sizeof(float)); + shl_c906_memcpy(output_ptr, input_ptr, copy_size * sizeof(float)); output_ptr += copy_size; } } return CSINN_TRUE; } - -int csi_c906_concat_fp16(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int shl_c906_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -67,11 +63,11 @@ int csi_c906_concat_fp16(struct csi_tensor **input, __fp16 *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; __fp16 *input_item_data = input_item->data; const int copy_size = input_item->dim[params->axis] * base_inner_size; const __fp16 *input_ptr = input_item_data + k * copy_size; - csi_c906_memcpy(output_ptr, input_ptr, copy_size * sizeof(__fp16)); + shl_c906_memcpy(output_ptr, input_ptr, copy_size * sizeof(__fp16)); output_ptr += copy_size; } } diff --git a/source/c906_opt/convolution.c b/source/c906_opt/convolution.c index cdfb544a..76449afb 100644 --- a/source/c906_opt/convolution.c +++ b/source/c906_opt/convolution.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* only support layout:NCHW @@ -26,11 +26,9 @@ kernel layout: O I h w output layout: N O H W */ -int csi_c906_conv2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t out_c = kernel->dim[0]; int32_t in_c = kernel->dim[1]; @@ -42,6 +40,7 @@ int csi_c906_conv2d_init(struct csi_tensor *input, int32_t stride_w = params->stride_width; int32_t dalition_h = params->dilation_height; int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; // check int out_height = (in_h + params->pad_top + params->pad_down - kernel_h) / stride_h + 1; @@ -54,12 +53,12 @@ int csi_c906_conv2d_init(struct csi_tensor *input, if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && dalition_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_c906_conv1x1s1_sgemm_transform_kernel(kernel, params); - params->base.bc = csi_c906_conv1x1s1_sgemm; + shl_c906_conv1x1s1_sgemm_transform_kernel(kernel, params); + cb->exec = shl_c906_conv1x1s1_sgemm; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_c906_conv1x1s1_sgemm_fp16; - // params->base.bc = csi_c906_conv1x1s1_batch_gemv_fp16; + shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, params); + cb->exec = shl_c906_conv1x1s1_sgemm_fp16; + // cb->exec = shl_c906_conv1x1s1_batch_gemv_fp16; } // winograd convolution condition: @@ -67,66 +66,63 @@ int csi_c906_conv2d_init(struct csi_tensor *input, if (input->dtype == CSINN_DTYPE_FLOAT32) { if (params->group > 1) { 
params->conv_extra.conv_mode = CSINN_GEMM; - csi_c906_conv_im2col_sgemm_transform_kernel(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm; + shl_c906_conv_im2col_sgemm_transform_kernel(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm; return CSINN_TRUE; } // pack4 for winograd convolution if ( (out_c % 4 == 0) && (in_c % 4 ==0) ) { params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(kernel, t_kernel); + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(kernel, t_kernel); params->conv_extra.kernel_tm = t_kernel; - params->base.bc = csi_c906_conv3x3s1_winograd64_pack4; + cb->exec = shl_c906_conv3x3s1_winograd64_pack4; } else { params->conv_extra.conv_mode = CSINN_GEMM; - csi_c906_conv_im2col_sgemm_transform_kernel(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm; + shl_c906_conv_im2col_sgemm_transform_kernel(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm; } } else if (input->dtype == CSINN_DTYPE_FLOAT16) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; - csi_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm_fp16; + shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm_fp16; return CSINN_TRUE; } // pack8 for winograd convolution if ( (out_c % 8 == 0) && (in_c % 8 ==0) ) { params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(kernel, t_kernel); + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(kernel, t_kernel); params->conv_extra.kernel_tm = t_kernel; - params->base.bc = csi_c906_conv3x3s1_winograd64_pack8_fp16; + cb->exec = 
shl_c906_conv3x3s1_winograd64_pack8_fp16; } else { params->conv_extra.conv_mode = CSINN_GEMM; - csi_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm_fp16; + shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm_fp16; } } } else { params->conv_extra.conv_mode = CSINN_GEMM; if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_c906_conv_im2col_sgemm_transform_kernel(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm; + shl_c906_conv_im2col_sgemm_transform_kernel(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm_fp16; + shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm_fp16; } } return CSINN_TRUE; } - -int csi_c906_depthwise_conv2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_depthwise_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t batch = input->dim[0]; int32_t in_ch = input->dim[1]; @@ -141,48 +137,49 @@ int csi_c906_depthwise_conv2d_init(struct csi_tensor *input, int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_dwconv3x3s1; + cb->exec = shl_c906_dwconv3x3s1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_c906_dwconv3x3s1_fp16; + cb->exec = shl_c906_dwconv3x3s1_fp16; } } else if (kernel_h == 3 && 
kernel_w == 3 && stride_h == 2 && stride_w == 2) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_dwconv3x3s2; + cb->exec = shl_c906_dwconv3x3s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_c906_dwconv3x3s2_fp16; + cb->exec = shl_c906_dwconv3x3s2_fp16; } } else if (kernel_h == 5 && kernel_w == 5 && stride_h == 1 && stride_w == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_dwconv5x5s1; + cb->exec = shl_c906_dwconv5x5s1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_depthwise_conv2d_quant; + cb->exec = shl_ref_depthwise_conv2d_quant; } } else if (kernel_h == 5 && kernel_w == 5 && stride_h == 2 && stride_w == 2) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_dwconv5x5s2; + cb->exec = shl_c906_dwconv5x5s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_depthwise_conv2d_quant; + cb->exec = shl_ref_depthwise_conv2d_quant; } } else { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_depthwise_conv2d_f32; + cb->exec = shl_ref_depthwise_conv2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { if (params->pad_left == 0 && params->pad_top == 0 && input->dim[1] == output->dim[1]) { - params->base.bc = csi_c906_dwconv2d_s1_pad0_fp16; + cb->exec = shl_c906_dwconv2d_s1_pad0_fp16; } else { - params->base.bc = csi_ref_depthwise_conv2d_quant; + cb->exec = shl_ref_depthwise_conv2d_quant; } } } return CSINN_TRUE; } -int csi_c906_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params) +int shl_c906_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { int32_t out_c = kernel->dim[0]; int32_t in_c = kernel->dim[1]; @@ -190,6 +187,7 @@ int csi_c906_conv1d_init(struct csi_tensor *input, 
struct csi_tensor *output, int32_t kernel_w = kernel->dim[2]; int32_t stride_w = params->stride_width; int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; // check output_dim int out_width = (in_w + params->pad_left + params->pad_right - kernel_w) / stride_w + 1; @@ -199,17 +197,18 @@ int csi_c906_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, } if (kernel_w == 1 && stride_w == 1 && dalition_w == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_c906_conv1x1s1_sgemm_transform_kernel(kernel, (struct conv2d_params *)params); - params->base.bc = csi_c906_conv1x1s1_sgemm; + shl_c906_conv1x1s1_sgemm_transform_kernel(kernel, (struct csinn_conv2d_params *)params); + cb->exec = shl_c906_conv1x1s1_sgemm; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, (struct conv2d_params *)params); - params->base.bc = csi_c906_conv1x1s1_sgemm_fp16; + shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, + (struct csinn_conv2d_params *)params); + cb->exec = shl_c906_conv1x1s1_sgemm_fp16; } } else { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_conv1d_f32; + cb->exec = shl_ref_conv1d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_conv1d_quant; + cb->exec = shl_ref_conv1d_quant; } } return CSINN_TRUE; diff --git a/source/c906_opt/convolution_1x1_fp16.c b/source/c906_opt/convolution_1x1_fp16.c index 71f51ecd..fa4a369a 100644 --- a/source/c906_opt/convolution_1x1_fp16.c +++ b/source/c906_opt/convolution_1x1_fp16.c @@ -16,32 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -void csi_c906_conv1x1s1_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { __fp16 *kernel_data = (__fp16 *)kernel->data; int group = params->group; - int m = kernel->dim[0] / group; // out_ch - int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + int m = kernel->dim[0] / group; // out_ch + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) - __fp16* pa_reorder = (__fp16 *)csi_mem_alloc(group * m * k * sizeof(__fp16)); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); for (int g = 0; g < group; g++) { - csi_c906_reorder_kernel_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_c906_reorder_kernel_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv1x1s1_sgemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -49,7 +47,7 @@ int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t batch = input->dim[0]; // assert(batch == 1); int32_t in_ch = input->dim[1]; int32_t out_ch = kernel->dim[0]; int32_t out_h = output->dim[2]; @@ -59,7 +57,7 @@ int 
csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, int32_t k = in_ch / group; int32_t n = out_h * out_w; - __fp16* pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -67,17 +65,17 @@ int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, __fp16 *pb = pb_reorder; __fp16 *pc = output_data; // pack - csi_nn_rvv_reorder_input_z16_fp16(input_data, pb, k, n, n); - // csi_c906_reorder_input_fp16_1(input_data, pb, k, n, n); + shl_rvv_reorder_input_z16_fp16(input_data, pb, k, n, n); + // shl_c906_reorder_input_fp16_1(input_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x16_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); - // csi_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + // shl_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); return CSINN_TRUE; } @@ -85,11 +83,9 @@ int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, matrix: input data matrix vector: kernel data row */ -int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv1x1s1_batch_gemv_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -97,7 +93,7 @@ int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t batch = input->dim[0]; // assert(batch == 1); 
int32_t in_ch = input->dim[1]; int32_t out_ch = kernel->dim[0]; int32_t out_h = output->dim[2]; @@ -107,13 +103,13 @@ int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, int32_t k = in_ch / group; int32_t n = out_h * out_w; - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(out_ch * sizeof(__fp16)); + bias_data = (__fp16 *)shl_mem_alloc(out_ch * sizeof(__fp16)); } - __fp16* pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -123,20 +119,20 @@ int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, __fp16 *bias_tmp = bias_data + g * m; // pack/reorder - csi_c906_reorder_matrix_z16_fp16(input_data, pb, k, n, n); + shl_c906_reorder_matrix_z16_fp16(input_data, pb, k, n, n); // batch GEMV for (int j = 0; j < m; j++) { - csi_c906_gemv_trans_pack16_fp16(pc + j * n, pa + j * k, pb, k, n, n, bias_tmp + j); + shl_c906_gemv_trans_pack16_fp16(pc + j * n, pa + j * k, pb, k, n, n, bias_tmp + j); } input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; diff --git a/source/c906_opt/convolution_1x1.c b/source/c906_opt/convolution_1x1_fp32.c similarity index 50% rename from source/c906_opt/convolution_1x1.c rename to source/c906_opt/convolution_1x1_fp32.c index 1cb90509..6d2fcb34 100644 --- a/source/c906_opt/convolution_1x1.c +++ b/source/c906_opt/convolution_1x1_fp32.c @@ -16,34 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -void csi_c906_conv1x1s1_sgemm_transform_kernel(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_c906_conv1x1s1_sgemm_transform_kernel(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { float *kernel_data = (float *)kernel->data; int group = params->group; - int m = kernel->dim[0] / group; // out_ch / group - int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + int m = kernel->dim[0] / group; // out_ch / group + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) - float* pa_reorder = (float *)csi_mem_alloc(group * m * k * sizeof(float)); + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); for (int g = 0; g < group; g++) { - csi_c906_reorder_kernel(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_c906_reorder_kernel(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } - -static int csi_c906_conv1x1s1_sgemm_base(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params, - bool fuse_relu) +static int shl_c906_conv1x1s1_sgemm_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, bool fuse_relu) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -51,7 +47,7 @@ static int csi_c906_conv1x1s1_sgemm_base(struct csi_tensor *input, float *bias_data = (float *)bias->data; int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t batch = input->dim[0]; // assert(batch == 1); int32_t in_ch = input->dim[1]; int32_t out_ch = kernel->dim[0]; int32_t out_h = 
output->dim[2]; @@ -61,7 +57,7 @@ static int csi_c906_conv1x1s1_sgemm_base(struct csi_tensor *input, int32_t k = in_ch / group; int32_t n = out_h * out_w; - float* pb_reorder = (float *)csi_mem_alloc(k * n * sizeof(float)); + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -69,34 +65,29 @@ static int csi_c906_conv1x1s1_sgemm_base(struct csi_tensor *input, float *pb = pb_reorder; float *pc = output_data; // pack - csi_c906_reorder_input_1(input_data, pb, k, n, n); + shl_c906_reorder_input_1(input_data, pb, k, n, n); // GEMM - csi_c906_sgemm_kernel_f32(pc, pa, pb, m, k, n, n, bias_data + g * m, fuse_relu); + shl_c906_sgemm_kernel_f32(pc, pa, pb, m, k, n, n, bias_data + g * m, fuse_relu); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); return CSINN_TRUE; } -int csi_c906_conv1x1s1_sgemm(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv1x1s1_sgemm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { bool fuse_relu = 0; - return csi_c906_conv1x1s1_sgemm_base(input, output, kernel, bias, params, fuse_relu); + return shl_c906_conv1x1s1_sgemm_base(input, output, kernel, bias, params, fuse_relu); } - -int csi_c906_conv1x1s1_sgemm_fuse_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv1x1s1_sgemm_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { bool fuse_relu = 1; - return csi_c906_conv1x1s1_sgemm_base(input, output, kernel, bias, params, fuse_relu); + return shl_c906_conv1x1s1_sgemm_base(input, 
output, kernel, bias, params, fuse_relu); } \ No newline at end of file diff --git a/source/c906_opt/convolution_3x3_fp16.c b/source/c906_opt/convolution_3x3_fp16.c index b81ed12f..b197671c 100644 --- a/source/c906_opt/convolution_3x3_fp16.c +++ b/source/c906_opt/convolution_3x3_fp16.c @@ -16,8 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - +/* CSI-NN2 version 2.0.x */ /* the conditions for using winograd convolution @@ -27,7 +26,7 @@ input_width <= 120 */ -#include "csi_c906.h" +#include "shl_c906.h" /* padding input for winograd input transform , and change memory layout to [n c/8 h w 8] @@ -36,120 +35,109 @@ constrain: input channel % 8 = 0 */ -void csi_c906_pad_input_pack1to8_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left) +void shl_c906_pad_input_pack1to8_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left) { int inc8 = inc / 8; int padded_hw = padded_h * padded_w; __fp16 *pad_ptr = input_padded; __fp16 *inp_ptr = (__fp16 *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) + int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "vmv.v.x v2, zero\n\t" // clear v2, for memset value 0 - "mulw t1, %6, %7\n\t" // pad_top * padded_w - "mulw t2, %6, %9\n\t" // pad_down * padded_w - "mulw t0, %3, %4\n\t" // input_size per_channel - "slli t0, t0, 1\n\t" // load stride = input_size * 2 - "slli t6, t0, 3\n\t" // t6 = input_size * 8 * 2 + "vmv.v.x v2, zero\n\t" // clear v2, for memset value 0 + "mulw t1, %6, %7\n\t" // pad_top * padded_w + "mulw t2, %6, %9\n\t" // pad_down * padded_w + "mulw t0, %3, %4\n\t" // input_size per_channel + 
"slli t0, t0, 1\n\t" // load stride = input_size * 2 + "slli t6, t0, 3\n\t" // t6 = input_size * 8 * 2 - "1:\n\t" // channel loop [inc/8] - "mv a0, %0\n\t" // update input_addr - "mv t5, %3\n\t" // t5 = in_h - "beqz %7, 3f\n\t" // if pad_top = 0 - "mv t3, t1\n\t" // t3 = num to memset + "1:\n\t" // channel loop [inc/8] + "mv a0, %0\n\t" // update input_addr + "mv t5, %3\n\t" // t5 = in_h + "beqz %7, 3f\n\t" // if pad_top = 0 + "mv t3, t1\n\t" // t3 = num to memset - "2:\n\t" // pad h_top - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "2:\n\t" // pad h_top + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 2b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" - "3:\n\t" // pad h_mid - "mv t4, %4\n\t" // t4 = in_w - "beqz %8, 5f\n\t" // if pad_left = 0 - "mv t3, %8\n\t" // t3 = pad_left + "3:\n\t" // pad h_mid + "mv t4, %4\n\t" // t4 = in_w + "beqz %8, 5f\n\t" // if pad_left = 0 + "mv t3, %8\n\t" // t3 = pad_left - "4:\n\t" // pad w_left - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "4:\n\t" // pad w_left + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 4b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" - "5:\n\t" // pad w_mid - "vlse.v v4, (a0), t0\n\t" - "addi a0, a0, 2\n\t" - "vse.v v4, (%1)\n\t" - "addi %1, %1, 16\n\t" + "5:\n\t" // pad w_mid + "vlse.v v4, (a0), t0\n\t" + "addi a0, a0, 2\n\t" + "vse.v v4, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t4, t4, -1\n\t" - "bnez t4, 5b\n\t" + "addi t4, t4, -1\n\t" + "bnez t4, 5b\n\t" - "beqz %10, 7f\n\t" // if pad_right = 0 - "mv t3, %10\n\t" // t3 = pad_right + "beqz %10, 7f\n\t" // if pad_right = 0 + "mv t3, %10\n\t" // t3 = pad_right - "6:\n\t" // pad w_right - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "6:\n\t" // pad w_right + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 6b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" "7:\n\t" - "addi t5, t5, -1\n\t" - "bnez t5, 3b\n\t" + "addi t5, t5, -1\n\t" + "bnez t5, 
3b\n\t" - "beqz %9, 9f\n\t" // if pad_down = 0 - "mv t3, t2\n\t" // t3 = num to memset 0 + "beqz %9, 9f\n\t" // if pad_down = 0 + "mv t3, t2\n\t" // t3 = num to memset 0 - "8:\n\t" // pad h_down - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "8:\n\t" // pad h_down + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 8b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 8b\n\t" - "9:\n\t" - "add %0, %0, t6\n\t" // input_data jump to next 8 channel + "9:\n\t" + "add %0, %0, t6\n\t" // input_data jump to next 8 channel "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(inp_ptr), // %0 - "=r"(pad_ptr), // %1 - "=r"(inc8), // %2 - "=r"(inh), // %3 - "=r"(inw), // %4 - "=r"(padded_hw), // %5 - "=r"(padded_w), // %6 - "=r"(pad_top), // %7 - "=r"(pad_left), // %8 - "=r"(resi_h), // %9 - "=r"(resi_w) // %10 - :"0"(inp_ptr), - "1"(pad_ptr), - "2"(inc8), - "3"(inh), - "4"(inw), - "5"(padded_hw), - "6"(padded_w), - "7"(pad_top), - "8"(pad_left), - "9"(resi_h), - "10"(resi_w) - :"cc", "memory", "v2", "v4", - "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6" - ); - + : "=r"(inp_ptr), // %0 + "=r"(pad_ptr), // %1 + "=r"(inc8), // %2 + "=r"(inh), // %3 + "=r"(inw), // %4 + "=r"(padded_hw), // %5 + "=r"(padded_w), // %6 + "=r"(pad_top), // %7 + "=r"(pad_left), // %8 + "=r"(resi_h), // %9 + "=r"(resi_w) // %10 + : "0"(inp_ptr), "1"(pad_ptr), "2"(inc8), "3"(inh), "4"(inw), "5"(padded_hw), "6"(padded_w), + "7"(pad_top), "8"(pad_left), "9"(resi_h), "10"(resi_w) + : "cc", "memory", "v2", "v4", "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); } -void csi_c906_crop_output_pack8to1_fp16(const __fp16 *output_trans, __fp16 *output, int out_c, int out_h, int out_w, - int wino_h, int wino_w) +void shl_c906_crop_output_pack8to1_fp16(const __fp16 *output_trans, __fp16 *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) { int out_c8 = out_c / 8; __fp16 *out_tm_ptr = (__fp16 *)output_trans; @@ -158,62 +146,56 @@ void csi_c906_crop_output_pack8to1_fp16(const __fp16 
*output_trans, __fp16 *outp asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mulw t0, %3, %4\n\t" // output_size per_channel - "slli t0, t0, 1\n\t" // store_stride = output_size * 2 + "mulw t0, %3, %4\n\t" // output_size per_channel + "slli t0, t0, 1\n\t" // store_stride = output_size * 2 "slli t3, t0, 3\n\t" // t3 = output_size * 8 * 2 "slli t4, %6, 4\n\t" // t4 = wino_w * 8 * 2 - "mulw t5, %5, %6\n\t" // crop_size per_channel - "slli t5, t5, 4\n\t" // t5 = crop_size * 8 * 2 + "mulw t5, %5, %6\n\t" // crop_size per_channel + "slli t5, t5, 4\n\t" // t5 = crop_size * 8 * 2 - "1:\n\t" // channel loop [out_ch / 8] - "mv a1, %1\n\t" // update output_addr - "mv a0, %0\n\t" // update crop_addr per-channel + "1:\n\t" // channel loop [out_ch / 8] + "mv a1, %1\n\t" // update output_addr + "mv a0, %0\n\t" // update crop_addr per-channel - "mv t1, %3\n\t" // t1 = out_h + "mv t1, %3\n\t" // t1 = out_h - "2:\n\t" // crop h - "mv t2, %4\n\t" // t2 = out_w - "mv s1, a0\n\t" // update crop_addr per-row + "2:\n\t" // crop h + "mv t2, %4\n\t" // t2 = out_w + "mv s1, a0\n\t" // update crop_addr per-row - "3:\n\t" // crop w - "vle.v v2, (s1)\n\t" - "addi s1, s1, 16\n\t" - "vsse.v v2, (a1), t0\n\t" - "addi a1, a1, 2\n\t" + "3:\n\t" // crop w + "vle.v v2, (s1)\n\t" + "addi s1, s1, 16\n\t" + "vsse.v v2, (a1), t0\n\t" + "addi a1, a1, 2\n\t" - "addi t2, t2, -1\n\t" - "bnez t2, 3b\n\t" + "addi t2, t2, -1\n\t" + "bnez t2, 3b\n\t" - "add a0, a0, t4\n\t" // crop-data jump to next row + "add a0, a0, t4\n\t" // crop-data jump to next row - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" - "4:\n\t" - "add %1, %1, t3\n\t" // output_data jump to next 8 channel - "add %0, %0, t5\n\t" // crop-data jump to next 8 channel + "4:\n\t" + "add %1, %1, t3\n\t" // output_data jump to next 8 channel + "add %0, %0, t5\n\t" // crop-data jump to next 8 channel "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(out_tm_ptr), // %0 - "=r"(out_ptr), // %1 - "=r"(out_c8), // %2 - 
"=r"(out_h), // %3 - "=r"(out_w), // %4 - "=r"(wino_h), // %5 - "=r"(wino_w) // %6 - :"0"(out_tm_ptr), - "1"(out_ptr), - "2"(out_c8), - "3"(out_h), - "4"(out_w), - "5"(wino_h), - "6"(wino_w) - :"cc", "memory", "v2", "v3", "a0", "a1", "s1", - "t0", "t1", "t2", "t3", "t4", "t5" + : "=r"(out_tm_ptr), // %0 + "=r"(out_ptr), // %1 + "=r"(out_c8), // %2 + "=r"(out_h), // %3 + "=r"(out_w), // %4 + "=r"(wino_h), // %5 + "=r"(wino_w) // %6 + : "0"(out_tm_ptr), "1"(out_ptr), "2"(out_c8), "3"(out_h), "4"(out_w), "5"(wino_h), + "6"(wino_w) + : "cc", "memory", "v2", "v3", "a0", "a1", "s1", "t0", "t1", "t2", "t3", "t4", "t5" ); } @@ -224,26 +206,24 @@ void csi_c906_crop_output_pack8to1_fp16(const __fp16 *output_trans, __fp16 *outp kernel before: [O I 3*3] kernel after : [O/8 8*8 I 8] */ -void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; __fp16 *kernel_data = (__fp16 *)o_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - __fp16 *kernel_tm = (__fp16 *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); // kernel transform matrix: G - const __fp16 ktm[8][3] = { - {1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f 
/ 180}, + {0.0f, 0.0f, 1.0f}}; // const __fp16 ktm[8][3] = { // {1.0f, 0.0f, 0.0f}, @@ -256,13 +236,12 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const __fp16* kernel0 = kernel_data + p * inch * 9 + q * 9; - __fp16* kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; // transform kernel const __fp16 *k0 = kernel0; @@ -272,7 +251,6 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor // h : first compute the transport matrix tmp = (g * GT)T __fp16 tmp[8][3]; for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -280,20 +258,20 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor // U for (int j = 0; j < 8; j++) { - __fp16* tmpp = &tmp[j][0]; + __fp16 *tmpp = &tmp[j][0]; for (int i = 0; i < 8; i++) { - kernel_tmp[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } // optimized layout for winograd64 - __fp16 *kernel_tm_pack8 = (__fp16 *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + __fp16 *kernel_tm_pack8 = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); t_kernel->data = kernel_tm_pack8; for (int oc = 0; oc < outch / 8; oc++) { - __fp16 *g0 = kernel_tm_pack8 + oc * 64 * inch * 8; const __fp16 *k0 = kernel_tm + oc * 64 * inch * 8; @@ -306,13 +284,10 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor 
const __fp16 *k7 = k6 + 64 * inch; for (int k = 0; k < 64; k++) { - __fp16 *g00 = g0 + k * inch * 8; for (int ic = 0; ic < inch / 8; ic++) { - for (int i = 0; i < 8; i++) { - const __fp16 *k00 = k0 + (ic * 8 + i) * 64; const __fp16 *k10 = k1 + (ic * 8 + i) * 64; const __fp16 *k20 = k2 + (ic * 8 + i) * 64; @@ -337,22 +312,20 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } - /* constrain: output channel % 8 = 0 input channel % 8 = 0 */ -int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd64_pack8_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { // uint64_t start_time, end_time; - // start_time = csi_get_timespec(); + // start_time = shl_get_timespec(); __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -366,7 +339,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -385,28 +358,31 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = block_h * 6 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 6 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int 
padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(out_c * sizeof(__fp16)); + bias_data = (__fp16 *)shl_mem_alloc(out_c * sizeof(__fp16)); } - for(int n = 0; n < batch; n++) { - + for (int n = 0; n < batch; n++) { // pad buffer: [in_c/8 h w 8] - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); // pad input - csi_c906_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); input_data += input_size; // input transform buffer1: [in_ch/8, 64, blocks, 8] - __fp16 *input_tm1_buf = (__fp16 *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); + __fp16 *input_tm1_buf = + (__fp16 *)shl_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); /****************************** transform input *****************************/ /* @@ -427,23 +403,26 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, int tiles = block_h * block_w; - #pragma omp parallel for num_threads(1) - for(int q = 0; q < in_c / 8; q++) { - - __fp16 *img0 = input_padd_buf + q * padded_in_h * padded_in_w * 8; // feature map after padding - q channel - __fp16 *img0_tm = input_tm1_buf + q * 64 * tiles * 8; // transform and interleave - q channel +#pragma omp parallel for num_threads(1) + for (int q = 0; q < in_c / 8; q++) { + __fp16 *img0 = input_padd_buf + q * padded_in_h * padded_in_w * + 8; // feature map after padding - q channel + __fp16 *img0_tm = + input_tm1_buf 
+ q * 64 * tiles * 8; // transform and interleave - q channel - __fp16 *tmp = (__fp16 *)csi_mem_alloc(8 * 8 * 8 * sizeof(__fp16)); + __fp16 *tmp = (__fp16 *)shl_mem_alloc(8 * 8 * 8 * sizeof(__fp16)); // __fp16 tmp[512] = {0.0}; // ?????? - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - - __fp16 *r0 = img0 + (i * padded_in_w * 6 + j * 6) * 8; // feature map after padding 8*8 start addr - __fp16 *r0_tm = img0_tm + (i * block_w + j) * 8; // input_tm1 8*8 block start addr - - __fp16 ratio[] = {5.25, -4.25, 0.25, -1.25, 4.0, 0.5, -2.5, 2.0}; // note: in fact cannot be output constrain + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { + __fp16 *r0 = img0 + (i * padded_in_w * 6 + j * 6) * + 8; // feature map after padding 8*8 start addr + __fp16 *r0_tm = + img0_tm + (i * block_w + j) * 8; // input_tm1 8*8 block start addr + + __fp16 ratio[] = { + 5.25, -4.25, 0.25, -1.25, + 4.0, 0.5, -2.5, 2.0}; // note: in fact cannot be output constrain __fp16 *ratio_ptr = ratio; asm volatile( @@ -452,91 +431,96 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "mv t5, %2\n\t" // t5 = tmp start addr "slli t1, %4, 4\n\t" // t1 = padded_in_w * 8 * 2bytes - "flh fa0, 0(%3)\n\t" // fa0 = 5.25 - "flh fa1, 2(%3)\n\t" // fa1 = -4.25 - "flh fa2, 4(%3)\n\t" // fa2 = 0.25 - "flh fa3, 6(%3)\n\t" // fa3 = -1.25 - "flh fa4, 8(%3)\n\t" // fa4 = 4.0 - "flh fa5, 10(%3)\n\t" // fa5 = 0.5 - "flh fa6, 12(%3)\n\t" // fa6 = -2.5 - "flh fa7, 14(%3)\n\t" // fa7 = 2.0 - - "1:\n\t" - "mv s1, %0\n\t" // s1 = r00 addr - - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 128\n\t" // tmp[1][m] - "addi a2, a1, 128\n\t" // tmp[2][m] - "addi a3, a2, 128\n\t" // tmp[3][m] - "addi a4, a3, 128\n\t" // tmp[4][m] - "addi a5, a4, 128\n\t" // tmp[5][m] - "addi a6, a5, 128\n\t" // tmp[6][m] - "addi a7, a6, 128\n\t" // tmp[7][m] - - "vle.v v0, (s1)\n\t" // r00 + "flh fa0, 0(%3)\n\t" // fa0 = 5.25 + "flh fa1, 2(%3)\n\t" // fa1 = -4.25 + "flh fa2, 
4(%3)\n\t" // fa2 = 0.25 + "flh fa3, 6(%3)\n\t" // fa3 = -1.25 + "flh fa4, 8(%3)\n\t" // fa4 = 4.0 + "flh fa5, 10(%3)\n\t" // fa5 = 0.5 + "flh fa6, 12(%3)\n\t" // fa6 = -2.5 + "flh fa7, 14(%3)\n\t" // fa7 = 2.0 + + "1:\n\t" + "mv s1, %0\n\t" // s1 = r00 addr + + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 128\n\t" // tmp[1][m] + "addi a2, a1, 128\n\t" // tmp[2][m] + "addi a3, a2, 128\n\t" // tmp[3][m] + "addi a4, a3, 128\n\t" // tmp[4][m] + "addi a5, a4, 128\n\t" // tmp[5][m] + "addi a6, a5, 128\n\t" // tmp[6][m] + "addi a7, a6, 128\n\t" // tmp[7][m] + + "vle.v v0, (s1)\n\t" // r00 "addi s1, s1, 16\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "addi s1, s1, 16\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "addi s1, s1, 16\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "addi s1, s1, 16\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "addi s1, s1, 16\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "addi s1, s1, 16\n\t" - "vle.v v6, (s1)\n\t" // r06 + "vle.v v6, (s1)\n\t" // r06 "addi s1, s1, 16\n\t" - "vle.v v7, (s1)\n\t" // r07 + "vle.v v7, (s1)\n\t" // r07 "addi s1, s1, 16\n\t" "vmv.v.v v10, v6\n\t" //--------------------------------------------- - "vfsub.vv v8, v4, v2\n\t" // r04 - r02 - "vfsub.vv v9, v3, v5\n\t" // r03 - r05 + "vfsub.vv v8, v4, v2\n\t" // r04 - r02 + "vfsub.vv v9, v3, v5\n\t" // r03 - r05 - "vfsub.vv v24, v0, v6\n\t" // r00 - r06 - "vfsub.vv v31, v7, v1\n\t" // r07 - r01 + "vfsub.vv v24, v0, v6\n\t" // r00 - r06 + "vfsub.vv v31, v7, v1\n\t" // r07 - r01 - "vfmacc.vf v10, fa2, v2\n\t" // r06 + r02 * 0.25f + "vfmacc.vf v10, fa2, v2\n\t" // r06 + r02 * 0.25f - "vfmul.vf v11, v1, fa5\n\t" // r01 * 0.5f - "vfmul.vf v12, v1, fa7\n\t" // r01 * 2.0f + "vfmul.vf v11, v1, fa5\n\t" // r01 * 0.5f + "vfmul.vf v12, v1, fa7\n\t" // r01 * 2.0f - "vfmacc.vf v24, fa0, v8\n\t" // r00 - r06 + 5.25 * (r04 - r02) = tmp[0][m] - "vfmacc.vf v31, fa0, v9\n\t" // r07 - r01 + 5.25 * 
(r03 - r05) = tmp[7][m] + "vfmacc.vf v24, fa0, v8\n\t" // r00 - r06 + 5.25 * (r04 - r02) = + // tmp[0][m] + "vfmacc.vf v31, fa0, v9\n\t" // r07 - r01 + 5.25 * (r03 - r05) = + // tmp[7][m] //--------------------------------------------- - "vfadd.vv v8, v2, v6\n\t" // r02 + r06 - "vfadd.vv v9, v1, v5\n\t" // r01 + r05 + "vfadd.vv v8, v2, v6\n\t" // r02 + r06 + "vfadd.vv v9, v1, v5\n\t" // r01 + r05 - "vfmacc.vf v11, fa6, v3\n\t" // r01 * 0.5f - r03 * 2.5f - "vfmacc.vf v12, fa6, v3\n\t" // r01 * 2.f - r03 * 2.5f + "vfmacc.vf v11, fa6, v3\n\t" // r01 * 0.5f - r03 * 2.5f + "vfmacc.vf v12, fa6, v3\n\t" // r01 * 2.f - r03 * 2.5f - "vfmacc.vf v2, fa3, v4\n\t" // r02 - r04 * 1.25f 注意 - "vfmacc.vf v10, fa3, v4\n\t" // r06 + r02 * 0.25f - r04 * 1.25f = tmp34a + "vfmacc.vf v2, fa3, v4\n\t" // r02 - r04 * 1.25f 注意 + "vfmacc.vf v10, fa3, v4\n\t" // r06 + r02 * 0.25f - r04 * 1.25f = + // tmp34a - "vfmacc.vf v8, fa1, v4\n\t" // r02 + r06 - r04 * 4.25f = tmp12a - "vfmacc.vf v9, fa1, v3\n\t" // r01 + r05 - r03 * 4.25f = tmp12b + "vfmacc.vf v8, fa1, v4\n\t" // r02 + r06 - r04 * 4.25f = tmp12a + "vfmacc.vf v9, fa1, v3\n\t" // r01 + r05 - r03 * 4.25f = tmp12b - "vfmacc.vf v11, fa7, v5\n\t" // r01 * 0.5f - r03 * 2.5f + r05 * 2.0 = tmp34b - "vfmacc.vf v12, fa5, v5\n\t" // r01 * 2.f - r03 * 2.5f + r05 * 0.5 = tmp56b + "vfmacc.vf v11, fa7, v5\n\t" // r01 * 0.5f - r03 * 2.5f + r05 * 2.0 = + // tmp34b + "vfmacc.vf v12, fa5, v5\n\t" // r01 * 2.f - r03 * 2.5f + r05 * 0.5 = + // tmp56b "vse.v v24, (a0)\n\t" "vse.v v31, (a7)\n\t" - "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = tmp[1][m] - "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = tmp[2][m] + "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = tmp[1][m] + "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = tmp[2][m] //--------------------------------------------- - "vfmacc.vf v6, fa4, v2\n\t" // r06 + (r02 - r04 * 1.25f) * 4 = tmp56a + "vfmacc.vf v6, fa4, v2\n\t" // r06 + (r02 - r04 * 1.25f) * 4 = tmp56a "vfadd.vv v27, v10, v11\n\t" // tmp34a + 
tmp34b = tmp[3][m] "vfsub.vv v28, v10, v11\n\t" // tmp34a - tmp34b = tmp[4][m] - "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = tmp[5][m] - "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = tmp[6][m] + "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = tmp[5][m] + "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = tmp[6][m] "vse.v v25, (a1)\n\t" "vse.v v26, (a2)\n\t" @@ -547,96 +531,102 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //--------------------------------------------- - "add %0, %0, t1\n\t" // padding feature map 8*8 next line addr - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "add %0, %0, t1\n\t" // padding feature map 8*8 next line addr + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 8\n\t" // m = 8 + "mv t5, %2\n\t" // tmp start addr + "li t0, 8\n\t" // m = 8 - "slli t1, %5, 4\n\t" // t1 = tiles * 8 * 2 bytes - "slli t2, %5, 7\n\t" // t2 = tiles * 8 * 8 * 2 bytes + "slli t1, %5, 4\n\t" // t1 = tiles * 8 * 2 bytes + "slli t2, %5, 7\n\t" // t2 = tiles * 8 * 8 * 2 bytes - "3:\n\t" + "3:\n\t" - "mv a0, %1\n\t" // r0_tm_0 - "add a1, a0, t1\n\t" // r0_tm_1 - "add a2, a1, t1\n\t" // r0_tm_2 - "add a3, a2, t1\n\t" // r0_tm_3 - "add a4, a3, t1\n\t" // r0_tm_4 - "add a5, a4, t1\n\t" // r0_tm_5 - "add a6, a5, t1\n\t" // r0_tm_6 - "add a7, a6, t1\n\t" // r0_tm_7 + "mv a0, %1\n\t" // r0_tm_0 + "add a1, a0, t1\n\t" // r0_tm_1 + "add a2, a1, t1\n\t" // r0_tm_2 + "add a3, a2, t1\n\t" // r0_tm_3 + "add a4, a3, t1\n\t" // r0_tm_4 + "add a5, a4, t1\n\t" // r0_tm_5 + "add a6, a5, t1\n\t" // r0_tm_6 + "add a7, a6, t1\n\t" // r0_tm_7 - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" 
// tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" - "vle.v v6, (t5)\n\t" // tmp[m][6] + "vle.v v6, (t5)\n\t" // tmp[m][6] "addi t5, t5, 16\n\t" - "vle.v v7, (t5)\n\t" // tmp[m][7] + "vle.v v7, (t5)\n\t" // tmp[m][7] "addi t5, t5, 16\n\t" "vmv.v.v v10, v6\n\t" //--------------------------------------------- - "vfsub.vv v8, v4, v2\n\t" // tmp04 - tmp02 (tmp[m][4] - tmp[m][2]) - "vfsub.vv v9, v3, v5\n\t" // tmp03 - tmp05 + "vfsub.vv v8, v4, v2\n\t" // tmp04 - tmp02 (tmp[m][4] - tmp[m][2]) + "vfsub.vv v9, v3, v5\n\t" // tmp03 - tmp05 - "vfsub.vv v24, v0, v6\n\t" // tmp00 - tmp06 - "vfsub.vv v31, v7, v1\n\t" // tmp07 - tmp01 + "vfsub.vv v24, v0, v6\n\t" // tmp00 - tmp06 + "vfsub.vv v31, v7, v1\n\t" // tmp07 - tmp01 - "vfmacc.vf v10, fa2, v2\n\t" // tmp06 + tmp02 * 0.25f + "vfmacc.vf v10, fa2, v2\n\t" // tmp06 + tmp02 * 0.25f - "vfmul.vf v11, v1, fa5\n\t" // tmp01 * 0.5f - "vfmul.vf v12, v1, fa7\n\t" // tmp01 * 2.0f + "vfmul.vf v11, v1, fa5\n\t" // tmp01 * 0.5f + "vfmul.vf v12, v1, fa7\n\t" // tmp01 * 2.0f - "vfmacc.vf v24, fa0, v8\n\t" // tmp00 - tmp06 + 5.25 * (tmp04 - tmp02) = r0_tm_0[m] - "vfmacc.vf v31, fa0, v9\n\t" // tmp07 - tmp01 + 5.25 * (tmp03 - tmp05) = r0_tm_7[m] + "vfmacc.vf v24, fa0, v8\n\t" // tmp00 - tmp06 + 5.25 * (tmp04 - tmp02) + // = r0_tm_0[m] + "vfmacc.vf v31, fa0, v9\n\t" // tmp07 - tmp01 + 5.25 * (tmp03 - tmp05) + // = r0_tm_7[m] //--------------------------------------------- - "vfadd.vv v8, v2, v6\n\t" // tmp02 + tmp06 - "vfadd.vv v9, v1, v5\n\t" // tmp01 + tmp05 + "vfadd.vv v8, v2, v6\n\t" // tmp02 + tmp06 + "vfadd.vv v9, v1, v5\n\t" // tmp01 + tmp05 - "vfmacc.vf v11, fa6, v3\n\t" // tmp01 * 0.5f - tmp03 * 2.5f - "vfmacc.vf v12, fa6, v3\n\t" // tmp01 * 2.f - tmp03 * 2.5f + "vfmacc.vf v11, fa6, v3\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + 
"vfmacc.vf v12, fa6, v3\n\t" // tmp01 * 2.f - tmp03 * 2.5f - "vfmacc.vf v2, fa3, v4\n\t" // tmp02 - tmp04 * 1.25f - "vfmacc.vf v10, fa3, v4\n\t" // tmp06 + tmp02 * 0.25f - tmp04 * 1.25f = tmp34a + "vfmacc.vf v2, fa3, v4\n\t" // tmp02 - tmp04 * 1.25f + "vfmacc.vf v10, fa3, v4\n\t" // tmp06 + tmp02 * 0.25f - tmp04 * 1.25f + // = tmp34a - "vfmacc.vf v8, fa1, v4\n\t" // tmp02 + tmp06 - tmp04 * 4.25f = tmp12a - "vfmacc.vf v9, fa1, v3\n\t" // tmp01 + tmp05 - tmp03 * 4.25f = tmp12b + "vfmacc.vf v8, fa1, v4\n\t" // tmp02 + tmp06 - tmp04 * 4.25f = tmp12a + "vfmacc.vf v9, fa1, v3\n\t" // tmp01 + tmp05 - tmp03 * 4.25f = tmp12b - "vfmacc.vf v11, fa7, v5\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + tmp05 * 2.0 = tmp34b - "vfmacc.vf v12, fa5, v5\n\t" // tmp01 * 2.f - tmp03 * 2.5f + tmp05 * 0.5 = tmp56b + "vfmacc.vf v11, fa7, v5\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + tmp05 + // * 2.0 = tmp34b + "vfmacc.vf v12, fa5, v5\n\t" // tmp01 * 2.f - tmp03 * 2.5f + tmp05 * + // 0.5 = tmp56b "vse.v v24, (a0)\n\t" "vse.v v31, (a7)\n\t" - "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = r0_tm_1[m] - "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = r0_tm_2[m] + "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = r0_tm_1[m] + "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = r0_tm_2[m] //--------------------------------------------- - "vfmacc.vf v6, fa4, v2\n\t" // tmp06 + (tmp02 - tmp04 * 1.25f) * 4 = tmp56a + "vfmacc.vf v6, fa4, v2\n\t" // tmp06 + (tmp02 - tmp04 * 1.25f) * 4 = + // tmp56a "vfadd.vv v27, v10, v11\n\t" // tmp34a + tmp34b = r0_tm_3[m] "vfsub.vv v28, v10, v11\n\t" // tmp34a - tmp34b = r0_tm_4[m] - "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = r0_tm_5[m] - "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = r0_tm_6[m] + "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = r0_tm_5[m] + "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = r0_tm_6[m] "vse.v v25, (a1)\n\t" "vse.v v26, (a2)\n\t" @@ -650,32 +640,27 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" 
"bnez t0, 3b" - :"=r"(r0), // %0 - "=r"(r0_tm), // %1 - "=r"(tmp), // %2 - "=r"(ratio_ptr), // %3 - "=r"(padded_in_w), // %4 - "=r"(tiles) // %5 - :"0"(r0), - "1"(r0_tm), - "2"(tmp), - "3"(ratio_ptr), - "4"(padded_in_w), - "5"(tiles) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7" - ); - + : "=r"(r0), // %0 + "=r"(r0_tm), // %1 + "=r"(tmp), // %2 + "=r"(ratio_ptr), // %3 + "=r"(padded_in_w), // %4 + "=r"(tiles) // %5 + : "0"(r0), "1"(r0_tm), "2"(tmp), "3"(ratio_ptr), "4"(padded_in_w), + "5"(tiles) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", + "a5", "a6", "a7", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7"); } } - csi_mem_free(tmp); + shl_mem_free(tmp); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); /*********************************** dot ***************************************/ // reorder input_tm1_buf - __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); #pragma omp parallel for num_threads(1) for (int r = 0; r < 64; r++) { @@ -683,7 +668,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, int t = 0; for (; t + 7 < tiles; t += 8) { - __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data + __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data __fp16 *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 8; @@ -707,12 +692,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //----------------------------- asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "slli t1, 
%2, 10\n\t" // 64 * tiles * 8 * 2 bytes - "srai t2, %3, 3\n\t" // in_ch8 + "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes + "srai t2, %3, 3\n\t" // in_ch8 - "1:\n\t" // in_ch loop8 + "1:\n\t" // in_ch loop8 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -738,17 +723,13 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "a0", "t1", "t2" - ); + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "a0", "t1", + "t2"); } for (; t + 3 < tiles; t += 4) { __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -769,12 +750,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes - "srai t2, %3, 3\n\t" // in_ch8 + "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes + "srai t2, %3, 3\n\t" // in_ch8 - "1:\n\t" // in_ch loop8 + "1:\n\t" // in_ch loop8 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -792,18 +773,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", - "a0", "t1", "t2" - ); - + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "a0", "t1", "t2"); } for (; t + 1 
< tiles; t += 2) { __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -821,12 +796,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes - "srai t2, %3, 3\n\t" // in_ch8 + "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes + "srai t2, %3, 3\n\t" // in_ch8 - "1:\n\t" // in_ch loop8 + "1:\n\t" // in_ch loop8 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -840,18 +815,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", - "a0", "t1", "t2" - ); - + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "a0", "t1", "t2"); } for (; t < tiles; t++) { __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -868,12 +837,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes - "srai t2, %3, 3\n\t" // in_ch8 + "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes + "srai t2, %3, 3\n\t" // in_ch8 - "1:\n\t" // in_ch loop8 + "1:\n\t" // in_ch loop8 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -886,28 +855,23 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", - "a0", "t1", "t2" - ); + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // 
%3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "a0", "t1", "t2"); } } - csi_mem_free(input_tm1_buf); + shl_mem_free(input_tm1_buf); // output_dot_buf: [out_c/8, 64, blocks, 8] - __fp16 *output_dot_buf = (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int p = 0; p < out_c / 8; p++) { - __fp16 *output0_tm = output_dot_buf + p * 64 * tiles * 8; __fp16 *kernel0_tm = kernel_data + p * 64 * in_c * 8; @@ -921,7 +885,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" @@ -930,9 +894,9 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "vmv.v.x v4, zero\n\t" "vmv.v.x v5, zero\n\t" "vmv.v.x v6, zero\n\t" - "vmv.v.x v7, zero\n\t" // clear + "vmv.v.x v7, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "vle.v v8, (%1)\n\t" "addi %1, %1, 16\n\t" @@ -959,34 +923,31 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v4, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v5, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v6, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v7, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" + "vse.v v0, (%2)\n\t" + 
"addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v4, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v5, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v6, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v7, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" ); } @@ -996,13 +957,13 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" "vmv.v.x v2, zero\n\t" - "vmv.v.x v3, zero\n\t" // clear + "vmv.v.x v3, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "vle.v v4, (%1)\n\t" "addi %1, %1, 16\n\t" @@ -1021,25 +982,22 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", + "t0"); } for (; 
t + 1 < tiles; t += 2) { __fp16 *r0 = img_tm2 + t * in_c; @@ -1047,11 +1005,11 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" - "vmv.v.x v1, zero\n\t" // clear + "vmv.v.x v1, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "vle.v v2, (%1)\n\t" "addi %1, %1, 16\n\t" @@ -1066,21 +1024,17 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0"); } for (; t < tiles; t++) { __fp16 *r0 = img_tm2 + t * in_c; @@ -1088,10 +1042,10 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c= - "vmv.v.x v0, zero\n\t" // clear + "mv t0, %3\n\t" // t0 = in_c= + "vmv.v.x v0, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "vle.v v1, (%1)\n\t" "addi %1, %1, 16\n\t" @@ -1104,30 +1058,24 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "fa0", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + : "=r"(r0), // %0 + "=r"(k0), // %1 + 
"=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "fa0", "t0"); } - } - } - csi_mem_free(input_tm2_buf); + shl_mem_free(input_tm2_buf); /*************************** transform output ****************************/ // output_tm1_buf: [out_c/8, out_h6, out_w6, 8] - __fp16 *output_tm1_buf = (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); /* AT = { @@ -1148,26 +1096,25 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, }; */ - #pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / 8; p++) - { - +#pragma omp parallel for num_threads(1) + for (int p = 0; p < out_c / 8; p++) { __fp16 *bias_tmp = bias_data + p * 8; - __fp16 *out0_tm = output_dot_buf + p * 64 * block_h * block_w * 8; // 输出转换前/dot后 第p个channel - __fp16 *out0 = output_tm1_buf + p * 6*block_h * 6*block_w * 8; // 转换后输出 第p个channel + __fp16 *out0_tm = + output_dot_buf + p * 64 * block_h * block_w * 8; // 输出转换前/dot后 第p个channel + __fp16 *out0 = + output_tm1_buf + p * 6 * block_h * 6 * block_w * 8; // 转换后输出 第p个channel - __fp16 *tmp1 = (__fp16 *)csi_mem_alloc(6 * 8 * 8 * sizeof(__fp16)); + __fp16 *tmp1 = (__fp16 *)shl_mem_alloc(6 * 8 * 8 * sizeof(__fp16)); // __fp16 tmp[6][8][8]; int out_w6 = block_w * 6; for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { + __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * 8; // 8*8 起始地址 - __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * 8; // 8*8 起始地址 - - __fp16 *output0 = out0 + (i * block_w * 6 * 6 + j * 6) * 8; // 输出 6*6 的起始地址 + __fp16 *output0 = + out0 + (i * block_w * 6 * 6 + j * 6) * 8; // 输出 6*6 的起始地址 __fp16 ratio[] = {2.0, 4.0, 8.0, 16.0, 32.0}; __fp16 *ratio_ptr = ratio; @@ -1179,65 +1126,66 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "slli t1, %4, 4\n\t" // t1 = tiles * 8 * 
2 "slli t2, %4, 7\n\t" // t2 = tiles * 8 * 8 * 2 bytes - "flh fa0, 0(%3)\n\t" // fa0 = 2 - "flh fa1, 2(%3)\n\t" // fa1 = 4 - "flh fa2, 4(%3)\n\t" // fa2 = 8 - "flh fa3, 6(%3)\n\t" // fa3 = 16 - "flh fa4, 8(%3)\n\t" // fa4 = 32 + "flh fa0, 0(%3)\n\t" // fa0 = 2 + "flh fa1, 2(%3)\n\t" // fa1 = 4 + "flh fa2, 4(%3)\n\t" // fa2 = 8 + "flh fa3, 6(%3)\n\t" // fa3 = 16 + "flh fa4, 8(%3)\n\t" // fa4 = 32 "mv s1, %0\n\t" - "1:\n\t" // shape : [6 * 8] * [8 * 8] = [6 * 8] + "1:\n\t" // shape : [6 * 8] * [8 * 8] = [6 * 8] - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 128\n\t" // tmp[1][m] - "addi a2, a1, 128\n\t" // tmp[2][m] - "addi a3, a2, 128\n\t" // tmp[3][m] - "addi a4, a3, 128\n\t" // tmp[4][m] - "addi a5, a4, 128\n\t" // tmp[5][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 128\n\t" // tmp[1][m] + "addi a2, a1, 128\n\t" // tmp[2][m] + "addi a3, a2, 128\n\t" // tmp[3][m] + "addi a4, a3, 128\n\t" // tmp[4][m] + "addi a5, a4, 128\n\t" // tmp[5][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "add s1, s1, t1\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "add s1, s1, t1\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "add s1, s1, t1\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "add s1, s1, t1\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "add s1, s1, t1\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "add s1, s1, t1\n\t" - "vle.v v6, (s1)\n\t" // r06 + "vle.v v6, (s1)\n\t" // r06 "add s1, s1, t1\n\t" - "vle.v v7, (s1)\n\t" // r07 + "vle.v v7, (s1)\n\t" // r07 "add s1, s1, t1\n\t" //--------------------------------------------- - "vfadd.vv v8, v1, v2\n\t" // r01 + r02 = tmp024a - "vfsub.vv v9, v1, v2\n\t" // r01 - r02 = tmp135a + "vfadd.vv v8, v1, v2\n\t" // r01 + r02 = tmp024a + "vfsub.vv v9, v1, v2\n\t" // r01 - r02 = tmp135a - "vfadd.vv v10, v3, v4\n\t" // r03 + r04 = tmp024b - "vfsub.vv v11, v3, v4\n\t" // r03 - r04 = tmp135b + "vfadd.vv v10, v3, v4\n\t" // r03 + 
r04 = tmp024b + "vfsub.vv v11, v3, v4\n\t" // r03 - r04 = tmp135b - "vfadd.vv v12, v5, v6\n\t" // r05 + r06 = tmp024c - "vfsub.vv v13, v5, v6\n\t" // r05 - r06 = tmp135c + "vfadd.vv v12, v5, v6\n\t" // r05 + r06 = tmp024c + "vfsub.vv v13, v5, v6\n\t" // r05 - r06 = tmp135c - "vfadd.vv v0, v0, v8\n\t" // r00 + tmp024a - "vfadd.vv v7, v7, v9\n\t" // r07 + tmp135a - "vmv.v.v v14, v10\n\t" // v14 = tmp024b + "vfadd.vv v0, v0, v8\n\t" // r00 + tmp024a + "vfadd.vv v7, v7, v9\n\t" // r07 + tmp135a + "vmv.v.v v14, v10\n\t" // v14 = tmp024b - "vmv.v.v v26, v8\n\t" // v26 = tmp024a - "vmv.v.v v28, v8\n\t" // v28 = tmp024a + "vmv.v.v v26, v8\n\t" // v26 = tmp024a + "vmv.v.v v28, v8\n\t" // v28 = tmp024a "vfmacc.vf v26, fa1, v10\n\t" // tmp024a + tmp024b * 4 "vfmacc.vf v14, fa4, v12\n\t" // tmp024b + tmp024c * 32 "vfmacc.vf v28, fa3, v10\n\t" // tmp024a + tmp024b * 16 - "vmv.v.v v15, v13\n\t" // v15 = tmp135c - "vmv.v.v v25, v9\n\t" // v25 = tmp135a - "vmv.v.v v27, v9\n\t" // v27 = tmp135a - "vfadd.vv v24, v0, v14\n\t" // r00 + tmp024a + tmp024b + tmp024c * 32 = tmp[0][m] + "vmv.v.v v15, v13\n\t" // v15 = tmp135c + "vmv.v.v v25, v9\n\t" // v25 = tmp135a + "vmv.v.v v27, v9\n\t" // v27 = tmp135a + "vfadd.vv v24, v0, v14\n\t" // r00 + tmp024a + tmp024b + tmp024c * 32 + // = tmp[0][m] "vfmacc.vf v25, fa0, v11\n\t" // tmp135a + tmp135b * 2 "vfmacc.vf v27, fa2, v11\n\t" // tmp135a + tmp135b * 8 @@ -1245,8 +1193,10 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //--------------------------------------------- "vse.v v24, (a0)\n\t" - "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = tmp[2][m] - "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + tmp024c = tmp[4][m] + "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = + // tmp[2][m] + "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + + // tmp024c = tmp[4][m] "vfmacc.vf v15, fa4, v11\n\t" // tmp135b * 32 + tmp135c "vse.v v26, (a2)\n\t" 
@@ -1254,28 +1204,30 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //--------------------------------------------- - "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 = tmp[1][m] - "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = tmp[3][m] + "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 + // = tmp[1][m] + "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = + // tmp[3][m] - "vfadd.vv v29, v7, v15\n\t" // r07 + tmp135a + tmp135b * 32 + tmp135c + "vfadd.vv v29, v7, v15\n\t" // r07 + tmp135a + tmp135b * 32 + tmp135c "vse.v v25, (a1)\n\t" "vse.v v27, (a3)\n\t" "vse.v v29, (a5)\n\t" - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 6\n\t" // m = 6 - "slli t1, %5, 4\n\t" // t1 = out_w6 * 8 * 2bytes - "vle.v v16, (%6)\n\t" // load 8 channel bias data + "mv t5, %2\n\t" // tmp start addr + "li t0, 6\n\t" // m = 6 + "slli t1, %5, 4\n\t" // t1 = out_w6 * 8 * 2bytes + "vle.v v16, (%6)\n\t" // load 8 channel bias data - "3:\n\t" // shape : [6 * 8] * [6 * 8] = [6 * 6] + "3:\n\t" // shape : [6 * 8] * [6 * 8] = [6 * 6] "mv a0, %1\n\t" "addi a1, a0, 16\n\t" @@ -1284,48 +1236,49 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi a4, a3, 16\n\t" "addi a5, a4, 16\n\t" - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // 
tmp[m][5] "addi t5, t5, 16\n\t" - "vle.v v6, (t5)\n\t" // tmp[m][6] + "vle.v v6, (t5)\n\t" // tmp[m][6] "addi t5, t5, 16\n\t" - "vle.v v7, (t5)\n\t" // tmp[m][7] + "vle.v v7, (t5)\n\t" // tmp[m][7] "addi t5, t5, 16\n\t" //--------------------------------------------- - "vfadd.vv v8, v1, v2\n\t" // tmp[m][1] + tmp[m][2] = tmp024a - "vfsub.vv v9, v1, v2\n\t" // tmp[m][1] - tmp[m][2] = tmp135a + "vfadd.vv v8, v1, v2\n\t" // tmp[m][1] + tmp[m][2] = tmp024a + "vfsub.vv v9, v1, v2\n\t" // tmp[m][1] - tmp[m][2] = tmp135a - "vfadd.vv v10, v3, v4\n\t" // tmp[m][3] + tmp[m][4] = tmp024b - "vfsub.vv v11, v3, v4\n\t" // tmp[m][3] - tmp[m][4] = tmp135b + "vfadd.vv v10, v3, v4\n\t" // tmp[m][3] + tmp[m][4] = tmp024b + "vfsub.vv v11, v3, v4\n\t" // tmp[m][3] - tmp[m][4] = tmp135b - "vfadd.vv v12, v5, v6\n\t" // tmp[m][5] + tmp[m][6] = tmp024c - "vfsub.vv v13, v5, v6\n\t" // tmp[m][5] - tmp[m][6] = tmp135c + "vfadd.vv v12, v5, v6\n\t" // tmp[m][5] + tmp[m][6] = tmp024c + "vfsub.vv v13, v5, v6\n\t" // tmp[m][5] - tmp[m][6] = tmp135c - "vfadd.vv v0, v0, v8\n\t" // tmp[m][0] + tmp024a - "vfadd.vv v7, v7, v9\n\t" // tmp[m][7] + tmp135a - "vmv.v.v v14, v10\n\t" // v14 = tmp024b + "vfadd.vv v0, v0, v8\n\t" // tmp[m][0] + tmp024a + "vfadd.vv v7, v7, v9\n\t" // tmp[m][7] + tmp135a + "vmv.v.v v14, v10\n\t" // v14 = tmp024b - "vmv.v.v v26, v8\n\t" // v26 = tmp024a - "vmv.v.v v28, v8\n\t" // v28 = tmp024a + "vmv.v.v v26, v8\n\t" // v26 = tmp024a + "vmv.v.v v28, v8\n\t" // v28 = tmp024a "vfmacc.vf v26, fa1, v10\n\t" // tmp024a + tmp024b * 4 "vfmacc.vf v14, fa4, v12\n\t" // tmp024b + tmp024c * 32 "vfmacc.vf v28, fa3, v10\n\t" // tmp024a + tmp024b * 16 - "vmv.v.v v15, v13\n\t" // v15 = tmp135c - "vmv.v.v v25, v9\n\t" // v25 = tmp135a - "vmv.v.v v27, v9\n\t" // v27 = tmp135a - "vfadd.vv v24, v0, v14\n\t" // tmp[m][0] + tmp024a + tmp024b + tmp024c * 32 = tmp[0][m] + "vmv.v.v v15, v13\n\t" // v15 = tmp135c + "vmv.v.v v25, v9\n\t" // v25 = tmp135a + "vmv.v.v v27, v9\n\t" // v27 = tmp135a + 
"vfadd.vv v24, v0, v14\n\t" // tmp[m][0] + tmp024a + tmp024b + + // tmp024c * 32 = tmp[0][m] "vfmacc.vf v25, fa0, v11\n\t" // tmp135a + tmp135b * 2 "vfmacc.vf v27, fa2, v11\n\t" // tmp135a + tmp135b * 8 @@ -1333,19 +1286,24 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //--------------------------------------------- "vfadd.vv v24, v24, v16\n\t" // + bias - "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = tmp[2][m] - "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + tmp024c = tmp[4][m] + "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = + // tmp[2][m] + "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + + // tmp024c = tmp[4][m] "vfmacc.vf v15, fa4, v11\n\t" // tmp135b * 32 + tmp135c "vse.v v24, (a0)\n\t" - "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 = tmp[1][m] - "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = tmp[3][m] + "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 + // = tmp[1][m] + "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = + // tmp[3][m] "vfadd.vv v26, v26, v16\n\t" // + bias "vfadd.vv v28, v28, v16\n\t" // + bias - "vfadd.vv v29, v7, v15\n\t" // tmp[m][7] + tmp135a + tmp135b * 32 + tmp135c + "vfadd.vv v29, v7, v15\n\t" // tmp[m][7] + tmp135a + tmp135b * 32 + + // tmp135c "vse.v v26, (a2)\n\t" "vse.v v28, (a4)\n\t" @@ -1365,71 +1323,64 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - :"=r"(output0_tm_0), // %0 - "=r"(output0), // %1 - "=r"(tmp1), // %2 - "=r"(ratio_ptr), // %3 - "=r"(tiles), // %4 - "=r"(out_w6), // %5 - "=r"(bias_tmp) // %6 - :"0"(output0_tm_0), - "1"(output0), - "2"(tmp1), - "3"(ratio_ptr), - "4"(tiles), - "5"(out_w6), - "6"(bias_tmp) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", "v25", 
"v26", "v27", "v28", "v29", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", - "fa0", "fa1", "fa2", "fa3", "fa4" - ); + : "=r"(output0_tm_0), // %0 + "=r"(output0), // %1 + "=r"(tmp1), // %2 + "=r"(ratio_ptr), // %3 + "=r"(tiles), // %4 + "=r"(out_w6), // %5 + "=r"(bias_tmp) // %6 + : "0"(output0_tm_0), "1"(output0), "2"(tmp1), "3"(ratio_ptr), "4"(tiles), + "5"(out_w6), "6"(bias_tmp) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", "v25", + "v26", "v27", "v28", "v29", "t0", "t1", "t2", "t5", "s1", "a0", "a1", + "a2", "a3", "a4", "a5", "fa0", "fa1", "fa2", "fa3", "fa4"); } } - csi_mem_free(tmp1); + shl_mem_free(tmp1); } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); + shl_c906_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); output_data += output_size; - csi_mem_free(output_tm1_buf); + shl_mem_free(output_tm1_buf); } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } -void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; __fp16 *kernel_data = (__fp16 *)o_kernel->data; // for kernel transform buf, 3x3 --> 6x6 - __fp16 *kernel_tm = (__fp16 *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); // kernel transform matrix: G - const __fp16 ktm[6][3] = { - { 1.0f/4, 
0.0f, 0.0f}, - { -1.0f/6, -1.0f/6, -1.0f/6}, - { -1.0f/6, 1.0f/6, -1.0f/6}, - { 1.0f/24, 1.0f/12, 1.0f/6}, - { 1.0f/24, -1.0f/12, 1.0f/6}, - { 0.0f, 0.0f, 1.0f} - }; + const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const __fp16* kernel0 = kernel_data + p * inch * 9 + q * 9; - __fp16* kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; // transform kernel const __fp16 *k0 = kernel0; @@ -1439,7 +1390,6 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor // h : first compute the transport matrix tmp = (g * GT)T __fp16 tmp[6][3]; for (int i = 0; i < 6; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -1447,21 +1397,21 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor // U for (int j = 0; j < 6; j++) { - __fp16* tmpp = &tmp[j][0]; + __fp16 *tmpp = &tmp[j][0]; for (int i = 0; i < 6; i++) { - kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } // [O, I, 6, 6] --> [O/4, 6*6, I, 4] - __fp16 *kernel_tm_pack4 = (__fp16 *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + __fp16 *kernel_tm_pack4 = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); t_kernel->data = kernel_tm_pack4; for (int oc = 0; oc < outch / 8; oc++) { - __fp16 *g0 = kernel_tm_pack4 + 
oc * 36 * inch * 8; const __fp16 *k0 = kernel_tm + oc * 36 * inch * 8; @@ -1474,13 +1424,10 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor const __fp16 *k7 = k6 + 36 * inch; for (int k = 0; k < 36; k++) { - __fp16 *g00 = g0 + k * inch * 8; for (int ic = 0; ic < inch / 8; ic++) { - for (int i = 0; i < 8; i++) { - const __fp16 *k00 = k0 + (ic * 8 + i) * 36; const __fp16 *k10 = k1 + (ic * 8 + i) * 36; const __fp16 *k20 = k2 + (ic * 8 + i) * 36; @@ -1505,14 +1452,13 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } -int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd43_pack8_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -1526,7 +1472,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -1545,29 +1491,31 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, int block_h = (out_h + 3) / 4; int block_w = (out_w + 3) / 4; - int padded_in_h = block_h * 4 + 2; // block * 4 for alignment with 4,kernel = 3 * 3, stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 4 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3, stride = 1,thus input_size + 2 int padded_in_w = block_w * 4 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int 
padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(out_c * sizeof(__fp16)); + bias_data = (__fp16 *)shl_mem_alloc(out_c * sizeof(__fp16)); } - - for(int n = 0; n < batch; n++) { - + for (int n = 0; n < batch; n++) { // pad buffer: [in_c/4 h w 4] - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); // pad input - csi_c906_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); input_data += input_size; // input transform buffer1: [in_ch/4, 36, blocks, 6] - __fp16 *input_tm1_buf = (__fp16 *)csi_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); + __fp16 *input_tm1_buf = + (__fp16 *)shl_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); /****************************** transform input *****************************/ /* @@ -1583,22 +1531,24 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, int tiles = block_h * block_w; - #pragma omp parallel for num_threads(1) - for(int q = 0; q < in_c / 4; q++) { - - __fp16 *img0 = input_padd_buf + q * padded_in_h * padded_in_w * 8; // feature map after padding - q channel - __fp16 *img0_tm = input_tm1_buf + q * 36 * tiles * 8; // transform and interleave - q channel - - __fp16 *tmp = (__fp16 *)csi_mem_alloc(6 * 6 * 8 * sizeof(__fp16)); - - for(int i = 0; i < block_h; i++) { +#pragma omp parallel for num_threads(1) + for (int q = 0; q < in_c / 4; q++) { + __fp16 *img0 = input_padd_buf + 
q * padded_in_h * padded_in_w * + 8; // feature map after padding - q channel + __fp16 *img0_tm = + input_tm1_buf + q * 36 * tiles * 8; // transform and interleave - q channel - for(int j = 0; j < block_w; j++) { + __fp16 *tmp = (__fp16 *)shl_mem_alloc(6 * 6 * 8 * sizeof(__fp16)); - __fp16 *r0 = img0 + (i * padded_in_w * 4 + j * 4) * 8; // feature map after padding 6*6 start addr - __fp16 *r0_tm = img0_tm + (i * block_w + j) * 8; // input_tm1 6*6 block start addr + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { + __fp16 *r0 = img0 + (i * padded_in_w * 4 + j * 4) * + 8; // feature map after padding 6*6 start addr + __fp16 *r0_tm = + img0_tm + (i * block_w + j) * 8; // input_tm1 6*6 block start addr - __fp16 ratio[] = {4, -4, 2, -2, -5}; // note: in fact cannot be output constrain + __fp16 ratio[] = {4, -4, 2, -2, + -5}; // note: in fact cannot be output constrain __fp16 *ratio_ptr = ratio; asm volatile( @@ -1607,139 +1557,140 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "mv t5, %2\n\t" // t5 = tmp start addr "slli t1, %4, 4\n\t" // t1 = padded_in_w * 8 * 2 bytes - "flh fa0, 0(%3)\n\t" // fa0 = 4 - "flh fa1, 2(%3)\n\t" // fa1 = -4 - "flh fa2, 4(%3)\n\t" // fa2 = 2 - "flh fa3, 6(%3)\n\t" // fa3 = -2 - "flh fa4, 8(%3)\n\t" // fa4 = -5 + "flh fa0, 0(%3)\n\t" // fa0 = 4 + "flh fa1, 2(%3)\n\t" // fa1 = -4 + "flh fa2, 4(%3)\n\t" // fa2 = 2 + "flh fa3, 6(%3)\n\t" // fa3 = -2 + "flh fa4, 8(%3)\n\t" // fa4 = -5 - "1:\n\t" - "mv s1, %0\n\t" // s1 = r00 addr + "1:\n\t" + "mv s1, %0\n\t" // s1 = r00 addr - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 96\n\t" // tmp[1][m] - "addi a2, a1, 96\n\t" // tmp[2][m] - "addi a3, a2, 96\n\t" // tmp[3][m] - "addi a4, a3, 96\n\t" // tmp[4][m] - "addi a5, a4, 96\n\t" // tmp[5][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 96\n\t" // tmp[1][m] + "addi a2, a1, 96\n\t" // tmp[2][m] + "addi a3, a2, 96\n\t" // tmp[3][m] + "addi a4, a3, 96\n\t" // tmp[4][m] + "addi a5, a4, 96\n\t" // 
tmp[5][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "addi s1, s1, 16\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "addi s1, s1, 16\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "addi s1, s1, 16\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "addi s1, s1, 16\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "addi s1, s1, 16\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "addi s1, s1, 16\n\t" "vmv.v.v v24, v4\n\t" "vmv.v.v v29, v5\n\t" //--------------------------------------------- - "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 - "vfmacc.vf v24, fa4, v2\n\t" // r04 + 4 * r00 - 5 * r02 + "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 + "vfmacc.vf v24, fa4, v2\n\t" // r04 + 4 * r00 - 5 * r02 "vse.v v24, (a0)\n\t" //--------------------------------------------- - "vfadd.vv v25, v3, v4\n\t" // r03 + r04 - "vfadd.vv v6, v1, v2\n\t" // r01 + r02 - "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) + "vfadd.vv v25, v3, v4\n\t" // r03 + r04 + "vfadd.vv v6, v1, v2\n\t" // r01 + r02 + "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) "vse.v v25, (a1)\n\t" //--------------------------------------------- - "vfsub.vv v26, v4, v3\n\t" // r04 - r03 - "vfsub.vv v7, v1, v2\n\t" // r01 - r02 - "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) + "vfsub.vv v26, v4, v3\n\t" // r04 - r03 + "vfsub.vv v7, v1, v2\n\t" // r01 - r02 + "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) "vse.v v26, (a2)\n\t" //--------------------------------------------- - "vfsub.vv v8, v1, v3\n\t" // r01 - r03 - "vfsub.vv v27, v4, v2\n\t" // r04 - r02 - "vfsub.vv v28, v4, v2\n\t" // r04 - r02 + "vfsub.vv v8, v1, v3\n\t" // r01 - r03 + "vfsub.vv v27, v4, v2\n\t" // r04 - r02 + "vfsub.vv v28, v4, v2\n\t" // r04 - r02 - "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) + "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) "vse.v v27, (a3)\n\t" - 
"vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) + "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 - "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 + "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 + "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 "vse.v v29, (a5)\n\t" //--------------------------------------------- - "add %0, %0, t1\n\t" // padding feature map 6*6 next line addr - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "add %0, %0, t1\n\t" // padding feature map 6*6 next line addr + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 6\n\t" // m = 6 + "mv t5, %2\n\t" // tmp start addr + "li t0, 6\n\t" // m = 6 - "slli t1, %5, 4\n\t" // t1 = tiles * 8 * 2 bytes - "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 8 channels * 2 bytes + "slli t1, %5, 4\n\t" // t1 = tiles * 8 * 2 bytes + "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 8 channels * 2 + // bytes - "3:\n\t" + "3:\n\t" - "mv a0, %1\n\t" // r0_tm_0 - "add a1, a0, t1\n\t" // r0_tm_1 - "add a2, a1, t1\n\t" // r0_tm_2 - "add a3, a2, t1\n\t" // r0_tm_3 - "add a4, a3, t1\n\t" // r0_tm_4 - "add a5, a4, t1\n\t" // r0_tm_5 + "mv a0, %1\n\t" // r0_tm_0 + "add a1, a0, t1\n\t" // r0_tm_1 + "add a2, a1, t1\n\t" // r0_tm_2 + "add a3, a2, t1\n\t" // r0_tm_3 + "add a4, a3, t1\n\t" // r0_tm_4 + "add a5, a4, t1\n\t" // r0_tm_5 - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" 
// tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" "vmv.v.v v24, v4\n\t" "vmv.v.v v29, v5\n\t" //--------------------------------------------- - "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 - "vfmacc.vf v24, fa4, v2\n\t" // r04 * 4 * r00 - 5 * r02 + "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 + "vfmacc.vf v24, fa4, v2\n\t" // r04 * 4 * r00 - 5 * r02 "vse.v v24, (a0)\n\t" //--------------------------------------------- - "vfadd.vv v25, v3, v4\n\t" // r03 + r04 - "vfadd.vv v6, v1, v2\n\t" // r01 + r02 - "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) + "vfadd.vv v25, v3, v4\n\t" // r03 + r04 + "vfadd.vv v6, v1, v2\n\t" // r01 + r02 + "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) "vse.v v25, (a1)\n\t" //--------------------------------------------- - "vfsub.vv v26, v4, v3\n\t" // r04 - r03 - "vfsub.vv v7, v1, v2\n\t" // r01 - r02 - "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) + "vfsub.vv v26, v4, v3\n\t" // r04 - r03 + "vfsub.vv v7, v1, v2\n\t" // r01 - r02 + "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) "vse.v v26, (a2)\n\t" //--------------------------------------------- - "vfsub.vv v8, v1, v3\n\t" // r01 - r03 - "vfsub.vv v27, v4, v2\n\t" // r04 - r02 - "vfsub.vv v28, v4, v2\n\t" // r04 - r02 + "vfsub.vv v8, v1, v3\n\t" // r01 - r03 + "vfsub.vv v27, v4, v2\n\t" // r04 - r02 + "vfsub.vv v28, v4, v2\n\t" // r04 - r02 - "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) + "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) "vse.v v27, (a3)\n\t" - "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) + "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 - "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 + "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 + "vfmacc.vf v29, fa4, v3\n\t" 
// r05 + 4 * r01 - 5 * r03 "vse.v v29, (a5)\n\t" //--------------------------------------------- @@ -1749,35 +1700,29 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - - :"=r"(r0), // %0 - "=r"(r0_tm), // %1 - "=r"(tmp), // %2 - "=r"(ratio_ptr), // %3 - "=r"(padded_in_w), // %4 - "=r"(tiles) // %5 - :"0"(r0), - "1"(r0_tm), - "2"(tmp), - "3"(ratio_ptr), - "4"(padded_in_w), - "5"(tiles) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "v26", "v27", "v28", "v29", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5" - ); - + : "=r"(r0), // %0 + "=r"(r0_tm), // %1 + "=r"(tmp), // %2 + "=r"(ratio_ptr), // %3 + "=r"(padded_in_w), // %4 + "=r"(tiles) // %5 + : "0"(r0), "1"(r0_tm), "2"(tmp), "3"(ratio_ptr), "4"(padded_in_w), + "5"(tiles) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v24", "v25", "v26", "v27", "v28", "v29", "t0", "t1", "t2", "t5", "s1", + "a0", "a1", "a2", "a3", "a4", "a5", "fa0", "fa1", "fa2", "fa3", "fa4", + "fa5"); } } - csi_mem_free(tmp); + shl_mem_free(tmp); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); /*********************************** dot ***************************************/ // reorder input_tm1_buf - __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int r = 0; r < 36; r++) { __fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data @@ -1834,7 +1779,6 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, } tm1 += 36 * tiles * 8; } - } for (; t < tiles; t++) { __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -1851,16 +1795,16 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct 
csi_tensor *input, } } - csi_mem_free(input_tm1_buf); + shl_mem_free(input_tm1_buf); // output_dot_buf: [out_c/4, 36, blocks, 4] - __fp16 *output_dot_buf = (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int p = 0; p < out_c / 8; p++) { - - __fp16 *output0_tm = output_dot_buf + p * 36 * tiles * 8; // 8 channel dot output - __fp16 *kernel0_tm = kernel_data + p * 36 * in_c * 8; // 8 channel kernel + __fp16 *output0_tm = output_dot_buf + p * 36 * tiles * 8; // 8 channel dot output + __fp16 *kernel0_tm = kernel_data + p * 36 * in_c * 8; // 8 channel kernel for (int r = 0; r < 36; r++) { __fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel @@ -1872,7 +1816,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" @@ -1881,9 +1825,9 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "vmv.v.x v4, zero\n\t" "vmv.v.x v5, zero\n\t" "vmv.v.x v6, zero\n\t" - "vmv.v.x v7, zero\n\t" // clear + "vmv.v.x v7, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flh fa0, (%0)\n\t" "flh fa1, 2(%0)\n\t" @@ -1910,34 +1854,31 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v4, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v5, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v6, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v7, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), 
// %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v4, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v5, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v6, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v7, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" ); } @@ -1947,13 +1888,13 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" "vmv.v.x v2, zero\n\t" - "vmv.v.x v3, zero\n\t" // clear + "vmv.v.x v3, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flh fa0, (%0)\n\t" "flh fa1, 2(%0)\n\t" @@ -1972,25 +1913,22 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, 
%2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", + "t0"); } for (; t + 1 < tiles; t += 2) { __fp16 *r0 = img_tm2 + t * in_c; @@ -1998,11 +1936,11 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" - "vmv.v.x v1, zero\n\t" // clear + "vmv.v.x v1, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flh fa0, (%0)\n\t" "flh fa1, 2(%0)\n\t" @@ -2017,21 +1955,17 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0"); } for (; t < tiles; t++) { __fp16 *r0 = img_tm2 + t * in_c; @@ -2039,10 +1973,10 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c - "vmv.v.x v0, zero\n\t" // clear + "mv t0, %3\n\t" // t0 = in_c + "vmv.v.x v0, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "addi %0, %0, 2\n\t" @@ -2055,30 +1989,24 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - 
"=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "fa0", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "fa0", "t0"); } - } - } - csi_mem_free(input_tm2_buf); + shl_mem_free(input_tm2_buf); /*************************** transform output ****************************/ // output_tm1_buf: [out_c/4, out_h4, out_w4, 4] - __fp16 *output_tm1_buf = (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(__fp16)); + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(__fp16)); /* AT = { @@ -2089,124 +2017,124 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, }; */ - #pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / 8; p++) - { - +#pragma omp parallel for num_threads(1) + for (int p = 0; p < out_c / 8; p++) { __fp16 *bias_tmp = bias_data + p * 8; - __fp16 *out0_tm = output_dot_buf + p * 36 * block_h * block_w * 8; // 输出转换前/dot后 第p个channel - __fp16 *out0 = output_tm1_buf + p * 4*block_h * 4*block_w * 8; // 转换后输出 第p个channel + __fp16 *out0_tm = + output_dot_buf + p * 36 * block_h * block_w * 8; // 输出转换前/dot后 第p个channel + __fp16 *out0 = + output_tm1_buf + p * 4 * block_h * 4 * block_w * 8; // 转换后输出 第p个channel - __fp16 *tmp1 = (__fp16 *)csi_mem_alloc(4 * 6 * 8 * sizeof(__fp16)); + __fp16 *tmp1 = (__fp16 *)shl_mem_alloc(4 * 6 * 8 * sizeof(__fp16)); int out_w4 = block_w * 4; for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { + __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * 8; // 6*6 起始地址 - __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * 8; // 6*6 起始地址 - - __fp16 *output0 = out0 + (i * block_w * 4 * 4 + j * 4) * 8; // 输出 4*4 的起始地址 + __fp16 *output0 = + out0 + (i * block_w * 4 * 4 + j * 4) * 8; // 输出 4*4 的起始地址 
__fp16 ratio[] = {2.0, 4.0, 8.0}; __fp16 *ratio_ptr = ratio; asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "li t0, 6\n\t" // m = 6 - "mv t5, %2\n\t" // t5 = tmp start addr - "slli t1, %4, 4\n\t" // t1 = tiles * 8 * 2 - "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 8 channels * 2 bytes + "li t0, 6\n\t" // m = 6 + "mv t5, %2\n\t" // t5 = tmp start addr + "slli t1, %4, 4\n\t" // t1 = tiles * 8 * 2 + "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 8 channels * 2 + // bytes - "flh fa0, 0(%3)\n\t" // fa0 = 2 - "flh fa1, 2(%3)\n\t" // fa1 = 4 - "flh fa2, 4(%3)\n\t" // fa2 = 8 + "flh fa0, 0(%3)\n\t" // fa0 = 2 + "flh fa1, 2(%3)\n\t" // fa1 = 4 + "flh fa2, 4(%3)\n\t" // fa2 = 8 "mv s1, %0\n\t" - "1:\n\t" // shape : [4 * 6] * [6 * 6] = [4 * 6] + "1:\n\t" // shape : [4 * 6] * [6 * 6] = [4 * 6] - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 96\n\t" // tmp[1][m] - "addi a2, a1, 96\n\t" // tmp[2][m] - "addi a3, a2, 96\n\t" // tmp[3][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 96\n\t" // tmp[1][m] + "addi a2, a1, 96\n\t" // tmp[2][m] + "addi a3, a2, 96\n\t" // tmp[3][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "add s1, s1, t1\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "add s1, s1, t1\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "add s1, s1, t1\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "add s1, s1, t1\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "add s1, s1, t1\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "add s1, s1, t1\n\t" //--------------------------------------------- - "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a - "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a + "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a + "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a - "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b - "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b - "vmv.v.v v25, v6\n\t" // v25 = tmp13a + "vfadd.vv v7, v3, v4\n\t" // r03 + 
r04 = tmp02b + "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b + "vmv.v.v v25, v6\n\t" // v25 = tmp13a //--------------------------------------------- - "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a - "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b + "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a + "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b "vse.v v24, (a0)\n\t" - "vfmacc.vf v25, fa0, v8\n\t" // tmp13a + 2 * tmp13b + "vfmacc.vf v25, fa0, v8\n\t" // tmp13a + 2 * tmp13b "vse.v v25, (a1)\n\t" - "vfmacc.vf v26, fa1, v7\n\t" // tmp02a + 4 * tmp02b + "vfmacc.vf v26, fa1, v7\n\t" // tmp02a + 4 * tmp02b "vse.v v26, (a2)\n\t" - "vfadd.vv v27, v5, v6\n\t" // r05 + tmp13a - "vfmacc.vf v27, fa2, v8\n\t" // r05 + tmp13a * 8 tmp13b + "vfadd.vv v27, v5, v6\n\t" // r05 + tmp13a + "vfmacc.vf v27, fa2, v8\n\t" // r05 + tmp13a * 8 tmp13b "vse.v v27, (a3)\n\t" //--------------------------------------------- - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 4\n\t" // m = 4 - "slli t1, %5, 4\n\t" // t1 = out_w4 * 8 * 2 bytes - "vle.v v16, (%6)\n\t" // load 8 channel bias data + "mv t5, %2\n\t" // tmp start addr + "li t0, 4\n\t" // m = 4 + "slli t1, %5, 4\n\t" // t1 = out_w4 * 8 * 2 bytes + "vle.v v16, (%6)\n\t" // load 8 channel bias data - "3:\n\t" // shape : [4 * 6] * [6 * 4] = [4 * 4] + "3:\n\t" // shape : [4 * 6] * [6 * 4] = [4 * 4] "mv a0, %1\n\t" "addi a1, a0, 16\n\t" "addi a2, a1, 16\n\t" "addi a3, a2, 16\n\t" - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // 
tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" //--------------------------------------------- - "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a - "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a + "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a + "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a - "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b - "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b - "vmv.v.v v25, v6\n\t" // v25 = tmp13a + "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b + "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b + "vmv.v.v v25, v6\n\t" // v25 = tmp13a //--------------------------------------------- "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b @@ -2231,59 +2159,49 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - :"=r"(output0_tm_0), // %0 - "=r"(output0), // %1 - "=r"(tmp1), // %2 - "=r"(ratio_ptr), // %3 - "=r"(tiles), // %4 - "=r"(out_w4), // %5 - "=r"(bias_tmp) // %6 - :"0"(output0_tm_0), - "1"(output0), - "2"(tmp1), - "3"(ratio_ptr), - "4"(tiles), - "5"(out_w4), - "6"(bias_tmp) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v24", "v25", "v26", "v27", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", - "fa0", "fa1", "fa2" - ); + : "=r"(output0_tm_0), // %0 + "=r"(output0), // %1 + "=r"(tmp1), // %2 + "=r"(ratio_ptr), // %3 + "=r"(tiles), // %4 + "=r"(out_w4), // %5 + "=r"(bias_tmp) // %6 + : "0"(output0_tm_0), "1"(output0), "2"(tmp1), "3"(ratio_ptr), "4"(tiles), + "5"(out_w4), "6"(bias_tmp) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v16", "v24", "v25", "v26", "v27", "t0", "t1", "t2", "t5", "s1", "a0", + "a1", "a2", "a3", "fa0", "fa1", "fa2"); } } - csi_mem_free(tmp1); + shl_mem_free(tmp1); } - csi_mem_free(output_dot_buf); + 
shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 4, block_w * 4); + shl_c906_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); output_data += output_size; - csi_mem_free(output_tm1_buf); + shl_mem_free(output_tm1_buf); } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } - -void csi_c906_conv3x3s1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +void shl_c906_conv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { /* to do */ } -void csi_c906_conv3x3s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +void shl_c906_conv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { /* to do */ } diff --git a/source/c906_opt/convolution_3x3_fp32.c b/source/c906_opt/convolution_3x3_fp32.c index 56218cd4..9e31258d 100644 --- a/source/c906_opt/convolution_3x3_fp32.c +++ b/source/c906_opt/convolution_3x3_fp32.c @@ -16,8 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - +/* CSI-NN2 version 2.0.x */ /* the conditions for using winograd convolution @@ -27,34 +26,27 @@ input_width <= 120 */ -#include "csi_c906.h" - +#include "shl_c906.h" -void csi_c906_conv3x3s1_winograd23_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd23_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 4x4 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 4 * 4 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 4 * 4 * sizeof(float)); // kernel transform matrix: G - const float ktm[4][3] = { - {1, 0, 0}, - {0.5, 0.5, 0.5}, - {0.5, -0.5, 0.5}, - {0, 0, 1} - }; - - csi_tensor_copy(t_kernel, o_kernel); + const float ktm[4][3] = {{1, 0, 0}, {0.5, 0.5, 0.5}, {0.5, -0.5, 0.5}, {0, 0, 1}}; + + csinn_tensor_copy(t_kernel, o_kernel); t_kernel->data = kernel_tm; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 16 + q * 16; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 16 + q * 16; // transform kernel const float *k0 = kernel0; @@ -64,7 +56,6 @@ void csi_c906_conv3x3s1_winograd23_transform_kernel(struct csi_tensor *o_kernel, // h : first compute the transport matrix tmp = (g * GT)T // tmp = G * gT float tmp[4][3]; for (int i = 0; i < 4; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -72,21 +63,20 @@ void csi_c906_conv3x3s1_winograd23_transform_kernel(struct csi_tensor 
*o_kernel, // U for (int j = 0; j < 4; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 4; i++) { - kernel_tm0[i * 4 + j] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[i * 4 + j] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } } -int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd23(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -100,7 +90,7 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -119,19 +109,23 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, int block_h = (out_h + 1) / 2; int block_w = (out_w + 1) / 2; - int padded_in_h = block_h * 2 + 2; // block * 2 for alignment with 2,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 2 + + 2; // block * 2 for alignment with 2,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 2 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel // buffer addr - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); - float *input_trans_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 4 * 4 * sizeof(float)); - float *output_trans_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 2 * 2 * 
sizeof(float)); - - for(int n = 0; n < batch; n++) { + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_trans_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 4 * 4 * sizeof(float)); + float *output_trans_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 2 * 2 * sizeof(float)); + for (int n = 0; n < batch; n++) { // pad input - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, + pad_top, pad_left); input_data += input_size; // transform input @@ -148,20 +142,17 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, const int tiles = block_h * block_w; - for(int q = 0; q < in_c; q++) { - + for (int q = 0; q < in_c; q++) { const float *img0 = input_padd_buf + q * padded_in_h * padded_in_w; float *img0_tm = input_trans_buf + q * block_h * block_w * 4 * 4; float tmp[4][4]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *r0 = img0 + i * padded_in_w * 2 + j * 2; - for(int m = 0; m < 4; m++) { + for (int m = 0; m < 4; m++) { tmp[0][m] = r0[0] - r0[2]; tmp[1][m] = r0[1] + r0[2]; tmp[2][m] = r0[2] - r0[1]; @@ -174,8 +165,7 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, float *r0_tm_2 = r0_tm_1 + in_w_tm; float *r0_tm_3 = r0_tm_2 + in_w_tm; - for(int m = 0; m < 4; m++) { - + for (int m = 0; m < 4; m++) { const float *tmp0 = tmp[m]; r0_tm_0[m] = tmp0[0] - tmp0[2]; r0_tm_1[m] = tmp0[1] + tmp0[2]; @@ -187,11 +177,12 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, } // dot - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); - for(int i = 0; i < out_c; i++) 
{ - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int i = 0; i < out_c; i++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { float *input_0 = input_trans_buf + j * 4 * 4 * block_w + k * 4; float *input_1 = input_0 + block_w * 4; float *input_2 = input_1 + block_w * 4; @@ -202,12 +193,13 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, float *kernel_2 = kernel_1 + 4; float *kernel_3 = kernel_2 + 4; - float *output_0 = output_dot_buf + i * block_h * block_w * 16 + j * 16 * block_w + k * 4; + float *output_0 = + output_dot_buf + i * block_h * block_w * 16 + j * 16 * block_w + k * 4; float *output_1 = output_0 + block_w * 4; float *output_2 = output_1 + block_w * 4; float *output_3 = output_2 + block_w * 4; - for(int a = 0; a < in_c; a++) { + for (int a = 0; a < in_c; a++) { output_0[0] += input_0[0] * kernel_0[0]; output_0[1] += input_0[1] * kernel_0[1]; output_0[2] += input_0[2] * kernel_0[2]; @@ -249,18 +241,17 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, { 0 1 -1 1 } }; */ - for(int i = 0; i < out_c; i++) { - + for (int i = 0; i < out_c; i++) { const float bias = bias_data ? 
bias_data[i] : 0.f; const float *img1 = output_dot_buf + i * block_h * block_w * 4 * 4; float *img1_tm = output_trans_buf + i * block_h * block_w * 2 * 2; float tmp[2][4]; - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { const float *r1 = img1 + j * block_w * 4 * 4 + k * 4; - for(int m = 0; m < 4; m++) { + for (int m = 0; m < 4; m++) { tmp[0][m] = r1[0] + r1[1] + r1[2]; tmp[1][m] = r1[1] - r1[2] + r1[3]; r1 += block_w * 4; @@ -268,7 +259,7 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, float *r1_tm_0 = img1_tm + j * block_w * 2 * 2 + k * 2; float *r1_tm_1 = r1_tm_0 + block_w * 2; - for(int m = 0; m < 2; m++) { + for (int m = 0; m < 2; m++) { const float *tmp1 = tmp[m]; r1_tm_0[m] = tmp1[0] + tmp1[1] + tmp1[2] + bias; r1_tm_1[m] = tmp1[1] - tmp1[2] + tmp1[3] + bias; @@ -276,47 +267,43 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, } } } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 2, block_w * 2); + shl_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 2, + block_w * 2); output_data += output_size; } - csi_mem_free(input_padd_buf); - csi_mem_free(input_trans_buf); - csi_mem_free(output_trans_buf); + shl_mem_free(input_padd_buf); + shl_mem_free(input_trans_buf); + shl_mem_free(output_trans_buf); return CSINN_TRUE; } - - -void csi_c906_conv3x3s1_winograd43_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd43_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 6x6 - 
float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); // kernel transform matrix: G - const float ktm[6][3] = { - { 1.0f/4, 0.0f, 0.0f}, - { -1.0f/6, -1.0f/6, -1.0f/6}, - { -1.0f/6, 1.0f/6, -1.0f/6}, - { 1.0f/24, 1.0f/12, 1.0f/6}, - { 1.0f/24, -1.0f/12, 1.0f/6}, - { 0.0f, 0.0f, 1.0f} - }; - - csi_tensor_copy(t_kernel, o_kernel); + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(t_kernel, o_kernel); t_kernel->data = kernel_tm; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; // transform kernel const float *k0 = kernel0; @@ -326,7 +313,6 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel(struct csi_tensor *o_kernel, // h : first compute the transport matrix tmp = (g * GT)T float tmp[6][3]; for (int i = 0; i < 6; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -334,22 +320,20 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel(struct csi_tensor *o_kernel, // U for (int j = 0; j < 6; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 6; i++) { - kernel_tm0[i * 6 + j] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[i * 6 + j] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } - } -int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, - struct 
csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd43(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -363,7 +347,7 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -382,19 +366,23 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, int block_h = (out_h + 3) / 4; int block_w = (out_w + 3) / 4; - int padded_in_h = block_h * 4 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 4 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 4 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel // buffer addr - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); - float *input_trans_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(float)); - float *output_trans_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); - - for(int n = 0; n < batch; n++) { + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_trans_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *output_trans_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); + for (int n = 0; n < batch; n++) { // pad input - 
csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, + pad_top, pad_left); input_data += input_size; // transform input @@ -413,20 +401,17 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, const int tiles = block_h * block_w; - for(int q = 0; q < in_c; q++) { - + for (int q = 0; q < in_c; q++) { const float *img0 = input_padd_buf + q * padded_in_h * padded_in_w; float *img0_tm = input_trans_buf + q * block_h * block_w * 6 * 6; float tmp[6][6]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *r0 = img0 + i * padded_in_w * 4 + j * 4; - for(int m = 0; m < 6; m++) { + for (int m = 0; m < 6; m++) { tmp[0][m] = 4 * r0[0] - 5 * r0[2] + r0[4]; tmp[1][m] = r0[3] + r0[4] - 4 * r0[1] - 4 * r0[2]; tmp[2][m] = 4 * r0[1] + r0[4] - 4 * r0[2] - r0[3]; @@ -443,8 +428,7 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, float *r0_tm_4 = r0_tm_3 + in_w_tm; float *r0_tm_5 = r0_tm_4 + in_w_tm; - for(int m = 0; m < 6; m++) { - + for (int m = 0; m < 6; m++) { const float *tmp0 = tmp[m]; r0_tm_0[m] = 4 * tmp0[0] - 5 * tmp0[2] + tmp0[4]; r0_tm_1[m] = tmp0[3] + tmp0[4] - 4 * tmp0[1] - 4 * tmp0[2]; @@ -458,11 +442,12 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, } // dot - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); - for(int i = 0; i < out_c; i++) { - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int i = 0; i < out_c; i++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { float *input_0 = input_trans_buf + j * 6 * 6 * block_w + k * 6; float *input_1 = input_0 + block_w * 
6; float *input_2 = input_1 + block_w * 6; @@ -477,14 +462,15 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, float *kernel_4 = kernel_3 + 6; float *kernel_5 = kernel_4 + 6; - float *output_0 = output_dot_buf + i * block_h * block_w * 36 + j * 36 * block_w + k * 6; + float *output_0 = + output_dot_buf + i * block_h * block_w * 36 + j * 36 * block_w + k * 6; float *output_1 = output_0 + block_w * 6; float *output_2 = output_1 + block_w * 6; float *output_3 = output_2 + block_w * 6; float *output_4 = output_3 + block_w * 6; float *output_5 = output_4 + block_w * 6; - for(int a = 0; a < in_c; a++) { + for (int a = 0; a < in_c; a++) { output_0[0] += input_0[0] * kernel_0[0]; output_0[1] += input_0[1] * kernel_0[1]; output_0[2] += input_0[2] * kernel_0[2]; @@ -554,18 +540,17 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, { 0 1 -1 8 -8 1 } }; */ - for(int i = 0; i < out_c; i++) { - + for (int i = 0; i < out_c; i++) { const float bias = bias_data ? bias_data[i] : 0.f; const float *img1 = output_dot_buf + i * block_h * block_w * 6 * 6; float *img1_tm = output_trans_buf + i * block_h * block_w * 4 * 4; float tmp[4][6]; - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { const float *r1 = img1 + j * block_w * 6 * 6 + k * 6; - for(int m = 0; m < 6; m++) { + for (int m = 0; m < 6; m++) { tmp[0][m] = r1[0] + r1[1] + r1[2] + r1[3] + r1[4]; tmp[1][m] = r1[1] - r1[2] + 2 * r1[3] - 2 * r1[4]; tmp[2][m] = r1[1] + r1[2] + 4 * r1[3] + 4 * r1[4]; @@ -577,7 +562,7 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, float *r1_tm_2 = r1_tm_1 + block_w * 4; float *r1_tm_3 = r1_tm_2 + block_w * 4; - for(int m = 0; m < 4; m++) { + for (int m = 0; m < 4; m++) { const float *tmp1 = tmp[m]; r1_tm_0[m] = tmp1[0] + tmp1[1] + tmp1[2] + tmp1[3] + tmp1[4] + bias; r1_tm_1[m] = tmp1[1] - tmp1[2] + 2 * tmp1[3] - 2 * tmp1[4] + bias; @@ -587,38 +572,36 @@ int 
csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, } } } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 4, block_w * 4); + shl_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 4, + block_w * 4); output_data += output_size; } - csi_mem_free(input_padd_buf); - csi_mem_free(input_trans_buf); - csi_mem_free(output_trans_buf); + shl_mem_free(input_padd_buf); + shl_mem_free(input_trans_buf); + shl_mem_free(output_trans_buf); return CSINN_TRUE; } - -void csi_c906_conv3x3s1_winograd64_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd64_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); // kernel transform matrix: G - const float ktm[8][3] = { - {1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; // const float ktm[8][3] = { // {1.0f, 0.0f, 0.0f}, @@ -631,14 +614,13 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel(struct 
csi_tensor *o_kernel, // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); t_kernel->data = kernel_tm; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 64 + q * 64; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 64 + q * 64; // transform kernel const float *k0 = kernel0; @@ -648,7 +630,6 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel(struct csi_tensor *o_kernel, // h : first compute the transport matrix tmp = (g * GT)T float tmp[8][3]; for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -656,24 +637,21 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel(struct csi_tensor *o_kernel, // U for (int j = 0; j < 8; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 8; i++) { - kernel_tm0[i * 8 + j] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[i * 8 + j] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } - } -int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd64(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - float *input_data = (float *)input->data; float *output_data = (float *)output->data; float *kernel_data = (float *)params->conv_extra.kernel_tm->data; @@ -686,7 +664,7 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, int stride_w = params->stride_width; int 
dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -705,19 +683,23 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = block_h * 6 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 6 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel // buffer addr - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); - float *input_trans_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); - float *output_trans_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); - - for(int n = 0; n < batch; n++) { + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_trans_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *output_trans_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + for (int n = 0; n < batch; n++) { // pad input - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, + pad_top, pad_left); input_data += input_size; // transform input @@ -738,20 +720,17 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, const int tiles = block_h * block_w; - for(int q = 0; q < in_c; q++) { - + for (int q = 0; q < in_c; q++) { const float *img0 = input_padd_buf + q * 
padded_in_h * padded_in_w; float *img0_tm = input_trans_buf + q * block_h * block_w * 8 * 8; float tmp[8][8]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *r0 = img0 + i * padded_in_w * 6 + j * 6; - for(int m = 0; m < 8; m++) { + for (int m = 0; m < 8; m++) { tmp[0][m] = r0[0] - r0[6] + 5.25 * (r0[4] - r0[2]); tmp[7][m] = r0[7] - r0[1] + 5.25 * (r0[3] - r0[5]); @@ -773,11 +752,12 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, // tmp[0][m] = r0[0] - r0[6] + 5.25 * (r0[4] - r0[2]); // tmp[1][m] = r0[1] + r0[2] + r0[5] + r0[6] - 4.25 * (r0[3] + r0[4]); // tmp[2][m] = r0[2] - r0[1] + r0[6] - r0[5] + 4.25 * (r0[3] - r0[4]); - // tmp[3][m] = 0.5 * r0[1] + 0.25 * r0[2] - 2.5 * r0[3] - 1.25 * r0[4] + 2 * r0[5] + r0[6]; - // tmp[4][m] = 0.25 * r0[2] - 0.5 * r0[1] + 2.5 * r0[3] - 1.25 * r0[4] - 2 * r0[5] + r0[6]; - // tmp[5][m] = 2 * r0[1] + 4 * r0[2] - 2.5 * r0[3] - 5 * r0[4] + 0.5 * r0[5] + r0[6]; - // tmp[6][m] = 4 * r0[2] - 2 * r0[1] + 2.5 * r0[3] - 5 * r0[4] - 0.5 * r0[5] + r0[6]; - // tmp[7][m] = r0[7] - r0[1] + 5.25 * (r0[3] - r0[5]); + // tmp[3][m] = 0.5 * r0[1] + 0.25 * r0[2] - 2.5 * r0[3] - 1.25 * r0[4] + 2 * + // r0[5] + r0[6]; tmp[4][m] = 0.25 * r0[2] - 0.5 * r0[1] + 2.5 * r0[3] + // - 1.25 * r0[4] - 2 * r0[5] + r0[6]; tmp[5][m] = 2 * r0[1] + 4 * r0[2] + // - 2.5 * r0[3] - 5 * r0[4] + 0.5 * r0[5] + r0[6]; tmp[6][m] = 4 * r0[2] - + // 2 * r0[1] + 2.5 * r0[3] - 5 * r0[4] - 0.5 * r0[5] + r0[6]; tmp[7][m] = + // r0[7] - r0[1] + 5.25 * (r0[3] - r0[5]); r0 += padded_in_w; } @@ -791,8 +771,7 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, float *r0_tm_6 = r0_tm_5 + in_w_tm; float *r0_tm_7 = r0_tm_6 + in_w_tm; - for(int m = 0; m < 8; m++) { - + for (int m = 0; m < 8; m++) { const float *tmp0 = tmp[m]; r0_tm_0[m] = tmp0[0] - tmp0[6] + 5.25 * (tmp0[4] - tmp0[2]); @@ -813,27 +792,28 @@ int 
csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, r0_tm_5[m] = tmp56a + tmp56b; r0_tm_6[m] = tmp56a - tmp56b; - // r0_tm_0[m] = tmp0[0] - tmp0[6] + 5.25 * (tmp0[4] - tmp0[2]); - // r0_tm_1[m] = tmp0[1] + tmp0[2] + tmp0[5] + tmp0[6] - 4.25 * (tmp0[3] + tmp0[4]); - // r0_tm_2[m] = tmp0[2] - tmp0[1] + tmp0[6] - tmp0[5] + 4.25 * (tmp0[3] - tmp0[4]); - // r0_tm_3[m] = 0.5 * tmp0[1] + 0.25 * tmp0[2] - 2.5 * tmp0[3] - 1.25 * tmp0[4] + 2 * tmp0[5] + tmp0[6]; - // r0_tm_4[m] = 0.25 * tmp0[2] - 0.5 * tmp0[1] + 2.5 * tmp0[3] - 1.25 * tmp0[4] - 2 * tmp0[5] + tmp0[6]; - // r0_tm_5[m] = 2 * tmp0[1] + 4 * tmp0[2] - 2.5 * tmp0[3] - 5 * tmp0[4] + 0.5 * tmp0[5] + tmp0[6]; - // r0_tm_6[m] = 4 * tmp0[2] - 2 * tmp0[1] + 2.5 * tmp0[3] - 5 * tmp0[4] - 0.5 * tmp0[5] + tmp0[6]; - // r0_tm_7[m] = tmp0[7] - tmp0[1] + 5.25 * (tmp0[3] - tmp0[5]); - + // r0_tm_1[m] = tmp0[1] + tmp0[2] + tmp0[5] + tmp0[6] - 4.25 * (tmp0[3] + + // tmp0[4]); r0_tm_2[m] = tmp0[2] - tmp0[1] + tmp0[6] - tmp0[5] + 4.25 * + // (tmp0[3] - tmp0[4]); r0_tm_3[m] = 0.5 * tmp0[1] + 0.25 * tmp0[2] - 2.5 * + // tmp0[3] - 1.25 * tmp0[4] + 2 * tmp0[5] + tmp0[6]; r0_tm_4[m] = 0.25 * + // tmp0[2] - 0.5 * tmp0[1] + 2.5 * tmp0[3] - 1.25 * tmp0[4] - 2 * tmp0[5] + + // tmp0[6]; r0_tm_5[m] = 2 * tmp0[1] + 4 * tmp0[2] - 2.5 * tmp0[3] - 5 * + // tmp0[4] + 0.5 * tmp0[5] + tmp0[6]; r0_tm_6[m] = 4 * tmp0[2] - 2 * tmp0[1] + // + 2.5 * tmp0[3] - 5 * tmp0[4] - 0.5 * tmp0[5] + tmp0[6]; r0_tm_7[m] = + // tmp0[7] - tmp0[1] + 5.25 * (tmp0[3] - tmp0[5]); } } } } // dot - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); - for(int i = 0; i < out_c; i++) { - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int i = 0; i < out_c; i++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { float *input_0 = input_trans_buf + j * 8 * 8 * block_w + 
k * 8; float *input_1 = input_0 + block_w * 8; float *input_2 = input_1 + block_w * 8; @@ -852,7 +832,8 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, float *kernel_6 = kernel_5 + 8; float *kernel_7 = kernel_6 + 8; - float *output_0 = output_dot_buf + i * block_h * block_w * 64 + j * 64 * block_w + k * 8; + float *output_0 = + output_dot_buf + i * block_h * block_w * 64 + j * 64 * block_w + k * 8; float *output_1 = output_0 + block_w * 8; float *output_2 = output_1 + block_w * 8; float *output_3 = output_2 + block_w * 8; @@ -861,7 +842,7 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, float *output_6 = output_5 + block_w * 8; float *output_7 = output_6 + block_w * 8; - for(int a = 0; a < in_c; a++) { + for (int a = 0; a < in_c; a++) { output_0[0] += input_0[0] * kernel_0[0]; output_0[1] += input_0[1] * kernel_0[1]; output_0[2] += input_0[2] * kernel_0[2]; @@ -975,18 +956,17 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, { 0 1 -1 32 -32 1 -1 1 } }; */ - for(int i = 0; i < out_c; i++) { - + for (int i = 0; i < out_c; i++) { const float bias = bias_data ? 
bias_data[i] : 0.f; const float *img1 = output_dot_buf + i * block_h * block_w * 8 * 8; float *img1_tm = output_trans_buf + i * block_h * block_w * 6 * 6; float tmp[6][8]; - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { const float *r1 = img1 + j * block_w * 8 * 8 + k * 8; - for(int m = 0; m < 8; m++) { + for (int m = 0; m < 8; m++) { float tmp024a = r1[1] + r1[2]; float tmp135a = r1[1] - r1[2]; @@ -1004,13 +984,13 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4; tmp[5][m] = r1[7] + tmp135a + tmp135b * 32 + tmp135c; - // tmp[0][m] = r1[0] + r1[1] + r1[2] + r1[3] + r1[4] + r1[5] + r1[6]; - // tmp[1][m] = r1[1] - r1[2] + 2 * r1[3] - 2 * r1[4] + 0.5 * r1[5] - 0.5 * r1[6]; - // tmp[2][m] = r1[1] + r1[2] + 4 * r1[3] + 4 * r1[4] + 0.25 * r1[5] + 0.25 * r1[6]; - // tmp[3][m] = r1[1] - r1[2] + 8 * r1[3] - 8 * r1[4] + 0.125 * r1[5] - 0.125 * r1[6]; - // tmp[4][m] = r1[1] + r1[2] + 16 * r1[3] + 16 * r1[4] + 0.0625 * r1[5] + 0.0625 * r1[6]; - // tmp[5][m] = r1[1] - r1[2] + 32 * r1[3] - 32 * r1[4] + 0.03125 * r1[5] - 0.03125 * r1[6] + r1[7]; + // tmp[1][m] = r1[1] - r1[2] + 2 * r1[3] - 2 * r1[4] + 0.5 * r1[5] - 0.5 * + // r1[6]; tmp[2][m] = r1[1] + r1[2] + 4 * r1[3] + 4 * r1[4] + 0.25 * r1[5] + + // 0.25 * r1[6]; tmp[3][m] = r1[1] - r1[2] + 8 * r1[3] - 8 * r1[4] + 0.125 * + // r1[5] - 0.125 * r1[6]; tmp[4][m] = r1[1] + r1[2] + 16 * r1[3] + 16 * + // r1[4] + 0.0625 * r1[5] + 0.0625 * r1[6]; tmp[5][m] = r1[1] - r1[2] + 32 * + // r1[3] - 32 * r1[4] + 0.03125 * r1[5] - 0.03125 * r1[6] + r1[7]; r1 += block_w * 8; } @@ -1021,7 +1001,7 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, float *r1_tm_4 = r1_tm_3 + block_w * 6; float *r1_tm_5 = r1_tm_4 + block_w * 6; - for(int m = 0; m < 6; m++) { + for (int m = 0; m < 6; m++) { const float *tmp1 = tmp[m]; float tmp024a = tmp1[1] + tmp1[2]; @@ -1041,50 +1021,51 
@@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, r1_tm_3[m] = tmp135a + tmp135b * 8 + tmp135c * 4 + bias; r1_tm_5[m] = tmp1[7] + tmp135a + tmp135b * 32 + tmp135c + bias; - // r1_tm_0[m] = tmp1[0] + tmp1[1] + tmp1[2] + tmp1[3] + tmp1[4] + tmp1[5] + tmp1[6] + bias_data[i]; - // r1_tm_1[m] = tmp1[1] - tmp1[2] + 2 * tmp1[3] - 2 * tmp1[4] + 0.5 * tmp1[5] - 0.5 * tmp1[6] + bias_data[i]; - // r1_tm_2[m] = tmp1[1] + tmp1[2] + 4 * tmp1[3] + 4 * tmp1[4] + 0.25 * tmp1[5] + 0.25 * tmp1[6] + bias_data[i]; - // r1_tm_3[m] = tmp1[1] - tmp1[2] + 8 * tmp1[3] - 8 * tmp1[4] + 0.125 * tmp1[5] - 0.125 * tmp1[6] + bias_data[i]; - // r1_tm_4[m] = tmp1[1] + tmp1[2] + 16 * tmp1[3] + 16 * tmp1[4] + 0.0625 * tmp1[5] + 0.0625 * tmp1[6] + bias_data[i]; - // r1_tm_5[m] = tmp1[1] - tmp1[2] + 32 * tmp1[3] - 32 * tmp1[4] + 0.03125 * tmp1[5] - 0.03125 * tmp1[6] + tmp1[7] + bias_data[i]; - + // r1_tm_0[m] = tmp1[0] + tmp1[1] + tmp1[2] + tmp1[3] + tmp1[4] + tmp1[5] + + // tmp1[6] + bias_data[i]; r1_tm_1[m] = tmp1[1] - tmp1[2] + 2 * tmp1[3] - 2 + // * tmp1[4] + 0.5 * tmp1[5] - 0.5 * tmp1[6] + bias_data[i]; r1_tm_2[m] = + // tmp1[1] + tmp1[2] + 4 * tmp1[3] + 4 * tmp1[4] + 0.25 * tmp1[5] + 0.25 * + // tmp1[6] + bias_data[i]; r1_tm_3[m] = tmp1[1] - tmp1[2] + 8 * tmp1[3] - 8 + // * tmp1[4] + 0.125 * tmp1[5] - 0.125 * tmp1[6] + bias_data[i]; r1_tm_4[m] + // = tmp1[1] + tmp1[2] + 16 * tmp1[3] + 16 * tmp1[4] + 0.0625 * tmp1[5] + + // 0.0625 * tmp1[6] + bias_data[i]; r1_tm_5[m] = tmp1[1] - tmp1[2] + 32 * + // tmp1[3] - 32 * tmp1[4] + 0.03125 * tmp1[5] - 0.03125 * tmp1[6] + tmp1[7] + // + bias_data[i]; } } } } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); + shl_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 6, + block_w * 6); output_data += output_size; } - 
csi_mem_free(input_padd_buf); - csi_mem_free(input_trans_buf); - csi_mem_free(output_trans_buf); + shl_mem_free(input_padd_buf); + shl_mem_free(input_trans_buf); + shl_mem_free(output_trans_buf); return CSINN_TRUE; } - // reference by ncnn -void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd64_transform_kernel_1(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); // kernel transform matrix: G - const float ktm[8][3] = { - {1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; // const float ktm[8][3] = { // {1.0f, 0.0f, 0.0f}, @@ -1097,13 +1078,12 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 64 + q * 64; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = 
kernel_tm + p * inch * 64 + q * 64; // transform kernel const float *k0 = kernel0; @@ -1113,7 +1093,6 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne // h : first compute the transport matrix tmp = (g * GT)T float tmp[8][3]; for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -1121,10 +1100,11 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne // U for (int j = 0; j < 8; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 8; i++) { - kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } @@ -1133,12 +1113,12 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne // interleave kernel int outch4 = outch >> 2; int remain_outch_start = outch4 << 2; - // float *kernel_tm2 = (float *)csi_mem_alloc(8 * 8 * inch * 4 * (outch4 + (outch % 4 + 3) / 4) * sizeof(float)); - float *kernel_tm2 = (float *)csi_mem_alloc(8 * 8 * inch * outch * sizeof(float)); + // float *kernel_tm2 = (float *)shl_mem_alloc(8 * 8 * inch * 4 * (outch4 + (outch % 4 + 3) / 4) + // * sizeof(float)); + float *kernel_tm2 = (float *)shl_mem_alloc(8 * 8 * inch * outch * sizeof(float)); t_kernel->data = kernel_tm2; - for(int pp = 0; pp < outch4; pp++) { - + for (int pp = 0; pp < outch4; pp++) { int p = pp * 4; float *ktm2 = kernel_tm2 + pp * 8 * 8 * inch * 4; @@ -1148,8 +1128,7 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne const float *kernel3_tm = kernel2_tm + 64 * inch; int q = 0; - for(; q + 1 < inch; q += 2) { - + for (; q + 1 < inch; q += 2) { const float *k00 = kernel0_tm + q * 64; const float *k01 = k00 + 64; const 
float *k10 = kernel1_tm + q * 64; @@ -1159,10 +1138,8 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne const float *k30 = kernel3_tm + q * 64; const float *k31 = k30 + 64; - for(int r = 0; r < 16; r++) { - + for (int r = 0; r < 16; r++) { for (int m = 0; m < 4; m++) { - ktm2[0 + m] = k00[m]; ktm2[4 + m] = k01[m]; ktm2[8 + m] = k10[m]; @@ -1184,17 +1161,14 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne ktm2 += 32; } } - for(; q < inch; q++) { - - const float* k00 = kernel0_tm + q * 64; - const float* k10 = kernel1_tm + q * 64; - const float* k20 = kernel2_tm + q * 64; - const float* k30 = kernel3_tm + q * 64; - - for(int r = 0; r < 16; r++) { + for (; q < inch; q++) { + const float *k00 = kernel0_tm + q * 64; + const float *k10 = kernel1_tm + q * 64; + const float *k20 = kernel2_tm + q * 64; + const float *k30 = kernel3_tm + q * 64; + for (int r = 0; r < 16; r++) { for (int m = 0; m < 4; m++) { - ktm2[0 + m] = k00[m]; ktm2[4 + m] = k10[m]; ktm2[8 + m] = k20[m]; @@ -1211,17 +1185,14 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne } // remain outch - for(int p = remain_outch_start; p < outch; p++) { - + for (int p = remain_outch_start; p < outch; p++) { float *ktm2 = kernel_tm2 + p * 64 * inch; const float *kernel0_tm = kernel_tm + p * 64 * inch; int q = 0; - for(; q < inch; q++) { - + for (; q < inch; q++) { const float *k00 = kernel0_tm + q * 64; - for(int r = 0; r < 16; r++) { - - for(int m = 0; m < 4; m++) { + for (int r = 0; r < 16; r++) { + for (int m = 0; m < 4; m++) { ktm2[m] = k00[m]; } k00 += 4; @@ -1229,19 +1200,16 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne } } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } - // reference by ncnn -int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params 
*params) +int shl_c906_conv3x3s1_winograd64_1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { // uint64_t start_time, end_time; - // start_time = csi_get_timespec(); + // start_time = shl_get_timespec(); float *input_data = (float *)input->data; float *output_data = (float *)output->data; float *kernel_data = (float *)params->conv_extra.kernel_tm->data; @@ -1254,7 +1222,7 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -1273,21 +1241,25 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = block_h * 6 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 6 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel // buffer addr - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); // interleave by (4, 16 * block_h * block_w, in_c) - float *input_trans_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); - - float *output_trans_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *input_trans_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); - for(int n = 0; n < batch; n++) { + float 
*output_trans_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + for (int n = 0; n < batch; n++) { // pad input - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, + pad_top, pad_left); input_data += input_size; // transform input @@ -1308,20 +1280,19 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, const int tiles = block_h * block_w; - for(int q = 0; q < in_c; q++) { - - const float *img0 = input_padd_buf + q * padded_in_h * padded_in_w; // pad后padinput的第q个channle - float *img0_tm = input_trans_buf + q * block_h * block_w * 8 * 8; // transform and interleave 后的第q个channel + for (int q = 0; q < in_c; q++) { + const float *img0 = + input_padd_buf + q * padded_in_h * padded_in_w; // pad后padinput的第q个channle + float *img0_tm = input_trans_buf + q * block_h * block_w * 8 * + 8; // transform and interleave 后的第q个channel float tmp[8][8]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *r0 = img0 + i * padded_in_w * 6 + j * 6; - for(int m = 0; m < 8; m++) { + for (int m = 0; m < 8; m++) { tmp[0][m] = r0[0] - r0[6] + 5.25 * (r0[4] - r0[2]); tmp[7][m] = r0[7] - r0[1] + 5.25 * (r0[3] - r0[5]); @@ -1346,8 +1317,7 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, float *r0_tm_0 = img0_tm + 4 * (i * block_w + j); float *r0_tm_4 = img0_tm + 4 * (i * block_w + j + block_h * block_w); - for(int m = 0; m < 8; m++) { - + for (int m = 0; m < 8; m++) { const float *tmp0 = tmp[m]; r0_tm_0[0] = tmp0[0] - tmp0[6] + 5.25 * (tmp0[4] - tmp0[2]); @@ -1370,7 +1340,6 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, r0_tm_0 += 4 * block_h * block_w * 2; r0_tm_4 += 4 * block_h * block_w * 2; - } } } @@ -1378,14 +1347,14 @@ int 
csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, // dot // interleave by (4, 16 * block_h * block_w, out_c) - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); int outch4 = out_c >> 2; int remain_outch_start = outch4 << 2; - for(int pp = 0; pp < outch4; pp++) { - + for (int pp = 0; pp < outch4; pp++) { int p = pp * 4; - float *out0_tm = output_dot_buf + p * 4 * 16 * block_h * block_w; // 每一个输出面 + float *out0_tm = output_dot_buf + p * 4 * 16 * block_h * block_w; // 每一个输出面 float *out1_tm = out0_tm + 4 * 16 * block_h * block_w; float *out2_tm = out1_tm + 4 * 16 * block_h * block_w; float *out3_tm = out2_tm + 4 * 16 * block_h * block_w; @@ -1394,8 +1363,7 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, int q = 0; - for(; q + 1 < in_c; q += 2) { - + for (; q + 1 < in_c; q += 2) { const float *r0 = input_trans_buf + q * 4 * 16 * block_h * block_w; const float *r1 = r0 + 4 * 16 * block_h * block_w; @@ -1404,12 +1372,9 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, float *output2_tm = out2_tm; float *output3_tm = out3_tm; - for(int r = 0; r < 16; r++) { - - for(int t = 0; t < block_h * block_w; t++) { - - for(int m = 0; m < 4; m++) { - + for (int r = 0; r < 16; r++) { + for (int t = 0; t < block_h * block_w; t++) { + for (int m = 0; m < 4; m++) { output0_tm[m] += r0[m] * ktm[0 + m]; output0_tm[m] += r1[m] * ktm[4 + m]; output1_tm[m] += r0[m] * ktm[8 + m]; @@ -1428,24 +1393,18 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, } ktm += 32; } - - } - for(; q < in_c; q++) { - + for (; q < in_c; q++) { const float *r0 = input_trans_buf + q * 4 * 16 * block_h * block_w; - float* output0_tm = out0_tm; - float* output1_tm = out1_tm; - float* output2_tm = out2_tm; - float* output3_tm = out3_tm; - - for(int r = 0; r < 16; r++) { - - for(int t = 0; t < block_h * block_w; t++) 
{ - - for(int m = 0; m < 4; m++) { + float *output0_tm = out0_tm; + float *output1_tm = out1_tm; + float *output2_tm = out2_tm; + float *output3_tm = out3_tm; + for (int r = 0; r < 16; r++) { + for (int t = 0; t < block_h * block_w; t++) { + for (int m = 0; m < 4; m++) { output0_tm[m] += r0[m] * ktm[0 + m]; output1_tm[m] += r0[m] * ktm[4 + m]; output2_tm[m] += r0[m] * ktm[8 + m]; @@ -1460,26 +1419,20 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, } ktm += 16; } - } } // dot remain outch - for(int p = remain_outch_start; p < out_c; p++) { - + for (int p = remain_outch_start; p < out_c; p++) { float *out0_tm = output_dot_buf + p * 4 * 16 * block_h * block_w; const float *ktm = kernel_data + p * 64 * in_c; int q = 0; - for(; q < in_c; q++) { - + for (; q < in_c; q++) { const float *r0 = input_trans_buf + q * 4 * 16 * block_h * block_w; float *output0_tm = out0_tm; - for(int r = 0; r < 16; r++) { - - for(int t = 0; t < block_h * block_w; t++) { - - for(int m = 0; m < 4; m++) { - + for (int r = 0; r < 16; r++) { + for (int t = 0; t < block_h * block_w; t++) { + for (int m = 0; m < 4; m++) { output0_tm[m] += r0[m] * ktm[m]; } r0 += 4; @@ -1510,23 +1463,19 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, }; */ - for(int p = 0; p < out_c; p++) { - + for (int p = 0; p < out_c; p++) { const float bias = bias_data ? 
bias_data[p] : 0.f; const float *out0_tm = output_dot_buf + p * 64 * block_h * block_w; float *out0 = output_trans_buf + p * 36 * block_h * block_w; float tmp[6][8]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *output0_tm_0 = out0_tm + 4 * (i * block_w + j); const float *output0_tm_4 = out0_tm + 4 * (i * block_w + j + block_h * block_w); - for(int m = 0; m < 8; m++) { - + for (int m = 0; m < 8; m++) { float tmp024a = output0_tm_0[1] + output0_tm_0[2]; float tmp135a = output0_tm_0[1] - output0_tm_0[2]; @@ -1546,13 +1495,11 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, output0_tm_0 += 4 * block_h * block_w * 2; output0_tm_4 += 4 * block_h * block_w * 2; - } float *output0 = out0 + i * 6 * block_w * 6 + j * 6; for (int m = 0; m < 6; m++) { - const float *tmp0 = tmp[m]; float tmp024a = tmp0[1] + tmp0[2]; @@ -1574,142 +1521,129 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, output0 += block_w * 6; } - } } } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); + shl_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 6, + block_w * 6); output_data += output_size; } - csi_mem_free(input_padd_buf); - csi_mem_free(input_trans_buf); - csi_mem_free(output_trans_buf); + shl_mem_free(input_padd_buf); + shl_mem_free(input_trans_buf); + shl_mem_free(output_trans_buf); return CSINN_TRUE; } - /* padding input for winograd input transform , and change memory layout to [n c/4 h w 4] input layout: [n c h w] input_padded layout: [n c/4 h w 4] constrain: input channel % 4 = 0 */ -void csi_c906_pad_input_pack1to4(const float *input, float *input_padded, int inc, int inh, int inw, +void shl_c906_pad_input_pack1to4(const float 
*input, float *input_padded, int inc, int inh, int inw, int padded_h, int padded_w, int pad_top, int pad_left) { - int inc4= inc / 4; + int inc4 = inc / 4; int padded_hw = padded_h * padded_w; float *pad_ptr = input_padded; float *inp_ptr = (float *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) + int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "vmv.v.x v2, zero\n\t" // clear v2, for memset value 0 - "mulw t1, %6, %7\n\t" // pad_top * padded_w - "mulw t2, %6, %9\n\t" // pad_down * padded_w - "mulw t0, %3, %4\n\t" // input_size per_channel - "slli t0, t0, 2\n\t" // load stride = input_size * 4 - "slli t6, t0, 2\n\t" // t6 = input_size * 4(channel) * 4 bytes + "vmv.v.x v2, zero\n\t" // clear v2, for memset value 0 + "mulw t1, %6, %7\n\t" // pad_top * padded_w + "mulw t2, %6, %9\n\t" // pad_down * padded_w + "mulw t0, %3, %4\n\t" // input_size per_channel + "slli t0, t0, 2\n\t" // load stride = input_size * 4 + "slli t6, t0, 2\n\t" // t6 = input_size * 4(channel) * 4 bytes - "1:\n\t" // channel loop [inc/8] - "mv a0, %0\n\t" // update input_addr - "mv t5, %3\n\t" // t5 = in_h - "beqz %7, 3f\n\t" // if pad_top = 0 - "mv t3, t1\n\t" // t3 = num to memset + "1:\n\t" // channel loop [inc/8] + "mv a0, %0\n\t" // update input_addr + "mv t5, %3\n\t" // t5 = in_h + "beqz %7, 3f\n\t" // if pad_top = 0 + "mv t3, t1\n\t" // t3 = num to memset - "2:\n\t" // pad h_top - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "2:\n\t" // pad h_top + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 2b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" - "3:\n\t" // pad h_mid - "mv t4, %4\n\t" // t4 = in_w - "beqz %8, 5f\n\t" // if pad_left = 0 - "mv t3, %8\n\t" // t3 = pad_left + "3:\n\t" // pad h_mid + "mv 
t4, %4\n\t" // t4 = in_w + "beqz %8, 5f\n\t" // if pad_left = 0 + "mv t3, %8\n\t" // t3 = pad_left - "4:\n\t" // pad w_left - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "4:\n\t" // pad w_left + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 4b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" - "5:\n\t" // pad w_mid - "vlse.v v4, (a0), t0\n\t" - "addi a0, a0, 4\n\t" - "vse.v v4, (%1)\n\t" - "addi %1, %1, 16\n\t" + "5:\n\t" // pad w_mid + "vlse.v v4, (a0), t0\n\t" + "addi a0, a0, 4\n\t" + "vse.v v4, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t4, t4, -1\n\t" - "bnez t4, 5b\n\t" + "addi t4, t4, -1\n\t" + "bnez t4, 5b\n\t" - "beqz %10, 7f\n\t" // if pad_right = 0 - "mv t3, %10\n\t" // t3 = pad_right + "beqz %10, 7f\n\t" // if pad_right = 0 + "mv t3, %10\n\t" // t3 = pad_right - "6:\n\t" // pad w_right - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "6:\n\t" // pad w_right + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 6b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" "7:\n\t" - "addi t5, t5, -1\n\t" - "bnez t5, 3b\n\t" + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" - "beqz %9, 9f\n\t" // if pad_down = 0 - "mv t3, t2\n\t" // t3 = num to memset 0 + "beqz %9, 9f\n\t" // if pad_down = 0 + "mv t3, t2\n\t" // t3 = num to memset 0 - "8:\n\t" // pad h_down - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "8:\n\t" // pad h_down + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 8b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 8b\n\t" - "9:\n\t" - "add %0, %0, t6\n\t" // input_data jump to next 4 channel + "9:\n\t" + "add %0, %0, t6\n\t" // input_data jump to next 4 channel "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(inp_ptr), // %0 - "=r"(pad_ptr), // %1 - "=r"(inc4), // %2 - "=r"(inh), // %3 - "=r"(inw), // %4 - "=r"(padded_hw), // %5 - "=r"(padded_w), // %6 - "=r"(pad_top), // %7 - "=r"(pad_left), // %8 - "=r"(resi_h), // %9 - "=r"(resi_w) // %10 - :"0"(inp_ptr), - 
"1"(pad_ptr), - "2"(inc4), - "3"(inh), - "4"(inw), - "5"(padded_hw), - "6"(padded_w), - "7"(pad_top), - "8"(pad_left), - "9"(resi_h), - "10"(resi_w) - :"cc", "memory", "v2", "v4", - "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6" - ); - + : "=r"(inp_ptr), // %0 + "=r"(pad_ptr), // %1 + "=r"(inc4), // %2 + "=r"(inh), // %3 + "=r"(inw), // %4 + "=r"(padded_hw), // %5 + "=r"(padded_w), // %6 + "=r"(pad_top), // %7 + "=r"(pad_left), // %8 + "=r"(resi_h), // %9 + "=r"(resi_w) // %10 + : "0"(inp_ptr), "1"(pad_ptr), "2"(inc4), "3"(inh), "4"(inw), "5"(padded_hw), "6"(padded_w), + "7"(pad_top), "8"(pad_left), "9"(resi_h), "10"(resi_w) + : "cc", "memory", "v2", "v4", "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); } -void csi_c906_crop_output_pack4to1(const float *output_trans, float *output, int out_c, int out_h, int out_w, - int wino_h, int wino_w) +void shl_c906_crop_output_pack4to1(const float *output_trans, float *output, int out_c, int out_h, + int out_w, int wino_h, int wino_w) { int out_c4 = out_c / 4; float *out_tm_ptr = (float *)output_trans; @@ -1718,65 +1652,58 @@ void csi_c906_crop_output_pack4to1(const float *output_trans, float *output, int asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mulw t0, %3, %4\n\t" // output_size per_channel - "slli t0, t0, 2\n\t" // store_stride = output_size * 4 + "mulw t0, %3, %4\n\t" // output_size per_channel + "slli t0, t0, 2\n\t" // store_stride = output_size * 4 "slli t3, t0, 2\n\t" // t3 = output_size * 4(channel) * 4bytes "slli t4, %6, 4\n\t" // t4 = wino_w * 4(channel) * 4 - "mulw t5, %5, %6\n\t" // crop_size per_channel - "slli t5, t5, 4\n\t" // t5 = crop_size * 4(channel) * 4 + "mulw t5, %5, %6\n\t" // crop_size per_channel + "slli t5, t5, 4\n\t" // t5 = crop_size * 4(channel) * 4 - "1:\n\t" // channel loop [out_ch / 4] - "mv a1, %1\n\t" // update output_addr - "mv a0, %0\n\t" // update crop_addr per-channel + "1:\n\t" // channel loop [out_ch / 4] + "mv a1, %1\n\t" // update output_addr + "mv a0, %0\n\t" // update 
crop_addr per-channel - "mv t1, %3\n\t" // t1 = out_h + "mv t1, %3\n\t" // t1 = out_h - "2:\n\t" // crop h - "mv t2, %4\n\t" // t2 = out_w - "mv s1, a0\n\t" // update crop_addr per-row + "2:\n\t" // crop h + "mv t2, %4\n\t" // t2 = out_w + "mv s1, a0\n\t" // update crop_addr per-row - "3:\n\t" // crop w - "vle.v v2, (s1)\n\t" - "addi s1, s1, 16\n\t" - "vsse.v v2, (a1), t0\n\t" - "addi a1, a1, 4\n\t" + "3:\n\t" // crop w + "vle.v v2, (s1)\n\t" + "addi s1, s1, 16\n\t" + "vsse.v v2, (a1), t0\n\t" + "addi a1, a1, 4\n\t" - "addi t2, t2, -1\n\t" - "bnez t2, 3b\n\t" + "addi t2, t2, -1\n\t" + "bnez t2, 3b\n\t" - "add a0, a0, t4\n\t" // crop-data jump to next row + "add a0, a0, t4\n\t" // crop-data jump to next row - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" - "4:\n\t" - "add %1, %1, t3\n\t" // output_data jump to next 4 channel - "add %0, %0, t5\n\t" // crop-data jump to next 4 channel + "4:\n\t" + "add %1, %1, t3\n\t" // output_data jump to next 4 channel + "add %0, %0, t5\n\t" // crop-data jump to next 4 channel "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(out_tm_ptr), // %0 - "=r"(out_ptr), // %1 - "=r"(out_c4), // %2 - "=r"(out_h), // %3 - "=r"(out_w), // %4 - "=r"(wino_h), // %5 - "=r"(wino_w) // %6 - :"0"(out_tm_ptr), - "1"(out_ptr), - "2"(out_c4), - "3"(out_h), - "4"(out_w), - "5"(wino_h), - "6"(wino_w) - :"cc", "memory", "v2", "v3", "a0", "a1", "s1", - "t0", "t1", "t2", "t3", "t4", "t5" + : "=r"(out_tm_ptr), // %0 + "=r"(out_ptr), // %1 + "=r"(out_c4), // %2 + "=r"(out_h), // %3 + "=r"(out_w), // %4 + "=r"(wino_h), // %5 + "=r"(wino_w) // %6 + : "0"(out_tm_ptr), "1"(out_ptr), "2"(out_c4), "3"(out_h), "4"(out_w), "5"(wino_h), + "6"(wino_w) + : "cc", "memory", "v2", "v3", "a0", "a1", "s1", "t0", "t1", "t2", "t3", "t4", "t5" ); - } /* @@ -1785,26 +1712,24 @@ void csi_c906_crop_output_pack4to1(const float *output_trans, float *output, int kernel before: [O I 3*3] kernel after : [O/4 8*8 I 4] */ -void 
csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); // kernel transform matrix: G - const float ktm[8][3] = { - {1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; // const float ktm[8][3] = { // {1.0f, 0.0f, 0.0f}, @@ -1817,13 +1742,12 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; // transform kernel const float *k0 = kernel0; @@ -1833,7 +1757,6 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k // h : first compute the transport matrix tmp = (g * 
GT)T float tmp[8][3]; for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -1841,20 +1764,20 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k // U for (int j = 0; j < 8; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 8; i++) { - kernel_tmp[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } // optimized layout for winograd64 - float *kernel_tm_pack4 = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + float *kernel_tm_pack4 = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); t_kernel->data = kernel_tm_pack4; for (int oc = 0; oc < outch / 4; oc++) { - float *g0 = kernel_tm_pack4 + oc * 64 * inch * 4; const float *k0 = kernel_tm + oc * 64 * inch * 4; @@ -1863,13 +1786,10 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k const float *k3 = k2 + 64 * inch; for (int k = 0; k < 64; k++) { - float *g00 = g0 + k * inch * 4; for (int ic = 0; ic < inch / 4; ic++) { - for (int i = 0; i < 4; i++) { - const float *k00 = k0 + (ic * 4 + i) * 64; const float *k10 = k1 + (ic * 4 + i) * 64; const float *k20 = k2 + (ic * 4 + i) * 64; @@ -1886,18 +1806,16 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } /* constrain: output channel % 4 = 0 input channel % 4 = 0 */ -int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd64_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1911,7 +1829,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -1930,29 +1848,31 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = block_h * 6 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 6 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (float *)csi_mem_alloc(out_c * sizeof(float)); + bias_data = (float *)shl_mem_alloc(out_c * sizeof(float)); } - - for(int n = 0; n < batch; n++) { - + for (int n = 0; n < batch; n++) { // pad buffer: [in_c/8 h w 8] - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); // pad input - csi_c906_pad_input_pack1to4(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input_pack1to4(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, 
pad_left); input_data += input_size; // input transform buffer1: [in_ch/8, 64, blocks, 8] - float *input_tm1_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *input_tm1_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); /****************************** transform input *****************************/ /* @@ -1973,22 +1893,24 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, int tiles = block_h * block_w; - #pragma omp parallel for num_threads(1) - for(int q = 0; q < in_c / 4; q++) { - - float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * 4; // feature map after padding - q channel - float *img0_tm = input_tm1_buf + q * 64 * tiles * 4; // transform and interleave - q channel - - float *tmp = (float *)csi_mem_alloc(8 * 8 * 4 * sizeof(float)); +#pragma omp parallel for num_threads(1) + for (int q = 0; q < in_c / 4; q++) { + float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * + 4; // feature map after padding - q channel + float *img0_tm = + input_tm1_buf + q * 64 * tiles * 4; // transform and interleave - q channel - for(int i = 0; i < block_h; i++) { + float *tmp = (float *)shl_mem_alloc(8 * 8 * 4 * sizeof(float)); - for(int j = 0; j < block_w; j++) { - - float *r0 = img0 + (i * padded_in_w * 6 + j * 6) * 4; // feature map after padding 8*8 start addr - float *r0_tm = img0_tm + (i * block_w + j) * 4; // input_tm1 8*8 block start addr + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { + float *r0 = img0 + (i * padded_in_w * 6 + j * 6) * + 4; // feature map after padding 8*8 start addr + float *r0_tm = + img0_tm + (i * block_w + j) * 4; // input_tm1 8*8 block start addr - float ratio[] = {5.25, -4.25, 0.25, -1.25, 4.0, 0.5, -2.5, 2.0}; // note: in fact cannot be output constrain + float ratio[] = {5.25, -4.25, 0.25, -1.25, 4.0, + 0.5, -2.5, 2.0}; // note: in fact cannot be output constrain float *ratio_ptr = ratio; asm volatile( @@ 
-1997,91 +1919,96 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "mv t5, %2\n\t" // t5 = tmp start addr "slli t1, %4, 4\n\t" // t1 = padded_in_w * 4 * 4bytes - "flw fa0, 0(%3)\n\t" // fa0 = 5.25 - "flw fa1, 4(%3)\n\t" // fa1 = -4.25 - "flw fa2, 8(%3)\n\t" // fa2 = 0.25 - "flw fa3, 12(%3)\n\t" // fa3 = -1.25 - "flw fa4, 16(%3)\n\t" // fa4 = 4.0 - "flw fa5, 20(%3)\n\t" // fa5 = 0.5 - "flw fa6, 24(%3)\n\t" // fa6 = -2.5 - "flw fa7, 28(%3)\n\t" // fa7 = 2.0 - - "1:\n\t" - "mv s1, %0\n\t" // s1 = r00 addr - - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 128\n\t" // tmp[1][m] - "addi a2, a1, 128\n\t" // tmp[2][m] - "addi a3, a2, 128\n\t" // tmp[3][m] - "addi a4, a3, 128\n\t" // tmp[4][m] - "addi a5, a4, 128\n\t" // tmp[5][m] - "addi a6, a5, 128\n\t" // tmp[6][m] - "addi a7, a6, 128\n\t" // tmp[7][m] - - "vle.v v0, (s1)\n\t" // r00 + "flw fa0, 0(%3)\n\t" // fa0 = 5.25 + "flw fa1, 4(%3)\n\t" // fa1 = -4.25 + "flw fa2, 8(%3)\n\t" // fa2 = 0.25 + "flw fa3, 12(%3)\n\t" // fa3 = -1.25 + "flw fa4, 16(%3)\n\t" // fa4 = 4.0 + "flw fa5, 20(%3)\n\t" // fa5 = 0.5 + "flw fa6, 24(%3)\n\t" // fa6 = -2.5 + "flw fa7, 28(%3)\n\t" // fa7 = 2.0 + + "1:\n\t" + "mv s1, %0\n\t" // s1 = r00 addr + + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 128\n\t" // tmp[1][m] + "addi a2, a1, 128\n\t" // tmp[2][m] + "addi a3, a2, 128\n\t" // tmp[3][m] + "addi a4, a3, 128\n\t" // tmp[4][m] + "addi a5, a4, 128\n\t" // tmp[5][m] + "addi a6, a5, 128\n\t" // tmp[6][m] + "addi a7, a6, 128\n\t" // tmp[7][m] + + "vle.v v0, (s1)\n\t" // r00 "addi s1, s1, 16\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "addi s1, s1, 16\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "addi s1, s1, 16\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "addi s1, s1, 16\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "addi s1, s1, 16\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "addi s1, s1, 16\n\t" - "vle.v v6, (s1)\n\t" // r06 
+ "vle.v v6, (s1)\n\t" // r06 "addi s1, s1, 16\n\t" - "vle.v v7, (s1)\n\t" // r07 + "vle.v v7, (s1)\n\t" // r07 "addi s1, s1, 16\n\t" "vmv.v.v v10, v6\n\t" //--------------------------------------------- - "vfsub.vv v8, v4, v2\n\t" // r04 - r02 - "vfsub.vv v9, v3, v5\n\t" // r03 - r05 + "vfsub.vv v8, v4, v2\n\t" // r04 - r02 + "vfsub.vv v9, v3, v5\n\t" // r03 - r05 - "vfsub.vv v24, v0, v6\n\t" // r00 - r06 - "vfsub.vv v31, v7, v1\n\t" // r07 - r01 + "vfsub.vv v24, v0, v6\n\t" // r00 - r06 + "vfsub.vv v31, v7, v1\n\t" // r07 - r01 - "vfmacc.vf v10, fa2, v2\n\t" // r06 + r02 * 0.25f + "vfmacc.vf v10, fa2, v2\n\t" // r06 + r02 * 0.25f - "vfmul.vf v11, v1, fa5\n\t" // r01 * 0.5f - "vfmul.vf v12, v1, fa7\n\t" // r01 * 2.0f + "vfmul.vf v11, v1, fa5\n\t" // r01 * 0.5f + "vfmul.vf v12, v1, fa7\n\t" // r01 * 2.0f - "vfmacc.vf v24, fa0, v8\n\t" // r00 - r06 + 5.25 * (r04 - r02) = tmp[0][m] - "vfmacc.vf v31, fa0, v9\n\t" // r07 - r01 + 5.25 * (r03 - r05) = tmp[7][m] + "vfmacc.vf v24, fa0, v8\n\t" // r00 - r06 + 5.25 * (r04 - r02) = + // tmp[0][m] + "vfmacc.vf v31, fa0, v9\n\t" // r07 - r01 + 5.25 * (r03 - r05) = + // tmp[7][m] //--------------------------------------------- - "vfadd.vv v8, v2, v6\n\t" // r02 + r06 - "vfadd.vv v9, v1, v5\n\t" // r01 + r05 + "vfadd.vv v8, v2, v6\n\t" // r02 + r06 + "vfadd.vv v9, v1, v5\n\t" // r01 + r05 - "vfmacc.vf v11, fa6, v3\n\t" // r01 * 0.5f - r03 * 2.5f - "vfmacc.vf v12, fa6, v3\n\t" // r01 * 2.f - r03 * 2.5f + "vfmacc.vf v11, fa6, v3\n\t" // r01 * 0.5f - r03 * 2.5f + "vfmacc.vf v12, fa6, v3\n\t" // r01 * 2.f - r03 * 2.5f - "vfmacc.vf v2, fa3, v4\n\t" // r02 - r04 * 1.25f - "vfmacc.vf v10, fa3, v4\n\t" // r06 + r02 * 0.25f - r04 * 1.25f = tmp34a + "vfmacc.vf v2, fa3, v4\n\t" // r02 - r04 * 1.25f + "vfmacc.vf v10, fa3, v4\n\t" // r06 + r02 * 0.25f - r04 * 1.25f = + // tmp34a - "vfmacc.vf v8, fa1, v4\n\t" // r02 + r06 - r04 * 4.25f = tmp12a - "vfmacc.vf v9, fa1, v3\n\t" // r01 + r05 - r03 * 4.25f = tmp12b + "vfmacc.vf v8, fa1, v4\n\t" // 
r02 + r06 - r04 * 4.25f = tmp12a + "vfmacc.vf v9, fa1, v3\n\t" // r01 + r05 - r03 * 4.25f = tmp12b - "vfmacc.vf v11, fa7, v5\n\t" // r01 * 0.5f - r03 * 2.5f + r05 * 2.0 = tmp34b - "vfmacc.vf v12, fa5, v5\n\t" // r01 * 2.f - r03 * 2.5f + r05 * 0.5 = tmp56b + "vfmacc.vf v11, fa7, v5\n\t" // r01 * 0.5f - r03 * 2.5f + r05 * 2.0 = + // tmp34b + "vfmacc.vf v12, fa5, v5\n\t" // r01 * 2.f - r03 * 2.5f + r05 * 0.5 = + // tmp56b "vse.v v24, (a0)\n\t" "vse.v v31, (a7)\n\t" - "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = tmp[1][m] - "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = tmp[2][m] + "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = tmp[1][m] + "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = tmp[2][m] //--------------------------------------------- - "vfmacc.vf v6, fa4, v2\n\t" // r06 + (r02 - r04 * 1.25f) * 4 = tmp56a + "vfmacc.vf v6, fa4, v2\n\t" // r06 + (r02 - r04 * 1.25f) * 4 = tmp56a "vfadd.vv v27, v10, v11\n\t" // tmp34a + tmp34b = tmp[3][m] "vfsub.vv v28, v10, v11\n\t" // tmp34a - tmp34b = tmp[4][m] - "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = tmp[5][m] - "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = tmp[6][m] + "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = tmp[5][m] + "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = tmp[6][m] "vse.v v25, (a1)\n\t" "vse.v v26, (a2)\n\t" @@ -2092,95 +2019,101 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, //--------------------------------------------- - "add %0, %0, t1\n\t" // padding feature map 8*8 next line addr - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "add %0, %0, t1\n\t" // padding feature map 8*8 next line addr + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 8\n\t" // m = 8 + "mv t5, %2\n\t" // tmp start addr + "li t0, 8\n\t" // m = 8 - "slli t1, %5, 4\n\t" // t1 = tiles * 4 * 4 bytes - "slli t2, %5, 7\n\t" // t2 = tiles * 8 * 4 * 4 bytes + "slli t1, %5, 4\n\t" // t1 
= tiles * 4 * 4 bytes + "slli t2, %5, 7\n\t" // t2 = tiles * 8 * 4 * 4 bytes - "3:\n\t" + "3:\n\t" - "mv a0, %1\n\t" // r0_tm_0 - "add a1, a0, t1\n\t" // r0_tm_1 - "add a2, a1, t1\n\t" // r0_tm_2 - "add a3, a2, t1\n\t" // r0_tm_3 - "add a4, a3, t1\n\t" // r0_tm_4 - "add a5, a4, t1\n\t" // r0_tm_5 - "add a6, a5, t1\n\t" // r0_tm_6 - "add a7, a6, t1\n\t" // r0_tm_7 + "mv a0, %1\n\t" // r0_tm_0 + "add a1, a0, t1\n\t" // r0_tm_1 + "add a2, a1, t1\n\t" // r0_tm_2 + "add a3, a2, t1\n\t" // r0_tm_3 + "add a4, a3, t1\n\t" // r0_tm_4 + "add a5, a4, t1\n\t" // r0_tm_5 + "add a6, a5, t1\n\t" // r0_tm_6 + "add a7, a6, t1\n\t" // r0_tm_7 - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" - "vle.v v6, (t5)\n\t" // tmp[m][6] + "vle.v v6, (t5)\n\t" // tmp[m][6] "addi t5, t5, 16\n\t" - "vle.v v7, (t5)\n\t" // tmp[m][7] + "vle.v v7, (t5)\n\t" // tmp[m][7] "addi t5, t5, 16\n\t" "vmv.v.v v10, v6\n\t" //--------------------------------------------- - "vfsub.vv v8, v4, v2\n\t" // tmp04 - tmp02 (tmp[m][4] - tmp[m][2]) - "vfsub.vv v9, v3, v5\n\t" // tmp03 - tmp05 + "vfsub.vv v8, v4, v2\n\t" // tmp04 - tmp02 (tmp[m][4] - tmp[m][2]) + "vfsub.vv v9, v3, v5\n\t" // tmp03 - tmp05 - "vfsub.vv v24, v0, v6\n\t" // tmp00 - tmp06 - "vfsub.vv v31, v7, v1\n\t" // tmp07 - tmp01 + "vfsub.vv v24, v0, v6\n\t" // tmp00 - tmp06 + "vfsub.vv v31, v7, v1\n\t" // tmp07 - tmp01 - "vfmacc.vf v10, fa2, v2\n\t" // tmp06 + tmp02 * 0.25f + "vfmacc.vf v10, fa2, v2\n\t" // tmp06 + tmp02 * 0.25f - "vfmul.vf v11, v1, 
fa5\n\t" // tmp01 * 0.5f - "vfmul.vf v12, v1, fa7\n\t" // tmp01 * 2.0f + "vfmul.vf v11, v1, fa5\n\t" // tmp01 * 0.5f + "vfmul.vf v12, v1, fa7\n\t" // tmp01 * 2.0f - "vfmacc.vf v24, fa0, v8\n\t" // tmp00 - tmp06 + 5.25 * (tmp04 - tmp02) = r0_tm_0[m] - "vfmacc.vf v31, fa0, v9\n\t" // tmp07 - tmp01 + 5.25 * (tmp03 - tmp05) = r0_tm_7[m] + "vfmacc.vf v24, fa0, v8\n\t" // tmp00 - tmp06 + 5.25 * (tmp04 - tmp02) + // = r0_tm_0[m] + "vfmacc.vf v31, fa0, v9\n\t" // tmp07 - tmp01 + 5.25 * (tmp03 - tmp05) + // = r0_tm_7[m] //--------------------------------------------- - "vfadd.vv v8, v2, v6\n\t" // tmp02 + tmp06 - "vfadd.vv v9, v1, v5\n\t" // tmp01 + tmp05 + "vfadd.vv v8, v2, v6\n\t" // tmp02 + tmp06 + "vfadd.vv v9, v1, v5\n\t" // tmp01 + tmp05 - "vfmacc.vf v11, fa6, v3\n\t" // tmp01 * 0.5f - tmp03 * 2.5f - "vfmacc.vf v12, fa6, v3\n\t" // tmp01 * 2.f - tmp03 * 2.5f + "vfmacc.vf v11, fa6, v3\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + "vfmacc.vf v12, fa6, v3\n\t" // tmp01 * 2.f - tmp03 * 2.5f - "vfmacc.vf v2, fa3, v4\n\t" // tmp02 - tmp04 * 1.25f - "vfmacc.vf v10, fa3, v4\n\t" // tmp06 + tmp02 * 0.25f - tmp04 * 1.25f = tmp34a + "vfmacc.vf v2, fa3, v4\n\t" // tmp02 - tmp04 * 1.25f + "vfmacc.vf v10, fa3, v4\n\t" // tmp06 + tmp02 * 0.25f - tmp04 * 1.25f + // = tmp34a - "vfmacc.vf v8, fa1, v4\n\t" // tmp02 + tmp06 - tmp04 * 4.25f = tmp12a - "vfmacc.vf v9, fa1, v3\n\t" // tmp01 + tmp05 - tmp03 * 4.25f = tmp12b + "vfmacc.vf v8, fa1, v4\n\t" // tmp02 + tmp06 - tmp04 * 4.25f = tmp12a + "vfmacc.vf v9, fa1, v3\n\t" // tmp01 + tmp05 - tmp03 * 4.25f = tmp12b - "vfmacc.vf v11, fa7, v5\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + tmp05 * 2.0 = tmp34b - "vfmacc.vf v12, fa5, v5\n\t" // tmp01 * 2.f - tmp03 * 2.5f + tmp05 * 0.5 = tmp56b + "vfmacc.vf v11, fa7, v5\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + tmp05 + // * 2.0 = tmp34b + "vfmacc.vf v12, fa5, v5\n\t" // tmp01 * 2.f - tmp03 * 2.5f + tmp05 * + // 0.5 = tmp56b "vse.v v24, (a0)\n\t" "vse.v v31, (a7)\n\t" - "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = 
r0_tm_1[m] - "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = r0_tm_2[m] + "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = r0_tm_1[m] + "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = r0_tm_2[m] //--------------------------------------------- - "vfmacc.vf v6, fa4, v2\n\t" // tmp06 + (tmp02 - tmp04 * 1.25f) * 4 = tmp56a + "vfmacc.vf v6, fa4, v2\n\t" // tmp06 + (tmp02 - tmp04 * 1.25f) * 4 = + // tmp56a "vfadd.vv v27, v10, v11\n\t" // tmp34a + tmp34b = r0_tm_3[m] "vfsub.vv v28, v10, v11\n\t" // tmp34a - tmp34b = r0_tm_4[m] - "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = r0_tm_5[m] - "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = r0_tm_6[m] + "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = r0_tm_5[m] + "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = r0_tm_6[m] "vse.v v25, (a1)\n\t" "vse.v v26, (a2)\n\t" @@ -2194,42 +2127,35 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - - :"=r"(r0), // %0 - "=r"(r0_tm), // %1 - "=r"(tmp), // %2 - "=r"(ratio_ptr), // %3 - "=r"(padded_in_w), // %4 - "=r"(tiles) // %5 - :"0"(r0), - "1"(r0_tm), - "2"(tmp), - "3"(ratio_ptr), - "4"(padded_in_w), - "5"(tiles) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7" - ); - + : "=r"(r0), // %0 + "=r"(r0_tm), // %1 + "=r"(tmp), // %2 + "=r"(ratio_ptr), // %3 + "=r"(padded_in_w), // %4 + "=r"(tiles) // %5 + : "0"(r0), "1"(r0_tm), "2"(tmp), "3"(ratio_ptr), "4"(padded_in_w), + "5"(tiles) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", + "a5", "a6", "a7", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7"); } } - 
csi_mem_free(tmp); + shl_mem_free(tmp); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); /*********************************** dot ***************************************/ // reorder input_tm1_buf - float *input_tm2_buf = (float *)csi_mem_alloc(64 * tiles * in_c * sizeof(float)); + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int r = 0; r < 64; r++) { - float *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data int t = 0; for (; t + 7 < tiles; t += 8) { - float *tm2 = img_tm2 + t * in_c; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -2251,12 +2177,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, // } asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes - "srai t2, %3, 2\n\t" // in_ch4 + "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes + "srai t2, %3, 2\n\t" // in_ch4 - "1:\n\t" // in_ch loop4 + "1:\n\t" // in_ch loop4 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -2282,17 +2208,13 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "a0", "t1", "t2" - ); + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "a0", "t1", + "t2"); } for (; t + 3 < tiles; t += 4) { float *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -2312,12 +2234,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct 
csi_tensor *input, // } asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes - "srai t2, %3, 2\n\t" // in_ch4 + "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes + "srai t2, %3, 2\n\t" // in_ch4 - "1:\n\t" // in_ch loop4 + "1:\n\t" // in_ch loop4 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -2335,17 +2257,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", - "a0", "t1", "t2" - ); + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "a0", "t1", "t2"); } for (; t + 1 < tiles; t += 2) { float *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -2363,12 +2280,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes - "srai t2, %3, 2\n\t" // in_ch4 + "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes + "srai t2, %3, 2\n\t" // in_ch4 - "1:\n\t" // in_ch loop4 + "1:\n\t" // in_ch loop4 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -2382,18 +2299,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", - "a0", "t1", "t2" - ); - + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "a0", "t1", "t2"); } for (; t < 
tiles; t++) { float *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -2410,12 +2321,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes - "srai t2, %3, 2\n\t" // in_ch4 + "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes + "srai t2, %3, 2\n\t" // in_ch4 - "1:\n\t" // in_ch loop4 + "1:\n\t" // in_ch loop4 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -2428,45 +2339,37 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", - "a0", "t1", "t2" - ); - + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "a0", "t1", "t2"); } } - csi_mem_free(input_tm1_buf); + shl_mem_free(input_tm1_buf); // output_dot_buf: [out_c/4, 64, blocks, 4] - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int p = 0; p < out_c / 4; p++) { - - float *output0_tm = output_dot_buf + p * 64 * tiles * 4; // 4 channel dot output - float *kernel0_tm = kernel_data + p * 64 * in_c * 4; // 4 channel kernel + float *output0_tm = output_dot_buf + p * 64 * tiles * 4; // 4 channel dot output + float *kernel0_tm = kernel_data + p * 64 * in_c * 4; // 4 channel kernel for (int r = 0; r < 64; r++) { - float *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel int t = 0; for (; t + 7 < tiles; t += 8) { - float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 
4; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" @@ -2475,9 +2378,9 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "vmv.v.x v4, zero\n\t" "vmv.v.x v5, zero\n\t" "vmv.v.x v6, zero\n\t" - "vmv.v.x v7, zero\n\t" // clear + "vmv.v.x v7, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -2504,34 +2407,31 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v4, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v5, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v6, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v7, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v4, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v5, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v6, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v7, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" ); } @@ -2541,13 +2441,13 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, asm volatile( "vsetvli 
zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" "vmv.v.x v2, zero\n\t" - "vmv.v.x v3, zero\n\t" // clear + "vmv.v.x v3, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -2566,25 +2466,22 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", + "t0"); } for (; t + 1 < tiles; t += 2) { float *r0 = img_tm2 + t * in_c; @@ -2592,11 +2489,11 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" - "vmv.v.x v1, zero\n\t" // clear + "vmv.v.x v1, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -2611,33 +2508,28 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - 
"2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0"); } for (; t < tiles; t++) { - float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c - "vmv.v.x v0, zero\n\t" // clear + "mv t0, %3\n\t" // t0 = in_c + "vmv.v.x v0, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "addi %0, %0, 4\n\t" @@ -2650,30 +2542,24 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "fa0", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "fa0", "t0"); } - } - } - csi_mem_free(input_tm2_buf); + shl_mem_free(input_tm2_buf); /*************************** transform output ****************************/ // output_tm1_buf: [out_c/4, out_h6, out_w6, 4] - float *output_tm1_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); /* AT = { @@ -2694,25 +2580,24 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, }; */ - #pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / 4; p++) - { - +#pragma omp parallel for num_threads(1) + for (int p = 0; p < out_c / 4; p++) { float *bias_tmp = 
bias_data + p * 4; - float *out0_tm = output_dot_buf + p * 64 * block_h * block_w * 4; // 输出转换前/dot后 第p个channel - float *out0 = output_tm1_buf + p * 6*block_h * 6*block_w * 4; // 转换后输出 第p个channel + float *out0_tm = + output_dot_buf + p * 64 * block_h * block_w * 4; // 输出转换前/dot后 第p个channel + float *out0 = + output_tm1_buf + p * 6 * block_h * 6 * block_w * 4; // 转换后输出 第p个channel - float *tmp1 = (float *)csi_mem_alloc(6 * 8 * 4 * sizeof(float)); + float *tmp1 = (float *)shl_mem_alloc(6 * 8 * 4 * sizeof(float)); int out_w6 = block_w * 6; for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { + float *output0_tm_0 = out0_tm + (i * block_w + j) * 4; // 8*8 起始地址 - float *output0_tm_0 = out0_tm + (i * block_w + j) * 4; // 8*8 起始地址 - - float *output0 = out0 + (i * block_w * 6 * 6 + j * 6) * 4; // 输出 6*6 的起始地址 + float *output0 = + out0 + (i * block_w * 6 * 6 + j * 6) * 4; // 输出 6*6 的起始地址 float ratio[] = {2.0, 4.0, 8.0, 16.0, 32.0}; float *ratio_ptr = ratio; @@ -2724,65 +2609,66 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "slli t1, %4, 4\n\t" // t1 = tiles * 4 * 4 "slli t2, %4, 7\n\t" // t2 = tiles * 8 * 4 * 4 bytes - "flw fa0, 0(%3)\n\t" // fa0 = 2 - "flw fa1, 4(%3)\n\t" // fa1 = 4 - "flw fa2, 8(%3)\n\t" // fa2 = 8 - "flw fa3, 12(%3)\n\t" // fa3 = 16 - "flw fa4, 16(%3)\n\t" // fa4 = 32 + "flw fa0, 0(%3)\n\t" // fa0 = 2 + "flw fa1, 4(%3)\n\t" // fa1 = 4 + "flw fa2, 8(%3)\n\t" // fa2 = 8 + "flw fa3, 12(%3)\n\t" // fa3 = 16 + "flw fa4, 16(%3)\n\t" // fa4 = 32 "mv s1, %0\n\t" - "1:\n\t" // shape : [6 * 8] * [8 * 8] = [6 * 8] + "1:\n\t" // shape : [6 * 8] * [8 * 8] = [6 * 8] - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 128\n\t" // tmp[1][m] - "addi a2, a1, 128\n\t" // tmp[2][m] - "addi a3, a2, 128\n\t" // tmp[3][m] - "addi a4, a3, 128\n\t" // tmp[4][m] - "addi a5, a4, 128\n\t" // tmp[5][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 128\n\t" // tmp[1][m] + "addi a2, a1, 128\n\t" // tmp[2][m] + "addi a3, a2, 128\n\t" // tmp[3][m] + 
"addi a4, a3, 128\n\t" // tmp[4][m] + "addi a5, a4, 128\n\t" // tmp[5][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "add s1, s1, t1\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "add s1, s1, t1\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "add s1, s1, t1\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "add s1, s1, t1\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "add s1, s1, t1\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "add s1, s1, t1\n\t" - "vle.v v6, (s1)\n\t" // r06 + "vle.v v6, (s1)\n\t" // r06 "add s1, s1, t1\n\t" - "vle.v v7, (s1)\n\t" // r07 + "vle.v v7, (s1)\n\t" // r07 "add s1, s1, t1\n\t" //--------------------------------------------- - "vfadd.vv v8, v1, v2\n\t" // r01 + r02 = tmp024a - "vfsub.vv v9, v1, v2\n\t" // r01 - r02 = tmp135a + "vfadd.vv v8, v1, v2\n\t" // r01 + r02 = tmp024a + "vfsub.vv v9, v1, v2\n\t" // r01 - r02 = tmp135a - "vfadd.vv v10, v3, v4\n\t" // r03 + r04 = tmp024b - "vfsub.vv v11, v3, v4\n\t" // r03 - r04 = tmp135b + "vfadd.vv v10, v3, v4\n\t" // r03 + r04 = tmp024b + "vfsub.vv v11, v3, v4\n\t" // r03 - r04 = tmp135b - "vfadd.vv v12, v5, v6\n\t" // r05 + r06 = tmp024c - "vfsub.vv v13, v5, v6\n\t" // r05 - r06 = tmp135c + "vfadd.vv v12, v5, v6\n\t" // r05 + r06 = tmp024c + "vfsub.vv v13, v5, v6\n\t" // r05 - r06 = tmp135c - "vfadd.vv v0, v0, v8\n\t" // r00 + tmp024a - "vfadd.vv v7, v7, v9\n\t" // r07 + tmp135a - "vmv.v.v v14, v10\n\t" // v14 = tmp024b + "vfadd.vv v0, v0, v8\n\t" // r00 + tmp024a + "vfadd.vv v7, v7, v9\n\t" // r07 + tmp135a + "vmv.v.v v14, v10\n\t" // v14 = tmp024b - "vmv.v.v v26, v8\n\t" // v26 = tmp024a - "vmv.v.v v28, v8\n\t" // v28 = tmp024a + "vmv.v.v v26, v8\n\t" // v26 = tmp024a + "vmv.v.v v28, v8\n\t" // v28 = tmp024a "vfmacc.vf v26, fa1, v10\n\t" // tmp024a + tmp024b * 4 "vfmacc.vf v14, fa4, v12\n\t" // tmp024b + tmp024c * 32 "vfmacc.vf v28, fa3, v10\n\t" // tmp024a + tmp024b * 16 - "vmv.v.v 
v15, v13\n\t" // v15 = tmp135c - "vmv.v.v v25, v9\n\t" // v25 = tmp135a - "vmv.v.v v27, v9\n\t" // v27 = tmp135a - "vfadd.vv v24, v0, v14\n\t" // r00 + tmp024a + tmp024b + tmp024c * 32 = tmp[0][m] + "vmv.v.v v15, v13\n\t" // v15 = tmp135c + "vmv.v.v v25, v9\n\t" // v25 = tmp135a + "vmv.v.v v27, v9\n\t" // v27 = tmp135a + "vfadd.vv v24, v0, v14\n\t" // r00 + tmp024a + tmp024b + tmp024c * 32 + // = tmp[0][m] "vfmacc.vf v25, fa0, v11\n\t" // tmp135a + tmp135b * 2 "vfmacc.vf v27, fa2, v11\n\t" // tmp135a + tmp135b * 8 @@ -2790,36 +2676,40 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, //--------------------------------------------- "vse.v v24, (a0)\n\t" - "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = tmp[2][m] - "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + tmp024c = tmp[4][m] + "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = + // tmp[2][m] + "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + + // tmp024c = tmp[4][m] "vfmacc.vf v15, fa4, v11\n\t" // tmp135b * 32 + tmp135c "vse.v v26, (a2)\n\t" "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 = tmp[1][m] - "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = tmp[3][m] + "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 + // = tmp[1][m] + "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = + // tmp[3][m] - "vfadd.vv v29, v7, v15\n\t" // r07 + tmp135a + tmp135b * 32 + tmp135c + "vfadd.vv v29, v7, v15\n\t" // r07 + tmp135a + tmp135b * 32 + tmp135c "vse.v v25, (a1)\n\t" "vse.v v27, (a3)\n\t" "vse.v v29, (a5)\n\t" - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 6\n\t" // m = 6 - "slli t1, %5, 4\n\t" // t1 = 
out_w6 * 4 * 4bytes - "vle.v v16, (%6)\n\t" // load 4 channel bias data + "mv t5, %2\n\t" // tmp start addr + "li t0, 6\n\t" // m = 6 + "slli t1, %5, 4\n\t" // t1 = out_w6 * 4 * 4bytes + "vle.v v16, (%6)\n\t" // load 4 channel bias data - "3:\n\t" // shape : [6 * 8] * [6 * 8] = [6 * 6] + "3:\n\t" // shape : [6 * 8] * [6 * 8] = [6 * 6] "mv a0, %1\n\t" "addi a1, a0, 16\n\t" @@ -2828,48 +2718,49 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi a4, a3, 16\n\t" "addi a5, a4, 16\n\t" - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" - "vle.v v6, (t5)\n\t" // tmp[m][6] + "vle.v v6, (t5)\n\t" // tmp[m][6] "addi t5, t5, 16\n\t" - "vle.v v7, (t5)\n\t" // tmp[m][7] + "vle.v v7, (t5)\n\t" // tmp[m][7] "addi t5, t5, 16\n\t" //--------------------------------------------- - "vfadd.vv v8, v1, v2\n\t" // tmp[m][1] + tmp[m][2] = tmp024a - "vfsub.vv v9, v1, v2\n\t" // tmp[m][1] - tmp[m][2] = tmp135a + "vfadd.vv v8, v1, v2\n\t" // tmp[m][1] + tmp[m][2] = tmp024a + "vfsub.vv v9, v1, v2\n\t" // tmp[m][1] - tmp[m][2] = tmp135a - "vfadd.vv v10, v3, v4\n\t" // tmp[m][3] + tmp[m][4] = tmp024b - "vfsub.vv v11, v3, v4\n\t" // tmp[m][3] - tmp[m][4] = tmp135b + "vfadd.vv v10, v3, v4\n\t" // tmp[m][3] + tmp[m][4] = tmp024b + "vfsub.vv v11, v3, v4\n\t" // tmp[m][3] - tmp[m][4] = tmp135b - "vfadd.vv v12, v5, v6\n\t" // tmp[m][5] + tmp[m][6] = tmp024c - "vfsub.vv v13, v5, v6\n\t" // tmp[m][5] - tmp[m][6] = tmp135c + "vfadd.vv v12, v5, v6\n\t" // tmp[m][5] + tmp[m][6] 
= tmp024c + "vfsub.vv v13, v5, v6\n\t" // tmp[m][5] - tmp[m][6] = tmp135c - "vfadd.vv v0, v0, v8\n\t" // tmp[m][0] + tmp024a - "vfadd.vv v7, v7, v9\n\t" // tmp[m][7] + tmp135a - "vmv.v.v v14, v10\n\t" // v14 = tmp024b + "vfadd.vv v0, v0, v8\n\t" // tmp[m][0] + tmp024a + "vfadd.vv v7, v7, v9\n\t" // tmp[m][7] + tmp135a + "vmv.v.v v14, v10\n\t" // v14 = tmp024b - "vmv.v.v v26, v8\n\t" // v26 = tmp024a - "vmv.v.v v28, v8\n\t" // v28 = tmp024a + "vmv.v.v v26, v8\n\t" // v26 = tmp024a + "vmv.v.v v28, v8\n\t" // v28 = tmp024a "vfmacc.vf v26, fa1, v10\n\t" // tmp024a + tmp024b * 4 "vfmacc.vf v14, fa4, v12\n\t" // tmp024b + tmp024c * 32 "vfmacc.vf v28, fa3, v10\n\t" // tmp024a + tmp024b * 16 - "vmv.v.v v15, v13\n\t" // v15 = tmp135c - "vmv.v.v v25, v9\n\t" // v25 = tmp135a - "vmv.v.v v27, v9\n\t" // v27 = tmp135a - "vfadd.vv v24, v0, v14\n\t" // tmp[m][0] + tmp024a + tmp024b + tmp024c * 32 = tmp[0][m] + "vmv.v.v v15, v13\n\t" // v15 = tmp135c + "vmv.v.v v25, v9\n\t" // v25 = tmp135a + "vmv.v.v v27, v9\n\t" // v27 = tmp135a + "vfadd.vv v24, v0, v14\n\t" // tmp[m][0] + tmp024a + tmp024b + + // tmp024c * 32 = tmp[0][m] "vfmacc.vf v25, fa0, v11\n\t" // tmp135a + tmp135b * 2 "vfmacc.vf v27, fa2, v11\n\t" // tmp135a + tmp135b * 8 @@ -2877,19 +2768,24 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, //--------------------------------------------- "vfadd.vv v24, v24, v16\n\t" // + bias - "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = tmp[2][m] - "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + tmp024c = tmp[4][m] + "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = + // tmp[2][m] + "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + + // tmp024c = tmp[4][m] "vfmacc.vf v15, fa4, v11\n\t" // tmp135b * 32 + tmp135c "vse.v v24, (a0)\n\t" - "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 = tmp[1][m] - "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 
4 = tmp[3][m] + "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 + // = tmp[1][m] + "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = + // tmp[3][m] "vfadd.vv v26, v26, v16\n\t" // + bias "vfadd.vv v28, v28, v16\n\t" // + bias - "vfadd.vv v29, v7, v15\n\t" // tmp[m][7] + tmp135a + tmp135b * 32 + tmp135c + "vfadd.vv v29, v7, v15\n\t" // tmp[m][7] + tmp135a + tmp135b * 32 + + // tmp135c "vse.v v26, (a2)\n\t" "vse.v v28, (a4)\n\t" @@ -2909,73 +2805,64 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - :"=r"(output0_tm_0), // %0 - "=r"(output0), // %1 - "=r"(tmp1), // %2 - "=r"(ratio_ptr), // %3 - "=r"(tiles), // %4 - "=r"(out_w6), // %5 - "=r"(bias_tmp) // %6 - :"0"(output0_tm_0), - "1"(output0), - "2"(tmp1), - "3"(ratio_ptr), - "4"(tiles), - "5"(out_w6), - "6"(bias_tmp) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", "v25", "v26", "v27", "v28", "v29", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", - "fa0", "fa1", "fa2", "fa3", "fa4" - ); + : "=r"(output0_tm_0), // %0 + "=r"(output0), // %1 + "=r"(tmp1), // %2 + "=r"(ratio_ptr), // %3 + "=r"(tiles), // %4 + "=r"(out_w6), // %5 + "=r"(bias_tmp) // %6 + : "0"(output0_tm_0), "1"(output0), "2"(tmp1), "3"(ratio_ptr), "4"(tiles), + "5"(out_w6), "6"(bias_tmp) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", "v25", + "v26", "v27", "v28", "v29", "t0", "t1", "t2", "t5", "s1", "a0", "a1", + "a2", "a3", "a4", "a5", "fa0", "fa1", "fa2", "fa3", "fa4"); } } - csi_mem_free(tmp1); + shl_mem_free(tmp1); } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output_pack4to1(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); 
+ shl_c906_crop_output_pack4to1(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 6, + block_w * 6); output_data += output_size; - csi_mem_free(output_tm1_buf); + shl_mem_free(output_tm1_buf); } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } - - -void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 6x6 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); // kernel transform matrix: G - const float ktm[6][3] = { - { 1.0f/4, 0.0f, 0.0f}, - { -1.0f/6, -1.0f/6, -1.0f/6}, - { -1.0f/6, 1.0f/6, -1.0f/6}, - { 1.0f/24, 1.0f/12, 1.0f/6}, - { 1.0f/24, -1.0f/12, 1.0f/6}, - { 0.0f, 0.0f, 1.0f} - }; + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; // transform kernel const float *k0 = kernel0; @@ -2985,7 +2872,6 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_k // h : first compute the transport matrix tmp = (g * GT)T float tmp[6][3]; for (int 
i = 0; i < 6; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -2993,21 +2879,21 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_k // U for (int j = 0; j < 6; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 6; i++) { - kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } // [O, I, 6, 6] --> [O/4, 6*6, I, 4] - float *kernel_tm_pack4 = (float *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + float *kernel_tm_pack4 = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); t_kernel->data = kernel_tm_pack4; for (int oc = 0; oc < outch / 4; oc++) { - float *g0 = kernel_tm_pack4 + oc * 36 * inch * 4; const float *k0 = kernel_tm + oc * 36 * inch * 4; @@ -3016,13 +2902,10 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_k const float *k3 = k2 + 36 * inch; for (int k = 0; k < 36; k++) { - float *g00 = g0 + k * inch * 4; for (int ic = 0; ic < inch / 4; ic++) { - for (int i = 0; i < 4; i++) { - const float *k00 = k0 + (ic * 4 + i) * 36; const float *k10 = k1 + (ic * 4 + i) * 36; const float *k20 = k2 + (ic * 4 + i) * 36; @@ -3039,15 +2922,12 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_k } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } - -int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd43_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { 
float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -3061,7 +2941,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -3080,29 +2960,31 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, int block_h = (out_h + 3) / 4; int block_w = (out_w + 3) / 4; - int padded_in_h = block_h * 4 + 2; // block * 4 for alignment with 4,kernel = 3 * 3, stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 4 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3, stride = 1,thus input_size + 2 int padded_in_w = block_w * 4 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (float *)csi_mem_alloc(out_c * sizeof(float)); + bias_data = (float *)shl_mem_alloc(out_c * sizeof(float)); } - - for(int n = 0; n < batch; n++) { - + for (int n = 0; n < batch; n++) { // pad buffer: [in_c/4 h w 4] - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); // pad input - csi_c906_pad_input_pack1to4(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input_pack1to4(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); input_data += input_size; // input transform buffer1: [in_ch/4, 36, blocks, 6] - 
float *input_tm1_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *input_tm1_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(float)); /****************************** transform input *****************************/ /* @@ -3118,22 +3000,23 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, int tiles = block_h * block_w; - #pragma omp parallel for num_threads(1) - for(int q = 0; q < in_c / 4; q++) { - - float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * 4; // feature map after padding - q channel - float *img0_tm = input_tm1_buf + q * 36 * tiles * 4; // transform and interleave - q channel - - float *tmp = (float *)csi_mem_alloc(6 * 6 * 4 * sizeof(float)); +#pragma omp parallel for num_threads(1) + for (int q = 0; q < in_c / 4; q++) { + float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * + 4; // feature map after padding - q channel + float *img0_tm = + input_tm1_buf + q * 36 * tiles * 4; // transform and interleave - q channel - for(int i = 0; i < block_h; i++) { + float *tmp = (float *)shl_mem_alloc(6 * 6 * 4 * sizeof(float)); - for(int j = 0; j < block_w; j++) { - - float *r0 = img0 + (i * padded_in_w * 4 + j * 4) * 4; // feature map after padding 6*6 start addr - float *r0_tm = img0_tm + (i * block_w + j) * 4; // input_tm1 6*6 block start addr + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { + float *r0 = img0 + (i * padded_in_w * 4 + j * 4) * + 4; // feature map after padding 6*6 start addr + float *r0_tm = + img0_tm + (i * block_w + j) * 4; // input_tm1 6*6 block start addr - float ratio[] = {4, -4, 2, -2, -5}; // note: in fact cannot be output constrain + float ratio[] = {4, -4, 2, -2, -5}; // note: in fact cannot be output constrain float *ratio_ptr = ratio; asm volatile( @@ -3142,139 +3025,140 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "mv t5, %2\n\t" // t5 = tmp start addr "slli t1, %4, 4\n\t" // t1 = 
padded_in_w * 4 * 4bytes - "flw fa0, 0(%3)\n\t" // fa0 = 4 - "flw fa1, 4(%3)\n\t" // fa1 = -4 - "flw fa2, 8(%3)\n\t" // fa2 = 2 - "flw fa3, 12(%3)\n\t" // fa3 = -2 - "flw fa4, 16(%3)\n\t" // fa4 = -5 + "flw fa0, 0(%3)\n\t" // fa0 = 4 + "flw fa1, 4(%3)\n\t" // fa1 = -4 + "flw fa2, 8(%3)\n\t" // fa2 = 2 + "flw fa3, 12(%3)\n\t" // fa3 = -2 + "flw fa4, 16(%3)\n\t" // fa4 = -5 - "1:\n\t" - "mv s1, %0\n\t" // s1 = r00 addr + "1:\n\t" + "mv s1, %0\n\t" // s1 = r00 addr - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 96\n\t" // tmp[1][m] - "addi a2, a1, 96\n\t" // tmp[2][m] - "addi a3, a2, 96\n\t" // tmp[3][m] - "addi a4, a3, 96\n\t" // tmp[4][m] - "addi a5, a4, 96\n\t" // tmp[5][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 96\n\t" // tmp[1][m] + "addi a2, a1, 96\n\t" // tmp[2][m] + "addi a3, a2, 96\n\t" // tmp[3][m] + "addi a4, a3, 96\n\t" // tmp[4][m] + "addi a5, a4, 96\n\t" // tmp[5][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "addi s1, s1, 16\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "addi s1, s1, 16\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "addi s1, s1, 16\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "addi s1, s1, 16\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "addi s1, s1, 16\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "addi s1, s1, 16\n\t" "vmv.v.v v24, v4\n\t" "vmv.v.v v29, v5\n\t" //--------------------------------------------- - "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 - "vfmacc.vf v24, fa4, v2\n\t" // r04 + 4 * r00 - 5 * r02 + "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 + "vfmacc.vf v24, fa4, v2\n\t" // r04 + 4 * r00 - 5 * r02 "vse.v v24, (a0)\n\t" //--------------------------------------------- - "vfadd.vv v25, v3, v4\n\t" // r03 + r04 - "vfadd.vv v6, v1, v2\n\t" // r01 + r02 - "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) + "vfadd.vv v25, v3, v4\n\t" // r03 + r04 + "vfadd.vv v6, v1, v2\n\t" // r01 + r02 + 
"vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) "vse.v v25, (a1)\n\t" //--------------------------------------------- - "vfsub.vv v26, v4, v3\n\t" // r04 - r03 - "vfsub.vv v7, v1, v2\n\t" // r01 - r02 - "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) + "vfsub.vv v26, v4, v3\n\t" // r04 - r03 + "vfsub.vv v7, v1, v2\n\t" // r01 - r02 + "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) "vse.v v26, (a2)\n\t" //--------------------------------------------- - "vfsub.vv v8, v1, v3\n\t" // r01 - r03 - "vfsub.vv v27, v4, v2\n\t" // r04 - r02 - "vfsub.vv v28, v4, v2\n\t" // r04 - r02 + "vfsub.vv v8, v1, v3\n\t" // r01 - r03 + "vfsub.vv v27, v4, v2\n\t" // r04 - r02 + "vfsub.vv v28, v4, v2\n\t" // r04 - r02 - "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) + "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) "vse.v v27, (a3)\n\t" - "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) + "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 - "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 + "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 + "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 "vse.v v29, (a5)\n\t" //--------------------------------------------- - "add %0, %0, t1\n\t" // padding feature map 6*6 next line addr - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "add %0, %0, t1\n\t" // padding feature map 6*6 next line addr + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 6\n\t" // m = 6 + "mv t5, %2\n\t" // tmp start addr + "li t0, 6\n\t" // m = 6 - "slli t1, %5, 4\n\t" // t1 = tiles * 4 * 4 bytes - "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 4 channels * 4 bytes + "slli t1, %5, 4\n\t" // t1 = tiles * 4 * 4 bytes + "mulw t2, t0, t1\n\t" // t2 = tiles * 6 
blocks * 4 channels * 4 + // bytes - "3:\n\t" + "3:\n\t" - "mv a0, %1\n\t" // r0_tm_0 - "add a1, a0, t1\n\t" // r0_tm_1 - "add a2, a1, t1\n\t" // r0_tm_2 - "add a3, a2, t1\n\t" // r0_tm_3 - "add a4, a3, t1\n\t" // r0_tm_4 - "add a5, a4, t1\n\t" // r0_tm_5 + "mv a0, %1\n\t" // r0_tm_0 + "add a1, a0, t1\n\t" // r0_tm_1 + "add a2, a1, t1\n\t" // r0_tm_2 + "add a3, a2, t1\n\t" // r0_tm_3 + "add a4, a3, t1\n\t" // r0_tm_4 + "add a5, a4, t1\n\t" // r0_tm_5 - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" "vmv.v.v v24, v4\n\t" "vmv.v.v v29, v5\n\t" //--------------------------------------------- - "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 - "vfmacc.vf v24, fa4, v2\n\t" // r04 * 4 * r00 - 5 * r02 + "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 + "vfmacc.vf v24, fa4, v2\n\t" // r04 * 4 * r00 - 5 * r02 "vse.v v24, (a0)\n\t" //--------------------------------------------- - "vfadd.vv v25, v3, v4\n\t" // r03 + r04 - "vfadd.vv v6, v1, v2\n\t" // r01 + r02 - "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) + "vfadd.vv v25, v3, v4\n\t" // r03 + r04 + "vfadd.vv v6, v1, v2\n\t" // r01 + r02 + "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) "vse.v v25, (a1)\n\t" //--------------------------------------------- - "vfsub.vv v26, v4, v3\n\t" // r04 - r03 - "vfsub.vv v7, v1, v2\n\t" // r01 - r02 - "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) + "vfsub.vv v26, v4, v3\n\t" // r04 - r03 + "vfsub.vv v7, v1, v2\n\t" // r01 - r02 + "vfmacc.vf v26, 
fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) "vse.v v26, (a2)\n\t" //--------------------------------------------- - "vfsub.vv v8, v1, v3\n\t" // r01 - r03 - "vfsub.vv v27, v4, v2\n\t" // r04 - r02 - "vfsub.vv v28, v4, v2\n\t" // r04 - r02 + "vfsub.vv v8, v1, v3\n\t" // r01 - r03 + "vfsub.vv v27, v4, v2\n\t" // r04 - r02 + "vfsub.vv v28, v4, v2\n\t" // r04 - r02 - "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) + "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) "vse.v v27, (a3)\n\t" - "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) + "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 - "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 + "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 + "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 "vse.v v29, (a5)\n\t" //--------------------------------------------- @@ -3284,42 +3168,35 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - - :"=r"(r0), // %0 - "=r"(r0_tm), // %1 - "=r"(tmp), // %2 - "=r"(ratio_ptr), // %3 - "=r"(padded_in_w), // %4 - "=r"(tiles) // %5 - :"0"(r0), - "1"(r0_tm), - "2"(tmp), - "3"(ratio_ptr), - "4"(padded_in_w), - "5"(tiles) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "v26", "v27", "v28", "v29", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5" - ); - + : "=r"(r0), // %0 + "=r"(r0_tm), // %1 + "=r"(tmp), // %2 + "=r"(ratio_ptr), // %3 + "=r"(padded_in_w), // %4 + "=r"(tiles) // %5 + : "0"(r0), "1"(r0_tm), "2"(tmp), "3"(ratio_ptr), "4"(padded_in_w), + "5"(tiles) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v24", "v25", "v26", "v27", "v28", "v29", "t0", "t1", "t2", "t5", "s1", + "a0", "a1", "a2", "a3", "a4", "a5", "fa0", "fa1", "fa2", "fa3", 
"fa4", + "fa5"); } } - csi_mem_free(tmp); + shl_mem_free(tmp); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); /*********************************** dot ***************************************/ // reorder input_tm1_buf - float *input_tm2_buf = (float *)csi_mem_alloc(36 * tiles * in_c * sizeof(float)); + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int r = 0; r < 36; r++) { - float *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data int t = 0; for (; t + 7 < tiles; t += 8) { - float *tm2 = img_tm2 + t * in_c; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -3370,7 +3247,6 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, } tm1 += 36 * tiles * 4; } - } for (; t < tiles; t++) { float *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -3387,30 +3263,28 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, } } - csi_mem_free(input_tm1_buf); + shl_mem_free(input_tm1_buf); // output_dot_buf: [out_c/4, 36, blocks, 4] - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int p = 0; p < out_c / 4; p++) { - - float *output0_tm = output_dot_buf + p * 36 * tiles * 4; // 4 channel dot output - float *kernel0_tm = kernel_data + p * 36 * in_c * 4; // 4 channel kernel + float *output0_tm = output_dot_buf + p * 36 * tiles * 4; // 4 channel dot output + float *kernel0_tm = kernel_data + p * 36 * in_c * 4; // 4 channel kernel for (int r = 0; r < 36; r++) { - float *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel int t = 0; for (; t + 7 < tiles; t += 8) { - 
float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" @@ -3419,9 +3293,9 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "vmv.v.x v4, zero\n\t" "vmv.v.x v5, zero\n\t" "vmv.v.x v6, zero\n\t" - "vmv.v.x v7, zero\n\t" // clear + "vmv.v.x v7, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -3448,34 +3322,31 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v4, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v5, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v6, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v7, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v4, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v5, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v6, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v7, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" ); } @@ -3485,13 +3356,13 @@ int 
csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" "vmv.v.x v2, zero\n\t" - "vmv.v.x v3, zero\n\t" // clear + "vmv.v.x v3, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -3510,25 +3381,22 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", + "t0"); } for (; t + 1 < tiles; t += 2) { float *r0 = img_tm2 + t * in_c; @@ -3536,11 +3404,11 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" - "vmv.v.x v1, zero\n\t" // clear + "vmv.v.x v1, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -3555,33 +3423,28 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - 
"=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0"); } for (; t < tiles; t++) { - float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c - "vmv.v.x v0, zero\n\t" // clear + "mv t0, %3\n\t" // t0 = in_c + "vmv.v.x v0, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "addi %0, %0, 4\n\t" @@ -3594,30 +3457,24 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "fa0", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "fa0", "t0"); } - } - } - csi_mem_free(input_tm2_buf); + shl_mem_free(input_tm2_buf); /*************************** transform output ****************************/ // output_tm1_buf: [out_c/4, out_h4, out_w4, 4] - float *output_tm1_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); /* AT = { @@ -3628,124 +3485,124 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, }; */ - #pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / 4; p++) - { - +#pragma omp 
parallel for num_threads(1) + for (int p = 0; p < out_c / 4; p++) { float *bias_tmp = bias_data + p * 4; - float *out0_tm = output_dot_buf + p * 36 * block_h * block_w * 4; // 输出转换前/dot后 第p个channel - float *out0 = output_tm1_buf + p * 4*block_h * 4*block_w * 4; // 转换后输出 第p个channel + float *out0_tm = + output_dot_buf + p * 36 * block_h * block_w * 4; // 输出转换前/dot后 第p个channel + float *out0 = + output_tm1_buf + p * 4 * block_h * 4 * block_w * 4; // 转换后输出 第p个channel - float *tmp1 = (float *)csi_mem_alloc(4 * 6 * 4 * sizeof(float)); + float *tmp1 = (float *)shl_mem_alloc(4 * 6 * 4 * sizeof(float)); int out_w4 = block_w * 4; for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { + float *output0_tm_0 = out0_tm + (i * block_w + j) * 4; // 6*6 起始地址 - float *output0_tm_0 = out0_tm + (i * block_w + j) * 4; // 6*6 起始地址 - - float *output0 = out0 + (i * block_w * 4 * 4 + j * 4) * 4; // 输出 4*4 的起始地址 + float *output0 = + out0 + (i * block_w * 4 * 4 + j * 4) * 4; // 输出 4*4 的起始地址 float ratio[] = {2.0, 4.0, 8.0}; float *ratio_ptr = ratio; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "li t0, 6\n\t" // m = 6 - "mv t5, %2\n\t" // t5 = tmp start addr - "slli t1, %4, 4\n\t" // t1 = tiles * 4 * 4 - "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 4 channels * 4 bytes + "li t0, 6\n\t" // m = 6 + "mv t5, %2\n\t" // t5 = tmp start addr + "slli t1, %4, 4\n\t" // t1 = tiles * 4 * 4 + "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 4 channels * 4 + // bytes - "flw fa0, 0(%3)\n\t" // fa0 = 2 - "flw fa1, 4(%3)\n\t" // fa1 = 4 - "flw fa2, 8(%3)\n\t" // fa2 = 8 + "flw fa0, 0(%3)\n\t" // fa0 = 2 + "flw fa1, 4(%3)\n\t" // fa1 = 4 + "flw fa2, 8(%3)\n\t" // fa2 = 8 "mv s1, %0\n\t" - "1:\n\t" // shape : [4 * 6] * [6 * 6] = [4 * 6] + "1:\n\t" // shape : [4 * 6] * [6 * 6] = [4 * 6] - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 96\n\t" // tmp[1][m] - "addi a2, a1, 96\n\t" // tmp[2][m] - "addi a3, a2, 96\n\t" // tmp[3][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 96\n\t" // 
tmp[1][m] + "addi a2, a1, 96\n\t" // tmp[2][m] + "addi a3, a2, 96\n\t" // tmp[3][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "add s1, s1, t1\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "add s1, s1, t1\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "add s1, s1, t1\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "add s1, s1, t1\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "add s1, s1, t1\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "add s1, s1, t1\n\t" //--------------------------------------------- - "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a - "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a + "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a + "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a - "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b - "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b - "vmv.v.v v25, v6\n\t" // v25 = tmp13a + "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b + "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b + "vmv.v.v v25, v6\n\t" // v25 = tmp13a //--------------------------------------------- - "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a - "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b + "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a + "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b "vse.v v24, (a0)\n\t" - "vfmacc.vf v25, fa0, v8\n\t" // tmp13a + 2 * tmp13b + "vfmacc.vf v25, fa0, v8\n\t" // tmp13a + 2 * tmp13b "vse.v v25, (a1)\n\t" - "vfmacc.vf v26, fa1, v7\n\t" // tmp02a + 4 * tmp02b + "vfmacc.vf v26, fa1, v7\n\t" // tmp02a + 4 * tmp02b "vse.v v26, (a2)\n\t" - "vfadd.vv v27, v5, v6\n\t" // r05 + tmp13a - "vfmacc.vf v27, fa2, v8\n\t" // r05 + tmp13a * 8 tmp13b + "vfadd.vv v27, v5, v6\n\t" // r05 + tmp13a + "vfmacc.vf v27, fa2, v8\n\t" // r05 + tmp13a * 8 tmp13b "vse.v v27, (a3)\n\t" //--------------------------------------------- - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "addi t5, t5, 16\n\t" // tmp[0][0] --> 
tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 4\n\t" // m = 4 - "slli t1, %5, 4\n\t" // t1 = out_w4 * 4 * 4bytes - "vle.v v16, (%6)\n\t" // load 4 channel bias data + "mv t5, %2\n\t" // tmp start addr + "li t0, 4\n\t" // m = 4 + "slli t1, %5, 4\n\t" // t1 = out_w4 * 4 * 4bytes + "vle.v v16, (%6)\n\t" // load 4 channel bias data - "3:\n\t" // shape : [4 * 6] * [6 * 4] = [4 * 4] + "3:\n\t" // shape : [4 * 6] * [6 * 4] = [4 * 4] "mv a0, %1\n\t" "addi a1, a0, 16\n\t" "addi a2, a1, 16\n\t" "addi a3, a2, 16\n\t" - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" //--------------------------------------------- - "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a - "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a + "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a + "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a - "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b - "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b - "vmv.v.v v25, v6\n\t" // v25 = tmp13a + "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b + "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b + "vmv.v.v v25, v6\n\t" // v25 = tmp13a //--------------------------------------------- "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b @@ -3770,58 +3627,49 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - :"=r"(output0_tm_0), // %0 - "=r"(output0), // 
%1 - "=r"(tmp1), // %2 - "=r"(ratio_ptr), // %3 - "=r"(tiles), // %4 - "=r"(out_w4), // %5 - "=r"(bias_tmp) // %6 - :"0"(output0_tm_0), - "1"(output0), - "2"(tmp1), - "3"(ratio_ptr), - "4"(tiles), - "5"(out_w4), - "6"(bias_tmp) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v24", "v25", "v26", "v27", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", - "fa0", "fa1", "fa2" - ); + : "=r"(output0_tm_0), // %0 + "=r"(output0), // %1 + "=r"(tmp1), // %2 + "=r"(ratio_ptr), // %3 + "=r"(tiles), // %4 + "=r"(out_w4), // %5 + "=r"(bias_tmp) // %6 + : "0"(output0_tm_0), "1"(output0), "2"(tmp1), "3"(ratio_ptr), "4"(tiles), + "5"(out_w4), "6"(bias_tmp) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v16", "v24", "v25", "v26", "v27", "t0", "t1", "t2", "t5", "s1", "a0", + "a1", "a2", "a3", "fa0", "fa1", "fa2"); } } - csi_mem_free(tmp1); + shl_mem_free(tmp1); } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output_pack4to1(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 4, block_w * 4); + shl_c906_crop_output_pack4to1(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 4, + block_w * 4); output_data += output_size; - csi_mem_free(output_tm1_buf); + shl_mem_free(output_tm1_buf); } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } -void csi_c906_conv3x3s1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +void shl_c906_conv3x3s1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { /* to do */ } -void csi_c906_conv3x3s2(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct 
conv2d_params *params) +void shl_c906_conv3x3s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { /* to do */ } diff --git a/source/c906_opt/convolution_gemm_fp16.c b/source/c906_opt/convolution_gemm_fp16.c index 41573054..82ff0bcf 100644 --- a/source/c906_opt/convolution_gemm_fp16.c +++ b/source/c906_opt/convolution_gemm_fp16.c @@ -16,36 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -/* - pack kernel_data inplace, means the origin kernel_data be destoried. - The reason to do this is that the packaging process must not consume more memory. -*/ -void csi_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params) +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { __fp16 *kernel_data = (__fp16 *)kernel->data; int group = params->group; - int m = kernel->dim[0] / group; // m = out_ch / group + int m = kernel->dim[0] / group; // m = out_ch / group int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(group * m * k * sizeof(__fp16)); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); for (int g = 0; g < group; g++) { - csi_c906_reorder_kernel_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_c906_reorder_kernel_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv_im2col_sgemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -75,29 +73,32 @@ int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, int32_t k = channel_col; int32_t n = out_height * out_width; - __fp16 *im2col_data = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); - __fp16* pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - if(pad_if_zero) - { + if (pad_if_zero) { for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { // im2col - for(int c = 0; c < 
channel_col; ++c) { + for (int c = 0; c < channel_col; ++c) { int w_offset = c % ksize_w; int h_offset = c / ksize_w % ksize_h; int c_im = c / ksize_h / ksize_w; - for(int h = 0; h < out_height; ++h) { - for(int w = 0; w < out_width; ++w) { + for (int h = 0; h < out_height; ++h) { + for (int w = 0; w < out_width; ++w) { int im_row = h_offset + h * stride_h; int im_col = w_offset + w * stride_w; - int col_index = (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] + int col_index = (c * out_height + h) * out_width + + w; // [channel_col, out_h, out_w] im_row = im_row - params->pad_top; im_col = im_col - params->pad_left; - if(im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { + if (im_row < 0 || im_col < 0 || im_row >= in_height || + im_col >= in_width) { im2col_data[col_index] = 0.0f; } else { - im2col_data[col_index] = input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + im_col]; + im2col_data[col_index] = + input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + + im_col]; } } } @@ -108,25 +109,24 @@ int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, __fp16 *pc = output_data; // pack - csi_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); + shl_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); // GEMM - csi_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - } - else{ + } else { for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { // im2col - for(int c = 0; c < channel_col; ++c) { + for (int c = 0; c < channel_col; ++c) { int w_offset = c % ksize_w; int h_offset = c / ksize_w % ksize_h; int c_im = c / ksize_h / ksize_w; int input_h = c_im * in_height; - int im_row =h_offset; - int col_index_tmp = (c * out_height ) * out_width; + int im_row = h_offset; + int col_index_tmp = (c * out_height) * out_width; for (int h = 
0; h < out_height; ++h) { int im_col = w_offset; @@ -165,18 +165,18 @@ int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, __fp16 *pc = output_data; // pack - csi_nn_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); - // csi_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); + shl_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); + // shl_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x16_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); - // csi_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + // shl_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } } - - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); + + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); return CSINN_TRUE; } diff --git a/source/c906_opt/convolution_relu.c b/source/c906_opt/convolution_relu.c index 5a2c1e0d..eb55361d 100644 --- a/source/c906_opt/convolution_relu.c +++ b/source/c906_opt/convolution_relu.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* only support layout:NCHW @@ -26,11 +26,9 @@ kernel layout: O I h w output layout: N O H W */ -int csi_c906_conv2d_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t out_c = kernel->dim[0]; int32_t in_c = kernel->dim[1]; @@ -42,37 +40,25 @@ int csi_c906_conv2d_relu_init(struct csi_tensor *input, int32_t stride_w = params->stride_width; int32_t dalition_h = params->dilation_height; int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; - if(kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && dalition_w == 1) { - - csi_c906_conv1x1s1_sgemm_transform_kernel(kernel, params); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c906_conv1x1s1_sgemm_transform_kernel(kernel, params); params->conv_extra.conv_mode = CSINN_GEMM; - params->base.bc = csi_c906_conv1x1s1_sgemm_fuse_relu; - - // } else if(kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && dalition_w == 1) { - - // struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - // conv3x3s1_winograd64_transform_kernel_1(kernel, t_kernel); - // params->conv_extra.kernel_tm = t_kernel; - // params->conv_extra.conv_mode = CSINN_WINOGRAD; - // params->base.bc = conv3x3s1_winograd64_1; - + cb->exec = shl_c906_conv1x1s1_sgemm_fuse_relu; } else { - - csi_c906_conv_im2col_sgemm_transform_kernel(kernel, params); + shl_c906_conv_im2col_sgemm_transform_kernel(kernel, params); params->conv_extra.conv_mode = CSINN_GEMM; - params->base.bc = 
csi_c906_conv_im2col_sgemm_fuse_relu; + cb->exec = shl_c906_conv_im2col_sgemm_fuse_relu; } return CSINN_TRUE; } - -int csi_c906_depthwise_conv2d_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_depthwise_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t batch = input->dim[0]; int32_t in_ch = input->dim[1]; @@ -87,22 +73,22 @@ int csi_c906_depthwise_conv2d_relu_init(struct csi_tensor *input, int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { - params->base.bc = csi_c906_dwconv3x3s1_fuse_relu; + cb->exec = shl_c906_dwconv3x3s1_fuse_relu; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { - params->base.bc = csi_c906_dwconv3x3s2_fuse_relu; + cb->exec = shl_c906_dwconv3x3s2_fuse_relu; } else if (kernel_h == 5 && kernel_w == 5 && stride_h == 1 && stride_w == 1) { - params->base.bc = csi_c906_dwconv5x5s1_fuse_relu; + cb->exec = shl_c906_dwconv5x5s1_fuse_relu; } else if (kernel_h == 5 && kernel_w == 5 && stride_h == 2 && stride_w == 2) { - params->base.bc = csi_c906_dwconv5x5s2_fuse_relu; + cb->exec = shl_c906_dwconv5x5s2_fuse_relu; } else { - params->base.bc = csi_ref_depthwise_conv2d_relu_f32; - + cb->exec = shl_ref_depthwise_conv2d_relu_f32; } return CSINN_TRUE; diff --git a/source/c906_opt/convolution_sgemm.c b/source/c906_opt/convolution_sgemm_fp32.c similarity index 54% rename from source/c906_opt/convolution_sgemm.c rename to source/c906_opt/convolution_sgemm_fp32.c index 509bf595..41e4b68c 100644 --- a/source/c906_opt/convolution_sgemm.c +++ b/source/c906_opt/convolution_sgemm_fp32.c @@ -16,38 +16,34 @@ * 
limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -/* - pack kernel_data inplace, means the origin kernel_data be destoried. - The reason to do this is that the packaging process must not consume more memory. -*/ -void csi_c906_conv_im2col_sgemm_transform_kernel(struct csi_tensor *kernel, - struct conv2d_params *params) +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c906_conv_im2col_sgemm_transform_kernel(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { float *kernel_data = (float *)kernel->data; int group = params->group; - int m = kernel->dim[0] / group; // m = out_ch / group + int m = kernel->dim[0] / group; // m = out_ch / group int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - float *pa_reorder = (float *)csi_mem_alloc(group * m * k * sizeof(float)); + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); for (int g = 0; g < group; g++) { - csi_c906_reorder_kernel(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_c906_reorder_kernel(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } - -static int csi_c906_conv_im2col_sgemm_base(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params, - bool fuse_relu) +static int shl_c906_conv_im2col_sgemm_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, 
bool fuse_relu) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -77,28 +73,30 @@ static int csi_c906_conv_im2col_sgemm_base(struct csi_tensor *input, int32_t k = channel_col; int32_t n = out_height * out_width; - float *im2col_data = (float *)csi_mem_alloc(k * n * sizeof(float)); - float* pb_reorder = (float *)csi_mem_alloc(k * n * sizeof(float)); + float *im2col_data = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { - // im2col - for(int c = 0; c < channel_col; ++c) { + for (int c = 0; c < channel_col; ++c) { int w_offset = c % ksize_w; int h_offset = c / ksize_w % ksize_h; int c_im = c / ksize_h / ksize_w; - for(int h = 0; h < out_height; ++h) { - for(int w = 0; w < out_width; ++w) { + for (int h = 0; h < out_height; ++h) { + for (int w = 0; w < out_width; ++w) { int im_row = h_offset + h * stride_h; int im_col = w_offset + w * stride_w; - int col_index = (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] + int col_index = + (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] im_row = im_row - params->pad_top; im_col = im_col - params->pad_left; - if(im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { + if (im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { im2col_data[col_index] = 0.0f; } else { - im2col_data[col_index] = input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + im_col]; + im2col_data[col_index] = + input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + + im_col]; } } } @@ -109,35 +107,30 @@ static int csi_c906_conv_im2col_sgemm_base(struct csi_tensor *input, float *pc = output_data; // pack - csi_c906_reorder_input_1(im2col_data, pb, k, n, n); + shl_c906_reorder_input_1(im2col_data, pb, k, n, n); // GEMM - csi_c906_sgemm_kernel_f32(pc, pa, pb, m, k, n, n, bias_data + g * m, 
fuse_relu); + shl_c906_sgemm_kernel_f32(pc, pa, pb, m, k, n, n, bias_data + g * m, fuse_relu); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); return CSINN_TRUE; } -int csi_c906_conv_im2col_sgemm(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv_im2col_sgemm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { bool fuse_relu = 0; - return csi_c906_conv_im2col_sgemm_base(input, output, kernel, bias, params, fuse_relu); + return shl_c906_conv_im2col_sgemm_base(input, output, kernel, bias, params, fuse_relu); } - -int csi_c906_conv_im2col_sgemm_fuse_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv_im2col_sgemm_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { bool fuse_relu = 1; - return csi_c906_conv_im2col_sgemm_base(input, output, kernel, bias, params, fuse_relu); + return shl_c906_conv_im2col_sgemm_base(input, output, kernel, bias, params, fuse_relu); } diff --git a/source/c906_opt/depthwise_convolution_3x3.c b/source/c906_opt/depthwise_convolution_3x3.c deleted file mode 100644 index e7dcd292..00000000 --- a/source/c906_opt/depthwise_convolution_3x3.c +++ /dev/null @@ -1,970 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" - -#ifndef DWCONV3X3S1 -#define DWCONV3X3S1 csi_c906_dwconv3x3s1 -#endif - -#ifndef DWCONV3X3S2 -#define DWCONV3X3S2 csi_c906_dwconv3x3s2 -#endif - - - -/* - (1) Algorithm works as follows: - out_h2: out_h2_w8_loop --> out_h2_w4 --> out_h2_wtail - out_h_tail: out_h1_w8_loop --> out_h1_w4 --> out_h1_wtail - - out_h2_w8: out_h2_w4: || out_h1_w8: out_h1_w4: - outptr0[0-7]: outptr1[0-7]: outptr0[0-3]: outptr1[0-3] || outptr0[0-7]: outptr0[0-3]: - k00 * r0[0-7] k00 * r1[0-7] k00 * r0[0-3] k00 * r1[0-3] || k00 * r0[0-7] k00 * r0[0-3] - k01 * r0[1-8] k01 * r1[1-8] k01 * r0[1-4] k01 * r1[1-4] || k01 * r0[1-8] k01 * r0[1-4] - k02 * r0[2-9] k02 * r1[2-9] k02 * r0[2-5] k02 * r1[2-5] || k02 * r0[2-9] k02 * r0[2-5] - k10 * r1[0-7] k10 * r2[0-7] k10 * r1[0-3] k10 * r2[0-3] || k10 * r1[0-7] k10 * r1[0-3] - k11 * r1[1-8] k11 * r2[1-8] k11 * r1[1-4] k11 * r2[1-4] || k11 * r1[1-8] k11 * r1[1-4] - k12 * r1[2-9] k12 * r2[2-9] k12 * r1[2-5] k12 * r2[2-5] || k12 * r1[2-9] k12 * r1[2-5] - k20 * r2[0-7] k20 * r3[0-7] k20 * r2[0-3] k20 * r3[0-3] || k20 * r2[0-7] k20 * r2[0-3] - k21 * r2[1-8] k21 * r3[1-8] k21 * r2[1-4] k21 * r3[1-4] || k21 * r2[1-8] k21 * r2[1-4] - k22 * r2[2-9] k22 * r3[2-9] k22 * r2[2-5] k22 * r3[2-5] || k22 * r2[2-9] k22 * r2[2-5] - - h2_w8_loop execution process: - - load r0[0-7] --> load r0[1-8] --> load r0[2-9] --> // Load r0[0-7] r0[1-8] r0[-9] before the loop to facilitate pipeline work - - --> load bias0[0-7] --> load r3[0-7] --> load bias1[0-7] --> load r3[1-8] --> k00*r0[0-7] / k20*r3[0-7] --> 
- - - - load r3[2-9] --> k01*r0[1-8] / k21*r3[1-8] --> load r1[0-7] --> k02*r0[2-9] / k22*r3[2-9] --> load r1[1-8] --> k10*r1[0-7] / k00*r1[0-7] --> - - - - load r1[2-9] --> k11*r1[1-8] / k01*r1[1-8] --> load r2[0-7] --> k12*r1[2-9] / k02*r1[2-9] --> load r2[1-8] --> k20*r2[0-7] / k10*r2[0-7] --> - - - - load r2[2-9] --> k21*r2[1-8] / k11*r2[1-8] --> load r0[0-7] --> k22*r2[2-9] / k12*r2[2-9] --> load r0[1-8] --> load r0[2-9] ---------------- - - - - ----------------------------------------------------------------------------------------------------------------------------------------------------------- - - - h1_w8_loop execution process: - - load r0[0-7] --> load r0[1-8] --> load r0[2-9] --> - - --> load bias0[0-7] --> k00*r0[0-7] --> load r1[0-7] --> k01*r0[1-8] --> load r1[1-8] --> k02*r0[2-9] --> load r1[2-9] --> k10*r1[0-7] --> - - - - load r2[0-7] --> k11*r1[1-8] --> load r2[1-8] --> k12*r1[2-9] --> load r2[2-9] --> k20*r2[0-7] --> load r0[0-7] --> k21*r2[1-8] --> - - - - load r0[1-8] --> k22*r2[2-9] --> load r0[2-9] ------------------------------------------------------------------------------------------------- - - - - -------------------------------------------------------------------------------------------------------------------------------------------------------- - - (2) register definition: - t0: i_out_h - t1-t2: i_out_w - v0-v1: bias0[0-7], output_data(acc) - v2-v3: bias1[0-7], output_data(acc) - v4-v9: r0 v4,v5:r0[0-7] v6,v7:r0[1-8] v8,v9:r0[2-9] - v10-v15: r3 - v16-v21: r1 - v22-v27: r2 - ft0-ft8: [ k00,k01,k02,k10,k11,k12,k20,k21,k22 ] - ft11: constant float 0.0f, used by fusing relu - - (3) // TODO: support channel mult ?? 
- opt padding - -*/ - -int DWCONV3X3S1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - - int32_t out_c = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - float *input_padd_buf = (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); - - in_h = in_h + params->pad_top + params->pad_down; - in_w = in_w + params->pad_left + params->pad_right; - -#pragma omp parallel for num_threads(1) - for (int c = 0; c < in_c; c++) { - float *out = output_data + c * out_h * out_w; - float *outptr0 = out; - float *outptr1 = outptr0 + out_w; - - const float bias0 = bias_data ? 
bias_data[c] : 0.0f; - - const float *img0 = input_padd_buf + c * in_h * in_w; - const float *r0 = img0; - const float *r1 = r0 + in_w; - const float *r2 = r1 + in_w; - const float *r3 = r2 + in_w; - - const float *kernel0 = kernel_data + c * 9; - -#if __riscv_vector == 128 - - asm volatile( - "vsetvli zero, zero, e32, m2\n\t" - -#ifdef FUSE_CONV_RELU - "fmv.w.x ft11, zero\n\t" -#endif // FUSE_CONV_RELU - - "flw ft0, 0(%0)\n\t" // k00 - "flw ft1, 4(%0)\n\t" // k01 - "flw ft2, 8(%0)\n\t" // k02 - "flw ft3, 12(%0)\n\t" // k10 - "flw ft4, 16(%0)\n\t" // k11 - "flw ft5, 20(%0)\n\t" // k12 - "flw ft6, 24(%0)\n\t" // k20 - "flw ft7, 28(%0)\n\t" // k21 - "flw ft8, 32(%0)\n\t" // k22 - - "srai t0, %7, 1\n\t" // t0 = out_h >> 1 - "beqz t0, 7f\n\t" - - "1:\n\t" // out_h_loop2 - - "srai t1, %8, 3\n\t" // t1 = out_w >> 3 - "beqz t1, 3f\n\t" - - "vsetvli zero, zero, e32, m2\n\t" // set vl = 8 - "vlw.v v4, (%1)\n\t" // r0[0-7] - "addi %1, %1, 4\n\t" // r0++ - "vlw.v v6, (%1)\n\t" // r0[1-8] - "addi %1, %1, 4\n\t" // r0++ - "vlw.v v8, (%1)\n\t" // r0[2-9] - - "2:\n\t" // out_w_loop8 - - "vfmv.v.f v0, %20\n\t" // bias0[0-7] - "addi %1, %1, 24\n\t" // r0 += 6 - - "vlw.v v10, (%4)\n\t" // r3[0-7] - "addi %4, %4, 4\n\t" // r3++ - "vfmv.v.f v2, %20\n\t" // bias1[0-7] - - "vlw.v v12, (%4)\n\t" // r3[1-8] - "addi %4, %4, 4\n\t" // r3++ - - "vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-7] - "vfmacc.vf v2, ft6, v10\n\t" // k20 * r3[0-7] - - "vlw.v v14, (%4)\n\t" // r3[2-9] - "addi %4, %4, 24\n\t" // r3 += 6 - - "vfmacc.vf v0, ft1, v6\n\t" // k01 * r0[1-8] - "vfmacc.vf v2, ft7, v12\n\t" // k21 * r3[1-8] - - "vlw.v v16, (%2)\n\t" // r1[0-7] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft2, v8\n\t" // k02 * r0[2-9] - "vfmacc.vf v2, ft8, v14\n\t" // k22 * r3[2-9] - - "vlw.v v18, (%2)\n\t" // r1[1-8] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-7] - "vfmacc.vf v2, ft0, v16\n\t" // k00 * r1[0-7] - - "vlw.v v20, (%2)\n\t" // r1[2-9] - "addi %2, %2, 24\n\t" // r1 
+= 6 - - "vfmacc.vf v0, ft4, v18\n\t" // k11 * r1[1-8] - "vfmacc.vf v2, ft1, v18\n\t" // k01 * r1[1-8] - - "vlw.v v22, (%3)\n\t" // r2[0-7] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft5, v20\n\t" // k12 * r1[2-9] - "vfmacc.vf v2, ft2, v20\n\t" // k02 * r1[2-9] - - "vlw.v v24, (%3)\n\t" // r2[1-8] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-7] - "vfmacc.vf v2, ft3, v22\n\t" // k10 * r2[0-7] - - "vlw.v v26, (%3)\n\t" // r2[2-9] - "addi %3, %3, 24\n\t" // r2 += 6 - - "vfmacc.vf v0, ft7, v24\n\t" // k21 * r2[1-8] - "vfmacc.vf v2, ft4, v24\n\t" // k11 * r2[1-8] - - "vlw.v v4, (%1)\n\t" // r0[0-7] load r0 for next loop - "addi %1, %1, 4\n\t" // r0++ - - "vfmacc.vf v0, ft8, v26\n\t" // k22 * r2[2-9] - - "vlw.v v6, (%1)\n\t" // r0[1-8] - "addi %1, %1, 4\n\t" // r0++ - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%5)\n\t" // store line0 8 elements on outptr0 - "addi %5, %5, 32\n\t" // outptr0 += 8 - - "vfmacc.vf v2, ft5, v26\n\t" // k12 * r2[2-9] - - "vlw.v v8, (%1)\n\t" // r0[2-9] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v2, v2, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v2, (%6)\n\t" // store line1 8 elements on outptr1 - "addi %6, %6, 32\n\t" // outptr1 += 8 - - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" - - "addi %1, %1, -8\n\t" // r0 -= 2 ********* bump r0 to origin addr ************ - - "3:\n\t" // out_w4 // h2循环中只有执行一次的机会 - "andi t1, %8, 7\n\t" // t1 = out_w & 7 - "srai t2, t1, 2\n\t" // t2 = (out_w & 7) >> 2 - "beqz t2, 4f\n\t" - - "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 - - "vlw.v v4, (%1)\n\t" // r0[0-3] - "addi %1, %1, 4\n\t" // r0++ - - "vfmv.v.f v0, %20\n\t" // bias0[0-3] - - "vlw.v v10, (%4)\n\t" // r3[0-3] - "addi %4, %4, 4\n\t" // r3++ - - "vfmv.v.f v2, %20\n\t" // bias1[0-3] - - "vlw.v v5, (%1)\n\t" // r0[1-4] - "addi %1, %1, 4\n\t" // r0++ - - "vlw.v v11, (%4)\n\t" // r3[1-4] - "addi %4, %4, 4\n\t" // r3++ - - "vfmacc.vf 
v0, ft0, v4\n\t" // k00 * r0[0-3] - "vfmacc.vf v2, ft6, v10\n\t" // k20 * r3[0-3] - - "vlw.v v6, (%1)\n\t" // r0[2-5] - "addi %1, %1, 8\n\t" // r0 += 2 - - "vlw.v v12, (%4)\n\t" // r3[2-5] - "addi %4, %4, 8\n\t" // r3 += 2 - - "vfmacc.vf v0, ft1, v5\n\t" // k01 * r0[1-4] - "vfmacc.vf v2, ft7, v11\n\t" // k21 * r3[1-4] - - "vlw.v v16, (%2)\n\t" // r1[0-3] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft2, v6\n\t" // k02 * r0[2-5] - "vfmacc.vf v2, ft8, v12\n\t" // k22 * r3[2-5] - - "vlw.v v17, (%2)\n\t" // r1[1-4] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-3] - "vfmacc.vf v2, ft0, v16\n\t" // k00 * r1[0-3] - - "vlw.v v18, (%2)\n\t" // r1[2-5] - "addi %2, %2, 8\n\t" // r1 += 2 - - "vfmacc.vf v0, ft4, v17\n\t" // k11 * r1[1-4] - "vfmacc.vf v2, ft1, v17\n\t" // k01 * r1[1-4] - - "vlw.v v22, (%3)\n\t" // r2[0-3] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft5, v18\n\t" // k12 * r1[2-5] - "vfmacc.vf v2, ft2, v18\n\t" // k02 * r1[2-5]] - - "vlw.v v23, (%3)\n\t" // r2[1-4] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-3] - "vfmacc.vf v2, ft3, v22\n\t" // k10 * r2[0-3] - - "vlw.v v24, (%3)\n\t" // r2[2-5] - "addi %3, %3, 8\n\t" // r2 += 2 - - "vfmacc.vf v0, ft7, v23\n\t" // k21 * r2[1-4] - "vfmacc.vf v2, ft4, v23\n\t" // k11 * r2[1-4] - - "vfmacc.vf v0, ft8, v24\n\t" // k22 * r2[2-5] - "vfmacc.vf v2, ft5, v24\n\t" // k12 * r2[2-5] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** - "vfmax.vf v2, v2, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%5)\n\t" // store line0 4 elements on outptr0 - "addi %5, %5, 16\n\t" // outptr0 += 4 - "vsw.v v2, (%6)\n\t" // store line1 4 elements on outptr1 - "addi %6, %6, 16\n\t" // outptr1 += 4 - - "4:\n\t" // out_w_tail - "andi t2, t1, 3\n\t" // t2 = (out_w & 7) & 3 - "beqz t2, 6f\n\t" - - "vfmv.v.f v0, %20\n\t" // bias0[0-3] / bias1[0-3] - "li t5, 3\n\t" - "vsetvli zero, t5, e32, m1\n\t" // set vl = 3 - - "vlw.v v5, 
(%0)\n\t" // k0 - "addi %0, %0, 12\n\t" - "vlw.v v6, (%0)\n\t" // k1 - "addi %0, %0, 12\n\t" - "vlw.v v7, (%0)\n\t" // k2 - - "5:\n\t" // out_w_tail - - "vlw.v v4, (%1)\n\t" // r0 - "addi %1, %1, 4\n\t" // r0++ - - "vlw.v v16, (%2)\n\t" // r1 - "addi %2, %2, 4\n\t" // r1++ - - "vlw.v v22, (%3)\n\t" // r2 - "addi %3, %3, 4\n\t" // r2++ - - "vlw.v v10, (%4)\n\t" // r3 - "addi %4, %4, 4\n\t" // r3++ - - "vfmul.vv v8, v4, v5\n\t" // r0 * k0 - "vfmacc.vv v8, v16, v6\n\t" // += r1 * k1 - "vfmacc.vv v8, v22, v7\n\t" // += r2 * k2 - - "vfredsum.vs v11, v8, v0\n\t" // v11[0] = v0[0] + sum(v8[0..2]) - "vfmv.f.s ft9, v11\n\t" // ft9 = v11[0] - - - "vfmul.vv v9, v16, v5\n\t" // r1 * k0 - "vfmacc.vv v9, v22, v6\n\t" // += r2 * k1 - "vfmacc.vv v9, v10, v7\n\t" // += r3 * k2 - - "vfredsum.vs v12, v9, v0\n\t" // v12[0] = v0[0] + sum(v9[0..2]) - "vfmv.f.s ft10, v12\n\t" // ft10 = v12[0] - -#ifdef FUSE_CONV_RELU - "fmax.s ft9, ft9, ft11\n\t" // **** relu **** - "fmax.s ft10, ft10, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "fsw ft9, 0(%5)\n\t" - "addi %5, %5, 4\n\t" - "fsw ft10, 0(%6)\n\t" - "addi %6, %6, 4\n\t" - - "addi t2, t2, -1\n\t" - "bnez t2, 5b\n\t" - - "addi %0, %0, -24\n\t" // kernel -= 6 ********* bump kernel_data to origin addr ************ - - "6:\n\t" // out_h_loop2 cnt - - "slli t3, %9, 2\n\t" // in_w * 4 - "addi t3, t3, 8\n\t" // in_w * 4 + 8 - - "slli t4, %8, 2\n\t" // out_w * 4 - - "add %1, %1, t3\n\t" // r0 += 2 + in_w - "add %2, %2, t3\n\t" // r1 += 2 + in_w - "add %3, %3, t3\n\t" // r2 += 2 + in_w - "add %4, %4, t3\n\t" // r3 += 2 + in_w - - "add %5, %5, t4\n\t" // outptr0 += out_w - "add %6, %6, t4\n\t" // outptr1 += out_w - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "7:\n\t" // out_h_tail // 只有执行一次的机会 - "andi t0, %7, 1\n\t" // t0 = out_h & 1 - "beqz t0, 12f\n\t" - - "srai t1, %8, 3\n\t" // t1 = out_w >> 3 - "beqz t1, 9f\n\t" - - "vsetvli zero, zero, e32, m2\n\t" // set vl = 8 - "vlw.v v4, (%1)\n\t" // r0[0-7] - "addi %1, %1, 4\n\t" // r0++ 
- "vlw.v v6, (%1)\n\t" // r0[1-8] - "addi %1, %1, 4\n\t" // r0++ - "vlw.v v8, (%1)\n\t" // r0[2-9] - - "8:\n\t" // out_w_loop8 (可以考虑用m1,指令更多,但是还可以再错开,便于流水?) - - "vfmv.v.f v0, %20\n\t" // bias0[0-7] - "addi %1, %1, 24\n\t" // r0 += 6 - - "vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-7] - - "vlw.v v16, (%2)\n\t" // r1[0-7] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft1, v6\n\t" // k01 * r0[1-8] - - "vlw.v v18, (%2)\n\t" // r1[1-8] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft2, v8\n\t" // k02 * r0[2-9] - - "vlw.v v20, (%2)\n\t" // r1[2-9] - "addi %2, %2, 24\n\t" // r1 += 6 - - "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-7] - - "vlw.v v22, (%3)\n\t" // r2[0-7] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft4, v18\n\t" // k11 * r1[1-8] - - "vlw.v v24, (%3)\n\t" // r2[1-8] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft5, v20\n\t" // k12 * r1[2-9] - - "vlw.v v26, (%3)\n\t" // r2[2-9] - "addi %3, %3, 24\n\t" // r2 += 6 - - "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-7] - - "vlw.v v4, (%1)\n\t" // r0[0-7] - "addi %1, %1, 4\n\t" // r0++ - - "vfmacc.vf v0, ft7, v24\n\t" // k21 * r2[1-8] - - "vlw.v v6, (%1)\n\t" // r0[1-8] - "addi %1, %1, 4\n\t" // r0++ - - "vfmacc.vf v0, ft8, v26\n\t" // k22 * r2[2-9] - - "vlw.v v8, (%1)\n\t" // r0[2-9] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%5)\n\t" // store line0 8 elements on outptr0 - "addi %5, %5, 32\n\t" // outptr0 += 8 - - "addi t1, t1, -1\n\t" - "bnez t1, 8b\n\t" - - "addi %1, %1, -8\n\t" // r0 -= 8 ********* bump r0 to origin addr ************ - - "9:\n\t" // out_w4 - "andi t1, %8, 7\n\t" // t1 = out_w & 7 - "srai t2, t1, 2\n\t" // t2 = (out_w & 7) >> 2 - "beqz t2, 10f\n\t" - - "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 - - "vlw.v v4, (%1)\n\t" // r0[0-3] - "addi %1, %1, 4\n\t" // r0++ - - "vfmv.v.f v0, %20\n\t" // bias0[0-3] - - "vlw.v v5, (%1)\n\t" // r0[1-4] - "addi %1, %1, 4\n\t" // r0++ - - "vfmacc.vf v0, ft0, v4\n\t" // k00 * 
r0[0-3] - - "vlw.v v6, (%1)\n\t" // r0[2-5] - "addi %1, %1, 8\n\t" // r0 += 2 - - "vfmacc.vf v0, ft1, v5\n\t" // k01 * r0[1-4] - - "vlw.v v16, (%2)\n\t" // r1[0-3] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft2, v6\n\t" // k02 * r0[2-5] - - "vlw.v v17, (%2)\n\t" // r1[1-4] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-3] - - "vlw.v v18, (%2)\n\t" // r1[2-5] - "addi %2, %2, 8\n\t" // r1 += 2 - - "vfmacc.vf v0, ft4, v17\n\t" // k11 * r1[1-4] - - "vlw.v v22, (%3)\n\t" // r2[0-3] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft5, v18\n\t" // k12 * r1[2-5] - - "vlw.v v23, (%3)\n\t" // r2[1-4] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-3] - - "vlw.v v24, (%3)\n\t" // r2[2-5] - "addi %3, %3, 8\n\t" // r2 += 2 - - "vfmacc.vf v0, ft7, v23\n\t" // k21 * r2[1-4] - - "vfmacc.vf v0, ft8, v24\n\t" // k22 * r2[2-5] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%5)\n\t" // store line0 4 elements on outptr0 - "addi %5, %5, 16\n\t" // outptr0 += 4 - - "10:\n\t" // out_w_tail - "andi t2, t1, 3\n\t" - "beqz t2, 12f\n\t" - - "vfmv.v.f v0, %20\n\t" // bias0[0-3] - "li t5, 3\n\t" - "vsetvli zero, t5, e32, m1\n\t" // set vl = 3 - - "vlw.v v5, (%0)\n\t" // k0 - "addi %0, %0, 12\n\t" - "vlw.v v6, (%0)\n\t" // k1 - "addi %0, %0, 12\n\t" - "vlw.v v7, (%0)\n\t" // k2 - - "11:\n\t" // out_w_tail - - "vlw.v v4, (%1)\n\t" // r0 - "addi %1, %1, 4\n\t" // r0++ - - "vlw.v v16, (%2)\n\t" // r1 - "addi %2, %2, 4\n\t" // r1++ - - "vlw.v v22, (%3)\n\t" // r2 - "addi %3, %3, 4\n\t" // r2++ - - "vfmul.vv v8, v4, v5\n\t" // r0 * k0 - "vfmacc.vv v8, v16, v6\n\t" // += r1 * k1 - "vfmacc.vv v8, v22, v7\n\t" // += r2 * k2 - - "vfredsum.vs v11, v8, v0\n\t" // v11[0] = v0[0] + sum(v8[0..2]) - "vfmv.f.s ft9, v11\n\t" // ft9 = v11[0] - -#ifdef FUSE_CONV_RELU - "fmax.s ft9, ft9, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "fsw ft9, 0(%5)\n\t" - "addi %5, 
%5, 4\n\t" - - "addi t2, t2, -1\n\t" - "bnez t2, 11b\n\t" - - "12:\n\t" - // updata addr - "addi %1, %1, 8\n\t" // r0 += 2 - "addi %2, %2, 8\n\t" // r1 += 2 - "addi %3, %3, 8\n\t" // r2 += 2 - - :"=r"(kernel0), // %0 - "=r"(r0), // %1 - "=r"(r1), // %2 - "=r"(r2), // %3 - "=r"(r3), // %4 - "=r"(outptr0), // %5 - "=r"(outptr1), // %6 - "=r"(out_h), // %7 - "=r"(out_w), // %8 - "=r"(in_w) // %9 - :"0"(kernel0), - "1"(r0), - "2"(r1), - "3"(r2), - "4"(r3), - "5"(outptr0), - "6"(outptr1), - "7"(out_h), - "8"(out_w), - "9"(in_w), - "f"(bias0) // %20 - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10", "ft11", "t0", "t1", "t2", "t3", "t4", "t5" - ); - } -#else - const float *k0 = kernel0; - const float *k1 = k0 + 3; - const float *k2 = k1 + 3; - - int h = 0; - for (; h + 1 < out_h; h += 2) - { - for (int w = 0; w < out_w; w++) { - float sum0 = bias0; - float sum1 = bias0; - - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; - - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; - sum1 += r1[0] * k0[0] + r1[1] * k0[1] + r1[2] * k0[2]; - - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; - sum1 += r2[0] * k1[0] + r2[1] * k1[1] + r2[2] * k1[2]; - - sum1 += r3[0] * k2[0] + r3[1] * k2[1] + r3[2] * k2[2]; - -#ifdef FUSE_CONV_RELU - sum0 = sum0 > 0 ? sum0 : 0; - sum1 = sum1 > 0 ? 
sum1 : 0; -#endif // FUSE_CONV_RELU - - *outptr0 = sum0; - *outptr1 = sum1; - - r0++; - r1++; - r2++; - r3++; - outptr0++; - outptr1++; - } - r0 += 2 + in_w; // jump to next line - r1 += 2 + in_w; - r2 += 2 + in_w; - r3 += 2 + in_w; - - outptr0 += out_w; - outptr1 += out_w; - } - - for (; h < out_h; h++) { - for (int w = 0; w < out_w; w++) { - float sum0 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; - -#ifdef FUSE_CONV_RELU - sum0 = sum0 > 0 ? sum0 : 0; -#endif // FUSE_CONV_RELU - - *outptr0 = sum0; - r0++; - r1++; - r2++; - outptr0++; - } - - r0 += 2; - r1 += 2; - r2 += 2; - } - } -#endif // __riscv_vector - - csi_mem_free(input_padd_buf); - return CSINN_TRUE; -} - - -/* - (1) Algorithm works as follows: - out_h1_loop: out_w4_loop --> out_w_tail - - k00*r00 k00*r02 k00*r04 k00*r06 - k01*r01 k01*r03 k01*r05 k01*r07 - k02*r02 k02*r04 k02*r06 k02*r08 - ---------------------------------------- - k10*r10 k10*r12 k10*r14 k10*r16 - k11*r11 k11*r13 k11*r15 k11*r17 - k12*r12 k12*r14 k12*r16 k12*r18 - ---------------------------------------- - k20*r20 k20*r22 k20*r24 k20*r26 - k21*r21 k21*r23 k21*r25 k21*r27 - k22*r22 k22*r24 k22*r26 k22*r28 - - 计算 k * r 时可以用 .vv 也可以用 .vf - - (2) register definition: - t0: i_out_h loop cnt - t1-t2: i_out_w loop cnt - t3: load stride 2 for r0-r2 - t4: constant 3 for setting vl = 3 - ft0: hold 1 output data - ft1-ft9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] - ft11: constant float 0.0f, used by fusing relu - v0: bias, acc - v4-v5: r0[0,2.4.6] r0[1,3,5,7] - v1: r0[2,4,6,8] - v6-v7: r1[0,2.4.6] r1[1,3,5,7] - v2: r1[2,4,6,8] - v8-v9: r2[0,2.4.6] r2[1,3,5,7] - v3: r2[2,4,6,8] - v10-v12: k0, k1, k2 - v20-v21: [ acc(kx1*rx), acc(kx2*rx) ] - - (3) //TODO: support channel mult ?? 
- Staggered instructions -*/ - -int DWCONV3X3S2(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - - int32_t out_c = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - float *input_padd_buf = (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); - - in_h = in_h + params->pad_top + params->pad_down; - in_w = in_w + params->pad_left + params->pad_right; - - int tailstep = in_w - 2 * out_w + in_w; - -#pragma omp parallel for num_threads(1) - for (int c = 0; c < in_c; c++) { - - float *out = output_data + c * out_h * out_w; - float *outptr0 = out; - - const float bias0 = bias_data ? 
bias_data[c] : 0.0f; - - const float *img0 = input_padd_buf + c * in_h * in_w; - const float *r0 = img0; - const float *r1 = r0 + in_w; - const float *r2 = r1 + in_w; - - const float *kernel0 = kernel_data + c * 9; - -#if __riscv_vector == 128 - - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - "li t3, 8\n\t" // load stride for r_x - -#ifdef FUSE_CONV_RELU - "fmv.w.x ft11, zero\n\t" -#endif // FUSE_CONV_RELU - - "flw ft1, (%0)\n\t" - "flw ft2, 4(%0)\n\t" - "flw ft3, 8(%0)\n\t" - "flw ft4, 12(%0)\n\t" - "flw ft5, 16(%0)\n\t" - "flw ft6, 20(%0)\n\t" - "flw ft7, 24(%0)\n\t" - "flw ft8, 28(%0)\n\t" - "flw ft9, 32(%0)\n\t" // load k00 - k22 - - "vlw.v v10, (%0)\n\t" // k0 - "addi %0, %0, 12\n\t" - "vlw.v v11, (%0)\n\t" // k1 - "addi %0, %0, 12\n\t" - "vlw.v v12, (%0)\n\t" // k2 - - "vfmv.v.f v0, %16\n\t" // bias0 - - "mv t0, %5\n\t" // i_out_h = out_h - - "1:\n\t" // out_h - - "srai t1, %6, 2\n\t" // t1 = out_w >> 2 - "beqz t1, 3f\n\t" - "vsetvli zero, zero, e32, m1\n\t" - - // pre-load rxx - "vlseg2e.v v4, (%1)\n\t" // v4[0..3] = r0[0,2.4.6] v5[0..3] = r0[1,3,5,7] - "addi %1, %1, 8\n\t" // r0 += 2 - "vlsw.v v1, (%1), t3\n\t" // r0[2,4,6,8] - "addi %1, %1, 24\n\t" - - "2:\n\t" // out_w_loop4 - - "vlseg2e.v v6, (%2)\n\t" // v6[0..3] = r1[0,2.4.6] v7[0..3] = r1[1,3,5,7] - "addi %2, %2, 8\n\t" - "vfmul.vf v20, v4, ft1\n\t" // = k00 * r0[0,2,4,6] - "vfmul.vf v21, v5, ft2\n\t" // = k01 * r0[1,3,5,7] - "vlsw.v v2, (%2), t3\n\t" - "addi %2, %2, 24\n\t" - "vfmacc.vf v0, ft3, v1\n\t" // += k02 * r0[2,4,6,8] - - - "vlseg2e.v v8, (%3)\n\t" // v8[0..3] = r2[0,2.4.6] v9[0..3] = r2[1,3,5,7] - "addi %3, %3, 8\n\t" - "vfmacc.vf v20, ft4, v6\n\t" // += k10 * r1[0,2,4,6] - "vfmacc.vf v21, ft5, v7\n\t" // += k11 * r1[1,3,5,7] - "vlsw.v v3, (%3), t3\n\t" - "addi %3, %3, 24\n\t" - "vfmacc.vf v0, ft6, v2\n\t" // += k12 * r1[2,4,6,8] - - - "vlseg2e.v v4, (%1)\n\t" // v4[0..3] = r0[0,2.4.6] v5[0..3] = r0[1,3,5,7] - "addi %1, %1, 8\n\t" // r0 += 2 - "vfmacc.vf v20, ft7, v8\n\t" // += k20 * 
r2[0,2,4,6] - "vfmacc.vf v21, ft8, v9\n\t" // += k21 * r2[1,3,5,7] - "vlsw.v v1, (%1), t3\n\t" // r0[2,4,6,8] - "addi %1, %1, 24\n\t" - "vfmacc.vf v0, ft9, v3\n\t" // += k22 * r2[2,4,6,8] - - - "vfadd.vv v2, v20, v21\n\t" - "vfadd.vv v0, v0, v2\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%4)\n\t" - "addi %4, %4, 16\n\t" // outptr += 16 - - "vfmv.v.f v0, %16\n\t" // bias0 - - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" - - "addi %1, %1, -32\n\t" // r0 -= 8 ********* bump r0 to origin addr ************ - - "3:\n\t" // out_w_tail - "andi t2, %6, 3\n\t" // t2 = out_w & 3 - "beqz t2, 5f\n\t" - - - "4:\n\t" // out_w_tail - "vlw.v v4, (%1)\n\t" // r0 - "addi %1, %1, 8\n\t" - "vlw.v v6, (%2)\n\t" // r1 - "addi %2, %2, 8\n\t" - "vlw.v v8, (%3)\n\t" // r2 - "addi %3, %3, 8\n\t" - - "vfmul.vv v20, v4, v10\n\t" // r0 * k0 - "vfmacc.vv v20, v6, v11\n\t" // += r1 * k1 - "vfmacc.vv v20, v8, v12\n\t" // += r2 * k2 - - "li t4, 3\n\t" - "vsetvli zero, t4, e32, m1\n\t" // set vl = 3 - "vfredsum.vs v21, v20, v0\n\t" // v21[0] = v0[0](bias) + sum(v20[0..2]) - - "vfmv.f.s ft0, v21\n\t" // ft0 = v21[0] - -#ifdef FUSE_CONV_RELU - "fmax.s ft0, ft0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "fsw ft0, 0(%4)\n\t" - "addi %4, %4, 4\n\t" // bump output_data pointer - - "addi t2, t2, -1\n\t" - "bnez t2, 4b\n\t" - - "5:\n\t" - "slli t2, %7, 2\n\t" // t2 = tailstep * 4 - "add %1, %1, t2\n\t" - "add %2, %2, t2\n\t" - "add %3, %3, t2\n\t" // r0/r1/r2 += tailstep - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - :"=r"(kernel0), // %0 - "=r"(r0), // %1 - "=r"(r1), // %2 - "=r"(r2), // %3 - "=r"(outptr0), // %4 - "=r"(out_h), // %5 - "=r"(out_w), // %6 - "=r"(tailstep) // %7 - :"0"(kernel0), - "1"(r0), - "2"(r1), - "3"(r2), - "4"(outptr0), - "5"(out_h), - "6"(out_w), - "7"(tailstep), - "f"(bias0) // %16 - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v20", 
"v21", - "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft11", "t0", "t1", "t2", "t3", "t4" - ); - } -#else - const float *k0 = kernel0; - const float *k1 = k0 + 3; - const float *k2 = k1 + 3; - int h = 0; - for (; h < out_h; h++) { - for (int w = 0; w < out_w; w++) { - float sum0 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; - -#ifdef FUSE_CONV_RELU - sum0 = sum0 > 0 ? sum0 : 0; -#endif // FUSE_CONV_RELU - - *outptr0 = sum0; - r0 += 2; - r1 += 2; - r2 += 2; - outptr0++; - } - r0 += tailstep; - r1 += tailstep; - r2 += tailstep; - } - } -#endif // __riscv_vector - - csi_mem_free(input_padd_buf); - return CSINN_TRUE; -} diff --git a/source/c906_opt/depthwise_convolution_3x3_fp16.c b/source/c906_opt/depthwise_convolution_3x3_fp16.c index 4180ab9f..4ad89972 100644 --- a/source/c906_opt/depthwise_convolution_3x3_fp16.c +++ b/source/c906_opt/depthwise_convolution_3x3_fp16.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" /* (1) Algorithm works as follows: @@ -55,11 +54,9 @@ */ -int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -75,9 +72,13 @@ int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); + __fp16 *input_padd_buf = + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_c906_pad_input_fp16(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + shl_c906_pad_input_fp16( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -561,7 +562,7 @@ int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, ); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } @@ -587,11 +588,9 @@ int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, */ -int csi_c906_dwconv3x3s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct 
conv2d_params *params) +int shl_c906_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -607,9 +606,13 @@ int csi_c906_dwconv3x3s2_fp16(struct csi_tensor *input, int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); + __fp16 *input_padd_buf = + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_c906_pad_input_fp16(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + shl_c906_pad_input_fp16( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -820,6 +823,6 @@ int csi_c906_dwconv3x3s2_fp16(struct csi_tensor *input, ); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/c906_opt/depthwise_convolution_3x3_fp32.c b/source/c906_opt/depthwise_convolution_3x3_fp32.c new file mode 100644 index 00000000..0023a0e0 --- /dev/null +++ b/source/c906_opt/depthwise_convolution_3x3_fp32.c @@ -0,0 +1,968 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c906.h" + +#ifndef DWCONV3X3S1 +#define DWCONV3X3S1 shl_c906_dwconv3x3s1 +#endif + +#ifndef DWCONV3X3S2 +#define DWCONV3X3S2 shl_c906_dwconv3x3s2 +#endif + +/* + (1) Algorithm works as follows: + out_h2: out_h2_w8_loop --> out_h2_w4 --> out_h2_wtail + out_h_tail: out_h1_w8_loop --> out_h1_w4 --> out_h1_wtail + + out_h2_w8: out_h2_w4: || out_h1_w8: out_h1_w4: + outptr0[0-7]: outptr1[0-7]: outptr0[0-3]: outptr1[0-3] || + outptr0[0-7]: outptr0[0-3]: k00 * r0[0-7] k00 * r1[0-7] k00 * + r0[0-3] k00 * r1[0-3] || k00 * r0[0-7] k00 * r0[0-3] k01 * + r0[1-8] k01 * r1[1-8] k01 * r0[1-4] k01 * r1[1-4] || k01 + * r0[1-8] k01 * r0[1-4] k02 * r0[2-9] k02 * r1[2-9] k02 * r0[2-5] + k02 * r1[2-5] || k02 * r0[2-9] k02 * r0[2-5] k10 * r1[0-7] k10 * + r2[0-7] k10 * r1[0-3] k10 * r2[0-3] || k10 * r1[0-7] k10 * + r1[0-3] k11 * r1[1-8] k11 * r2[1-8] k11 * r1[1-4] k11 * r2[1-4] || + k11 * r1[1-8] k11 * r1[1-4] k12 * r1[2-9] k12 * r2[2-9] k12 * + r1[2-5] k12 * r2[2-5] || k12 * r1[2-9] k12 * r1[2-5] k20 * + r2[0-7] k20 * r3[0-7] k20 * r2[0-3] k20 * r3[0-3] || k20 + * r2[0-7] k20 * r2[0-3] k21 * r2[1-8] k21 * r3[1-8] k21 * r2[1-4] + k21 * r3[1-4] || k21 * r2[1-8] k21 * r2[1-4] k22 * r2[2-9] k22 * + r3[2-9] k22 * r2[2-5] k22 * r3[2-5] || k22 * r2[2-9] k22 * + r2[2-5] + + h2_w8_loop execution process: + + load r0[0-7] --> load r0[1-8] --> load r0[2-9] --> // Load r0[0-7] r0[1-8] + r0[-9] before the loop to facilitate pipeline work + + --> load bias0[0-7] --> load r3[0-7] --> load bias1[0-7] --> load r3[1-8] --> + k00*r0[0-7] / 
k20*r3[0-7] --> + - + - load r3[2-9] --> k01*r0[1-8] / k21*r3[1-8] --> load r1[0-7] --> k02*r0[2-9] / + k22*r3[2-9] --> load r1[1-8] --> k10*r1[0-7] / k00*r1[0-7] --> + - + - load r1[2-9] --> k11*r1[1-8] / k01*r1[1-8] --> load r2[0-7] --> k12*r1[2-9] / + k02*r1[2-9] --> load r2[1-8] --> k20*r2[0-7] / k10*r2[0-7] --> + - + - load r2[2-9] --> k21*r2[1-8] / k11*r2[1-8] --> load r0[0-7] --> k22*r2[2-9] / + k12*r2[2-9] --> load r0[1-8] --> load r0[2-9] ---------------- + - - + ----------------------------------------------------------------------------------------------------------------------------------------------------------- + + + h1_w8_loop execution process: + + load r0[0-7] --> load r0[1-8] --> load r0[2-9] --> + + --> load bias0[0-7] --> k00*r0[0-7] --> load r1[0-7] --> k01*r0[1-8] --> load + r1[1-8] --> k02*r0[2-9] --> load r1[2-9] --> k10*r1[0-7] --> + - + - load r2[0-7] --> k11*r1[1-8] --> load r2[1-8] --> k12*r1[2-9] --> load + r2[2-9] --> k20*r2[0-7] --> load r0[0-7] --> k21*r2[1-8] --> + - + - load r0[1-8] --> k22*r2[2-9] --> load r0[2-9] + ------------------------------------------------------------------------------------------------- + - - + -------------------------------------------------------------------------------------------------------------------------------------------------------- + + (2) register definition: + t0: i_out_h + t1-t2: i_out_w + v0-v1: bias0[0-7], output_data(acc) + v2-v3: bias1[0-7], output_data(acc) + v4-v9: r0 v4,v5:r0[0-7] v6,v7:r0[1-8] v8,v9:r0[2-9] + v10-v15: r3 + v16-v21: r1 + v22-v27: r2 + ft0-ft8: [ k00,k01,k02,k10,k11,k12,k20,k21,k22 ] + ft11: constant float 0.0f, used by fusing relu + + (3) // TODO: support channel mult ?? 
+ opt padding + +*/ + +int DWCONV3X3S1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); + + shl_c906_pad_input( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + +#pragma omp parallel for num_threads(1) + for (int c = 0; c < in_c; c++) { + float *out = output_data + c * out_h * out_w; + float *outptr0 = out; + float *outptr1 = outptr0 + out_w; + + const float bias0 = bias_data ? 
bias_data[c] : 0.0f; + + const float *img0 = input_padd_buf + c * in_h * in_w; + const float *r0 = img0; + const float *r1 = r0 + in_w; + const float *r2 = r1 + in_w; + const float *r3 = r2 + in_w; + + const float *kernel0 = kernel_data + c * 9; + +#if __riscv_vector == 128 + + asm volatile( + "vsetvli zero, zero, e32, m2\n\t" + +#ifdef FUSE_CONV_RELU + "fmv.w.x ft11, zero\n\t" +#endif // FUSE_CONV_RELU + + "flw ft0, 0(%0)\n\t" // k00 + "flw ft1, 4(%0)\n\t" // k01 + "flw ft2, 8(%0)\n\t" // k02 + "flw ft3, 12(%0)\n\t" // k10 + "flw ft4, 16(%0)\n\t" // k11 + "flw ft5, 20(%0)\n\t" // k12 + "flw ft6, 24(%0)\n\t" // k20 + "flw ft7, 28(%0)\n\t" // k21 + "flw ft8, 32(%0)\n\t" // k22 + + "srai t0, %7, 1\n\t" // t0 = out_h >> 1 + "beqz t0, 7f\n\t" + + "1:\n\t" // out_h_loop2 + + "srai t1, %8, 3\n\t" // t1 = out_w >> 3 + "beqz t1, 3f\n\t" + + "vsetvli zero, zero, e32, m2\n\t" // set vl = 8 + "vlw.v v4, (%1)\n\t" // r0[0-7] + "addi %1, %1, 4\n\t" // r0++ + "vlw.v v6, (%1)\n\t" // r0[1-8] + "addi %1, %1, 4\n\t" // r0++ + "vlw.v v8, (%1)\n\t" // r0[2-9] + + "2:\n\t" // out_w_loop8 + + "vfmv.v.f v0, %20\n\t" // bias0[0-7] + "addi %1, %1, 24\n\t" // r0 += 6 + + "vlw.v v10, (%4)\n\t" // r3[0-7] + "addi %4, %4, 4\n\t" // r3++ + "vfmv.v.f v2, %20\n\t" // bias1[0-7] + + "vlw.v v12, (%4)\n\t" // r3[1-8] + "addi %4, %4, 4\n\t" // r3++ + + "vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-7] + "vfmacc.vf v2, ft6, v10\n\t" // k20 * r3[0-7] + + "vlw.v v14, (%4)\n\t" // r3[2-9] + "addi %4, %4, 24\n\t" // r3 += 6 + + "vfmacc.vf v0, ft1, v6\n\t" // k01 * r0[1-8] + "vfmacc.vf v2, ft7, v12\n\t" // k21 * r3[1-8] + + "vlw.v v16, (%2)\n\t" // r1[0-7] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft2, v8\n\t" // k02 * r0[2-9] + "vfmacc.vf v2, ft8, v14\n\t" // k22 * r3[2-9] + + "vlw.v v18, (%2)\n\t" // r1[1-8] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-7] + "vfmacc.vf v2, ft0, v16\n\t" // k00 * r1[0-7] + + "vlw.v v20, (%2)\n\t" // r1[2-9] + "addi %2, %2, 24\n\t" // r1 
+= 6 + + "vfmacc.vf v0, ft4, v18\n\t" // k11 * r1[1-8] + "vfmacc.vf v2, ft1, v18\n\t" // k01 * r1[1-8] + + "vlw.v v22, (%3)\n\t" // r2[0-7] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft5, v20\n\t" // k12 * r1[2-9] + "vfmacc.vf v2, ft2, v20\n\t" // k02 * r1[2-9] + + "vlw.v v24, (%3)\n\t" // r2[1-8] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-7] + "vfmacc.vf v2, ft3, v22\n\t" // k10 * r2[0-7] + + "vlw.v v26, (%3)\n\t" // r2[2-9] + "addi %3, %3, 24\n\t" // r2 += 6 + + "vfmacc.vf v0, ft7, v24\n\t" // k21 * r2[1-8] + "vfmacc.vf v2, ft4, v24\n\t" // k11 * r2[1-8] + + "vlw.v v4, (%1)\n\t" // r0[0-7] load r0 for next loop + "addi %1, %1, 4\n\t" // r0++ + + "vfmacc.vf v0, ft8, v26\n\t" // k22 * r2[2-9] + + "vlw.v v6, (%1)\n\t" // r0[1-8] + "addi %1, %1, 4\n\t" // r0++ + +#ifdef FUSE_CONV_RELU + "vfmax.vf v0, v0, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v0, (%5)\n\t" // store line0 8 elements on outptr0 + "addi %5, %5, 32\n\t" // outptr0 += 8 + + "vfmacc.vf v2, ft5, v26\n\t" // k12 * r2[2-9] + + "vlw.v v8, (%1)\n\t" // r0[2-9] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v2, v2, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v2, (%6)\n\t" // store line1 8 elements on outptr1 + "addi %6, %6, 32\n\t" // outptr1 += 8 + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + "addi %1, %1, -8\n\t" // r0 -= 2 ********* bump r0 to origin addr + // ************ + + "3:\n\t" // out_w4 // h2循环中只有执行一次的机会 + "andi t1, %8, 7\n\t" // t1 = out_w & 7 + "srai t2, t1, 2\n\t" // t2 = (out_w & 7) >> 2 + "beqz t2, 4f\n\t" + + "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 + + "vlw.v v4, (%1)\n\t" // r0[0-3] + "addi %1, %1, 4\n\t" // r0++ + + "vfmv.v.f v0, %20\n\t" // bias0[0-3] + + "vlw.v v10, (%4)\n\t" // r3[0-3] + "addi %4, %4, 4\n\t" // r3++ + + "vfmv.v.f v2, %20\n\t" // bias1[0-3] + + "vlw.v v5, (%1)\n\t" // r0[1-4] + "addi %1, %1, 4\n\t" // r0++ + + "vlw.v v11, (%4)\n\t" // r3[1-4] + "addi %4, %4, 4\n\t" // r3++ + + 
"vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-3] + "vfmacc.vf v2, ft6, v10\n\t" // k20 * r3[0-3] + + "vlw.v v6, (%1)\n\t" // r0[2-5] + "addi %1, %1, 8\n\t" // r0 += 2 + + "vlw.v v12, (%4)\n\t" // r3[2-5] + "addi %4, %4, 8\n\t" // r3 += 2 + + "vfmacc.vf v0, ft1, v5\n\t" // k01 * r0[1-4] + "vfmacc.vf v2, ft7, v11\n\t" // k21 * r3[1-4] + + "vlw.v v16, (%2)\n\t" // r1[0-3] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft2, v6\n\t" // k02 * r0[2-5] + "vfmacc.vf v2, ft8, v12\n\t" // k22 * r3[2-5] + + "vlw.v v17, (%2)\n\t" // r1[1-4] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-3] + "vfmacc.vf v2, ft0, v16\n\t" // k00 * r1[0-3] + + "vlw.v v18, (%2)\n\t" // r1[2-5] + "addi %2, %2, 8\n\t" // r1 += 2 + + "vfmacc.vf v0, ft4, v17\n\t" // k11 * r1[1-4] + "vfmacc.vf v2, ft1, v17\n\t" // k01 * r1[1-4] + + "vlw.v v22, (%3)\n\t" // r2[0-3] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft5, v18\n\t" // k12 * r1[2-5] + "vfmacc.vf v2, ft2, v18\n\t" // k02 * r1[2-5]] + + "vlw.v v23, (%3)\n\t" // r2[1-4] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-3] + "vfmacc.vf v2, ft3, v22\n\t" // k10 * r2[0-3] + + "vlw.v v24, (%3)\n\t" // r2[2-5] + "addi %3, %3, 8\n\t" // r2 += 2 + + "vfmacc.vf v0, ft7, v23\n\t" // k21 * r2[1-4] + "vfmacc.vf v2, ft4, v23\n\t" // k11 * r2[1-4] + + "vfmacc.vf v0, ft8, v24\n\t" // k22 * r2[2-5] + "vfmacc.vf v2, ft5, v24\n\t" // k12 * r2[2-5] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v0, v0, ft11\n\t" // **** relu **** + "vfmax.vf v2, v2, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v0, (%5)\n\t" // store line0 4 elements on outptr0 + "addi %5, %5, 16\n\t" // outptr0 += 4 + "vsw.v v2, (%6)\n\t" // store line1 4 elements on outptr1 + "addi %6, %6, 16\n\t" // outptr1 += 4 + + "4:\n\t" // out_w_tail + "andi t2, t1, 3\n\t" // t2 = (out_w & 7) & 3 + "beqz t2, 6f\n\t" + + "vfmv.v.f v0, %20\n\t" // bias0[0-3] / bias1[0-3] + "li t5, 3\n\t" + "vsetvli zero, t5, e32, m1\n\t" // set vl = 3 + + 
"vlw.v v5, (%0)\n\t" // k0 + "addi %0, %0, 12\n\t" + "vlw.v v6, (%0)\n\t" // k1 + "addi %0, %0, 12\n\t" + "vlw.v v7, (%0)\n\t" // k2 + + "5:\n\t" // out_w_tail + + "vlw.v v4, (%1)\n\t" // r0 + "addi %1, %1, 4\n\t" // r0++ + + "vlw.v v16, (%2)\n\t" // r1 + "addi %2, %2, 4\n\t" // r1++ + + "vlw.v v22, (%3)\n\t" // r2 + "addi %3, %3, 4\n\t" // r2++ + + "vlw.v v10, (%4)\n\t" // r3 + "addi %4, %4, 4\n\t" // r3++ + + "vfmul.vv v8, v4, v5\n\t" // r0 * k0 + "vfmacc.vv v8, v16, v6\n\t" // += r1 * k1 + "vfmacc.vv v8, v22, v7\n\t" // += r2 * k2 + + "vfredsum.vs v11, v8, v0\n\t" // v11[0] = v0[0] + sum(v8[0..2]) + "vfmv.f.s ft9, v11\n\t" // ft9 = v11[0] + + "vfmul.vv v9, v16, v5\n\t" // r1 * k0 + "vfmacc.vv v9, v22, v6\n\t" // += r2 * k1 + "vfmacc.vv v9, v10, v7\n\t" // += r3 * k2 + + "vfredsum.vs v12, v9, v0\n\t" // v12[0] = v0[0] + sum(v9[0..2]) + "vfmv.f.s ft10, v12\n\t" // ft10 = v12[0] + +#ifdef FUSE_CONV_RELU + "fmax.s ft9, ft9, ft11\n\t" // **** relu **** + "fmax.s ft10, ft10, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "fsw ft9, 0(%5)\n\t" + "addi %5, %5, 4\n\t" + "fsw ft10, 0(%6)\n\t" + "addi %6, %6, 4\n\t" + + "addi t2, t2, -1\n\t" + "bnez t2, 5b\n\t" + + "addi %0, %0, -24\n\t" // kernel -= 6 ********* bump kernel_data to origin + // addr ************ + + "6:\n\t" // out_h_loop2 cnt + + "slli t3, %9, 2\n\t" // in_w * 4 + "addi t3, t3, 8\n\t" // in_w * 4 + 8 + + "slli t4, %8, 2\n\t" // out_w * 4 + + "add %1, %1, t3\n\t" // r0 += 2 + in_w + "add %2, %2, t3\n\t" // r1 += 2 + in_w + "add %3, %3, t3\n\t" // r2 += 2 + in_w + "add %4, %4, t3\n\t" // r3 += 2 + in_w + + "add %5, %5, t4\n\t" // outptr0 += out_w + "add %6, %6, t4\n\t" // outptr1 += out_w + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "7:\n\t" // out_h_tail // 只有执行一次的机会 + "andi t0, %7, 1\n\t" // t0 = out_h & 1 + "beqz t0, 12f\n\t" + + "srai t1, %8, 3\n\t" // t1 = out_w >> 3 + "beqz t1, 9f\n\t" + + "vsetvli zero, zero, e32, m2\n\t" // set vl = 8 + "vlw.v v4, (%1)\n\t" // r0[0-7] + "addi %1, %1, 
4\n\t" // r0++ + "vlw.v v6, (%1)\n\t" // r0[1-8] + "addi %1, %1, 4\n\t" // r0++ + "vlw.v v8, (%1)\n\t" // r0[2-9] + + "8:\n\t" // out_w_loop8 (可以考虑用m1,指令更多,但是还可以再错开,便于流水?) + + "vfmv.v.f v0, %20\n\t" // bias0[0-7] + "addi %1, %1, 24\n\t" // r0 += 6 + + "vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-7] + + "vlw.v v16, (%2)\n\t" // r1[0-7] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft1, v6\n\t" // k01 * r0[1-8] + + "vlw.v v18, (%2)\n\t" // r1[1-8] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft2, v8\n\t" // k02 * r0[2-9] + + "vlw.v v20, (%2)\n\t" // r1[2-9] + "addi %2, %2, 24\n\t" // r1 += 6 + + "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-7] + + "vlw.v v22, (%3)\n\t" // r2[0-7] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft4, v18\n\t" // k11 * r1[1-8] + + "vlw.v v24, (%3)\n\t" // r2[1-8] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft5, v20\n\t" // k12 * r1[2-9] + + "vlw.v v26, (%3)\n\t" // r2[2-9] + "addi %3, %3, 24\n\t" // r2 += 6 + + "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-7] + + "vlw.v v4, (%1)\n\t" // r0[0-7] + "addi %1, %1, 4\n\t" // r0++ + + "vfmacc.vf v0, ft7, v24\n\t" // k21 * r2[1-8] + + "vlw.v v6, (%1)\n\t" // r0[1-8] + "addi %1, %1, 4\n\t" // r0++ + + "vfmacc.vf v0, ft8, v26\n\t" // k22 * r2[2-9] + + "vlw.v v8, (%1)\n\t" // r0[2-9] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v0, v0, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v0, (%5)\n\t" // store line0 8 elements on outptr0 + "addi %5, %5, 32\n\t" // outptr0 += 8 + + "addi t1, t1, -1\n\t" + "bnez t1, 8b\n\t" + + "addi %1, %1, -8\n\t" // r0 -= 8 ********* bump r0 to origin addr + // ************ + + "9:\n\t" // out_w4 + "andi t1, %8, 7\n\t" // t1 = out_w & 7 + "srai t2, t1, 2\n\t" // t2 = (out_w & 7) >> 2 + "beqz t2, 10f\n\t" + + "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 + + "vlw.v v4, (%1)\n\t" // r0[0-3] + "addi %1, %1, 4\n\t" // r0++ + + "vfmv.v.f v0, %20\n\t" // bias0[0-3] + + "vlw.v v5, (%1)\n\t" // r0[1-4] + "addi %1, %1, 4\n\t" // r0++ + + "vfmacc.vf v0, ft0, 
v4\n\t" // k00 * r0[0-3] + + "vlw.v v6, (%1)\n\t" // r0[2-5] + "addi %1, %1, 8\n\t" // r0 += 2 + + "vfmacc.vf v0, ft1, v5\n\t" // k01 * r0[1-4] + + "vlw.v v16, (%2)\n\t" // r1[0-3] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft2, v6\n\t" // k02 * r0[2-5] + + "vlw.v v17, (%2)\n\t" // r1[1-4] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-3] + + "vlw.v v18, (%2)\n\t" // r1[2-5] + "addi %2, %2, 8\n\t" // r1 += 2 + + "vfmacc.vf v0, ft4, v17\n\t" // k11 * r1[1-4] + + "vlw.v v22, (%3)\n\t" // r2[0-3] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft5, v18\n\t" // k12 * r1[2-5] + + "vlw.v v23, (%3)\n\t" // r2[1-4] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-3] + + "vlw.v v24, (%3)\n\t" // r2[2-5] + "addi %3, %3, 8\n\t" // r2 += 2 + + "vfmacc.vf v0, ft7, v23\n\t" // k21 * r2[1-4] + + "vfmacc.vf v0, ft8, v24\n\t" // k22 * r2[2-5] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v0, v0, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v0, (%5)\n\t" // store line0 4 elements on outptr0 + "addi %5, %5, 16\n\t" // outptr0 += 4 + + "10:\n\t" // out_w_tail + "andi t2, t1, 3\n\t" + "beqz t2, 12f\n\t" + + "vfmv.v.f v0, %20\n\t" // bias0[0-3] + "li t5, 3\n\t" + "vsetvli zero, t5, e32, m1\n\t" // set vl = 3 + + "vlw.v v5, (%0)\n\t" // k0 + "addi %0, %0, 12\n\t" + "vlw.v v6, (%0)\n\t" // k1 + "addi %0, %0, 12\n\t" + "vlw.v v7, (%0)\n\t" // k2 + + "11:\n\t" // out_w_tail + + "vlw.v v4, (%1)\n\t" // r0 + "addi %1, %1, 4\n\t" // r0++ + + "vlw.v v16, (%2)\n\t" // r1 + "addi %2, %2, 4\n\t" // r1++ + + "vlw.v v22, (%3)\n\t" // r2 + "addi %3, %3, 4\n\t" // r2++ + + "vfmul.vv v8, v4, v5\n\t" // r0 * k0 + "vfmacc.vv v8, v16, v6\n\t" // += r1 * k1 + "vfmacc.vv v8, v22, v7\n\t" // += r2 * k2 + + "vfredsum.vs v11, v8, v0\n\t" // v11[0] = v0[0] + sum(v8[0..2]) + "vfmv.f.s ft9, v11\n\t" // ft9 = v11[0] + +#ifdef FUSE_CONV_RELU + "fmax.s ft9, ft9, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "fsw ft9, 
0(%5)\n\t" + "addi %5, %5, 4\n\t" + + "addi t2, t2, -1\n\t" + "bnez t2, 11b\n\t" + + "12:\n\t" + // updata addr + "addi %1, %1, 8\n\t" // r0 += 2 + "addi %2, %2, 8\n\t" // r1 += 2 + "addi %3, %3, 8\n\t" // r2 += 2 + + : "=r"(kernel0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(r3), // %4 + "=r"(outptr0), // %5 + "=r"(outptr1), // %6 + "=r"(out_h), // %7 + "=r"(out_w), // %8 + "=r"(in_w) // %9 + : "0"(kernel0), "1"(r0), "2"(r1), "3"(r2), "4"(r3), "5"(outptr0), "6"(outptr1), + "7"(out_h), "8"(out_w), "9"(in_w), + "f"(bias0) // %20 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7", "ft8", "ft9", "ft10", "ft11", "t0", "t1", "t2", "t3", "t4", "t5"); + } +#else + const float *k0 = kernel0; + const float *k1 = k0 + 3; + const float *k2 = k1 + 3; + + int h = 0; + for (; h + 1 < out_h; h += 2) { + for (int w = 0; w < out_w; w++) { + float sum0 = bias0; + float sum1 = bias0; + + sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; + + sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; + sum1 += r1[0] * k0[0] + r1[1] * k0[1] + r1[2] * k0[2]; + + sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; + sum1 += r2[0] * k1[0] + r2[1] * k1[1] + r2[2] * k1[2]; + + sum1 += r3[0] * k2[0] + r3[1] * k2[1] + r3[2] * k2[2]; + +#ifdef FUSE_CONV_RELU + sum0 = sum0 > 0 ? sum0 : 0; + sum1 = sum1 > 0 ? 
sum1 : 0; +#endif // FUSE_CONV_RELU + + *outptr0 = sum0; + *outptr1 = sum1; + + r0++; + r1++; + r2++; + r3++; + outptr0++; + outptr1++; + } + r0 += 2 + in_w; // jump to next line + r1 += 2 + in_w; + r2 += 2 + in_w; + r3 += 2 + in_w; + + outptr0 += out_w; + outptr1 += out_w; + } + + for (; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + float sum0 = bias0; + sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; + sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; + sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; + +#ifdef FUSE_CONV_RELU + sum0 = sum0 > 0 ? sum0 : 0; +#endif // FUSE_CONV_RELU + + *outptr0 = sum0; + r0++; + r1++; + r2++; + outptr0++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } +#endif // __riscv_vector + + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} + +/* + (1) Algorithm works as follows: + out_h1_loop: out_w4_loop --> out_w_tail + + k00*r00 k00*r02 k00*r04 k00*r06 + k01*r01 k01*r03 k01*r05 k01*r07 + k02*r02 k02*r04 k02*r06 k02*r08 + ---------------------------------------- + k10*r10 k10*r12 k10*r14 k10*r16 + k11*r11 k11*r13 k11*r15 k11*r17 + k12*r12 k12*r14 k12*r16 k12*r18 + ---------------------------------------- + k20*r20 k20*r22 k20*r24 k20*r26 + k21*r21 k21*r23 k21*r25 k21*r27 + k22*r22 k22*r24 k22*r26 k22*r28 + + 计算 k * r 时可以用 .vv 也可以用 .vf + + (2) register definition: + t0: i_out_h loop cnt + t1-t2: i_out_w loop cnt + t3: load stride 2 for r0-r2 + t4: constant 3 for setting vl = 3 + ft0: hold 1 output data + ft1-ft9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] + ft11: constant float 0.0f, used by fusing relu + v0: bias, acc + v4-v5: r0[0,2.4.6] r0[1,3,5,7] + v1: r0[2,4,6,8] + v6-v7: r1[0,2.4.6] r1[1,3,5,7] + v2: r1[2,4,6,8] + v8-v9: r2[0,2.4.6] r2[1,3,5,7] + v3: r2[2,4,6,8] + v10-v12: k0, k1, k2 + v20-v21: [ acc(kx1*rx), acc(kx2*rx) ] + + (3) //TODO: support channel mult ?? 
+ Staggered instructions +*/ + +int DWCONV3X3S2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); + + shl_c906_pad_input( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + + int tailstep = in_w - 2 * out_w + in_w; + +#pragma omp parallel for num_threads(1) + for (int c = 0; c < in_c; c++) { + float *out = output_data + c * out_h * out_w; + float *outptr0 = out; + + const float bias0 = bias_data ? 
bias_data[c] : 0.0f; + + const float *img0 = input_padd_buf + c * in_h * in_w; + const float *r0 = img0; + const float *r1 = r0 + in_w; + const float *r2 = r1 + in_w; + + const float *kernel0 = kernel_data + c * 9; + +#if __riscv_vector == 128 + + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" + "li t3, 8\n\t" // load stride for r_x + +#ifdef FUSE_CONV_RELU + "fmv.w.x ft11, zero\n\t" +#endif // FUSE_CONV_RELU + + "flw ft1, (%0)\n\t" + "flw ft2, 4(%0)\n\t" + "flw ft3, 8(%0)\n\t" + "flw ft4, 12(%0)\n\t" + "flw ft5, 16(%0)\n\t" + "flw ft6, 20(%0)\n\t" + "flw ft7, 24(%0)\n\t" + "flw ft8, 28(%0)\n\t" + "flw ft9, 32(%0)\n\t" // load k00 - k22 + + "vlw.v v10, (%0)\n\t" // k0 + "addi %0, %0, 12\n\t" + "vlw.v v11, (%0)\n\t" // k1 + "addi %0, %0, 12\n\t" + "vlw.v v12, (%0)\n\t" // k2 + + "vfmv.v.f v0, %16\n\t" // bias0 + + "mv t0, %5\n\t" // i_out_h = out_h + + "1:\n\t" // out_h + + "srai t1, %6, 2\n\t" // t1 = out_w >> 2 + "beqz t1, 3f\n\t" + "vsetvli zero, zero, e32, m1\n\t" + + // pre-load rxx + "vlseg2e.v v4, (%1)\n\t" // v4[0..3] = r0[0,2.4.6] v5[0..3] = r0[1,3,5,7] + "addi %1, %1, 8\n\t" // r0 += 2 + "vlsw.v v1, (%1), t3\n\t" // r0[2,4,6,8] + "addi %1, %1, 24\n\t" + + "2:\n\t" // out_w_loop4 + + "vlseg2e.v v6, (%2)\n\t" // v6[0..3] = r1[0,2.4.6] v7[0..3] = r1[1,3,5,7] + "addi %2, %2, 8\n\t" + "vfmul.vf v20, v4, ft1\n\t" // = k00 * r0[0,2,4,6] + "vfmul.vf v21, v5, ft2\n\t" // = k01 * r0[1,3,5,7] + "vlsw.v v2, (%2), t3\n\t" + "addi %2, %2, 24\n\t" + "vfmacc.vf v0, ft3, v1\n\t" // += k02 * r0[2,4,6,8] + + "vlseg2e.v v8, (%3)\n\t" // v8[0..3] = r2[0,2.4.6] v9[0..3] = r2[1,3,5,7] + "addi %3, %3, 8\n\t" + "vfmacc.vf v20, ft4, v6\n\t" // += k10 * r1[0,2,4,6] + "vfmacc.vf v21, ft5, v7\n\t" // += k11 * r1[1,3,5,7] + "vlsw.v v3, (%3), t3\n\t" + "addi %3, %3, 24\n\t" + "vfmacc.vf v0, ft6, v2\n\t" // += k12 * r1[2,4,6,8] + + "vlseg2e.v v4, (%1)\n\t" // v4[0..3] = r0[0,2.4.6] v5[0..3] = r0[1,3,5,7] + "addi %1, %1, 8\n\t" // r0 += 2 + "vfmacc.vf v20, ft7, v8\n\t" // += k20 * 
r2[0,2,4,6] + "vfmacc.vf v21, ft8, v9\n\t" // += k21 * r2[1,3,5,7] + "vlsw.v v1, (%1), t3\n\t" // r0[2,4,6,8] + "addi %1, %1, 24\n\t" + "vfmacc.vf v0, ft9, v3\n\t" // += k22 * r2[2,4,6,8] + + "vfadd.vv v2, v20, v21\n\t" + "vfadd.vv v0, v0, v2\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v0, v0, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v0, (%4)\n\t" + "addi %4, %4, 16\n\t" // outptr += 16 + + "vfmv.v.f v0, %16\n\t" // bias0 + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + "addi %1, %1, -32\n\t" // r0 -= 8 ********* bump r0 to origin addr + // ************ + + "3:\n\t" // out_w_tail + "andi t2, %6, 3\n\t" // t2 = out_w & 3 + "beqz t2, 5f\n\t" + + "4:\n\t" // out_w_tail + "vlw.v v4, (%1)\n\t" // r0 + "addi %1, %1, 8\n\t" + "vlw.v v6, (%2)\n\t" // r1 + "addi %2, %2, 8\n\t" + "vlw.v v8, (%3)\n\t" // r2 + "addi %3, %3, 8\n\t" + + "vfmul.vv v20, v4, v10\n\t" // r0 * k0 + "vfmacc.vv v20, v6, v11\n\t" // += r1 * k1 + "vfmacc.vv v20, v8, v12\n\t" // += r2 * k2 + + "li t4, 3\n\t" + "vsetvli zero, t4, e32, m1\n\t" // set vl = 3 + "vfredsum.vs v21, v20, v0\n\t" // v21[0] = v0[0](bias) + sum(v20[0..2]) + + "vfmv.f.s ft0, v21\n\t" // ft0 = v21[0] + +#ifdef FUSE_CONV_RELU + "fmax.s ft0, ft0, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "fsw ft0, 0(%4)\n\t" + "addi %4, %4, 4\n\t" // bump output_data pointer + + "addi t2, t2, -1\n\t" + "bnez t2, 4b\n\t" + + "5:\n\t" + "slli t2, %7, 2\n\t" // t2 = tailstep * 4 + "add %1, %1, t2\n\t" + "add %2, %2, t2\n\t" + "add %3, %3, t2\n\t" // r0/r1/r2 += tailstep + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + : "=r"(kernel0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(outptr0), // %4 + "=r"(out_h), // %5 + "=r"(out_w), // %6 + "=r"(tailstep) // %7 + : "0"(kernel0), "1"(r0), "2"(r1), "3"(r2), "4"(outptr0), "5"(out_h), "6"(out_w), + "7"(tailstep), + "f"(bias0) // %16 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v20", "v21", 
"ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", + "ft8", "ft9", "ft11", "t0", "t1", "t2", "t3", "t4"); + } +#else + const float *k0 = kernel0; + const float *k1 = k0 + 3; + const float *k2 = k1 + 3; + int h = 0; + for (; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + float sum0 = bias0; + sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; + sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; + sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; + +#ifdef FUSE_CONV_RELU + sum0 = sum0 > 0 ? sum0 : 0; +#endif // FUSE_CONV_RELU + + *outptr0 = sum0; + r0 += 2; + r1 += 2; + r2 += 2; + outptr0++; + } + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +#endif // __riscv_vector + + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} diff --git a/source/c906_opt/depthwise_convolution_3x3_pack4.c b/source/c906_opt/depthwise_convolution_3x3_pack4.c deleted file mode 100644 index 8977776c..00000000 --- a/source/c906_opt/depthwise_convolution_3x3_pack4.c +++ /dev/null @@ -1,1487 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" - -#ifndef DWCONV3X3S1_PACK4 -#define DWCONV3X3S1_PACK4 csi_c906_dwconv3x3s1_pack4 -#endif - -#ifndef DWCONV3X3S2_PACK4 -#define DWCONV3X3S2_PACK4 csi_c906_dwconv3x3s2_pack4 -#endif - - -/************************************************************************************************************ - c906 vlen = 128, 128/32 = 4 --> pack4, if vlen = 256 256/32 = 8 --> pack8 - input, kernel, bias, output layout: - input: [c/4, in_h, in_w, 4] - kernel: [c/4, k_h*k_w, 4] - bias: [c/4, 4] - output: [c/4, out_h, out_w, 4] - - constraint: in_channel = out_channel and is a multiple of 4 - No reference implementation -**************************************************************************************************************/ - -/* - (1) Algorithm works as follows: - out_h2: out_h2_w4_loop --> out_h2_wtail - out_h_tail: out_h1_w4_loop --> out_h1_wtail - - (2) register definition: - t0: i_out_h - t1: i_out_w - v0: bias_data - v1-v9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] - v10-v19: r00-r05 / r10-r15 / r20-r25 / r30-r35 - v24-v27: outptr0[0-3] line0 - v28-v31: outptr1[0-3] line1 - - Due to pack4, both kxx and rxx actually occupy a v register - - TODO: how to pack for input / kernel / bias / output - padding -*/ - -int DWCONV3X3S1_PACK4(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - - int32_t out_c = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - for (int c = 0; c < in_c / 4; c++) { - float *out = output_data + c * out_h * out_w * 4; - float 
*outptr0 = out; - float *outptr1 = outptr0 + out_w * 4; - - const float *img0 = input_data + c * in_h * in_w * 4; - const float *r0 = img0; - const float *r1 = r0 + in_w * 4; - const float *r2 = r1 + in_w * 4; - const float *r3 = r2 + in_w * 4; - - const float *kernel0 = kernel_data + c * 9 * 4; - - const float *bias0 = NULL; - if (bias_data && bias->dim_count != 0) { - bias0 = bias_data + c * 4; - } - - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - -#ifdef FUSE_CONV_RELU - "fmv.w.x ft0, zero\n\t" -#endif // FUSE_CONV_RELU - - "vmv.v.x v0, zero\n\t" // clear v0 - "beqz %5, 0f\n\t" // if bias_data = NULL clear v0 - "vlw.v v0, (%5)\n\t" - - "0:\n\t" - - "vlw.v v1, (%0)\n\t" // k00 - "addi %0, %0, 16\n\t" // kernel += 4 - "vlw.v v2, (%0)\n\t" // k01 - "addi %0, %0, 16\n\t" - "vlw.v v3, (%0)\n\t" // k02 - "addi %0, %0, 16\n\t" - "vlw.v v4, (%0)\n\t" // k10 - "addi %0, %0, 16\n\t" - "vlw.v v5, (%0)\n\t" // k11 - "addi %0, %0, 16\n\t" - "vlw.v v6, (%0)\n\t" // k12 - "addi %0, %0, 16\n\t" - "vlw.v v7, (%0)\n\t" // k20 - "addi %0, %0, 16\n\t" - "vlw.v v8, (%0)\n\t" // k21 - "addi %0, %0, 16\n\t" - "vlw.v v9, (%0)\n\t" // k22 - - "srai t0, %8, 1\n\t" // t0 = out_h >> 1 - "beqz t0, 6f\n\t" - - "1:\n\t" // out_h2_loop - - "srai t1, %9, 2\n\t" // t1 = out_w >> 2 - "beqz t1, 3f\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - // load 24 times, mac 72 times - "2:\n\t" // out_w4_loop - - "vmv.v.x v24, zero\n\t" - - "vlw.v v13, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vmv.v.x v25, zero\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vmv.v.x v26, zero\n\t" - - "vlw.v v14, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] - "vmv.v.x v27, zero\n\t" - "vfmacc.vv v26, v1, v12\n\t" // k00 * r02 out[2][0] - "vfmacc.vv v24, v4, v13\n\t" // k10 * r10 out[0][3] - - "vmv.v.x v28, 
zero\n\t" - - "vlw.v v15, (%1)\n\t" // r03 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] - "vmv.v.x v29, zero\n\t" - "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] - "vfmacc.vv v28, v1, v13\n\t" // k00 * r10 out[4][0] - - "vlw.v v16, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v26, v2, v15\n\t" // k01 * r03 out[2][1] - "vmv.v.x v30, zero\n\t" - "vfmacc.vv v25, v3, v15\n\t" // k02 * r03 out[1][2] - "vfmacc.vv v29, v1, v14\n\t" // k01 * r11 out[5][0] - - "vlw.v v17, (%1)\n\t" // r04 - "addi %1, %1, 16\n\t" - - "vmv.v.x v31, zero\n\t" - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - "vfmacc.vv v27, v1, v15\n\t" // k00 * r03 out[3][0] - "vfmacc.vv v28, v2, v14\n\t" // k01 * r11 out[4][1] - - "vlw.v v18, (%2)\n\t" // r13 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v2, v16\n\t" // k01 * r12 out[5][1] - "vfmacc.vv v30, v1, v16\n\t" // k00 * r12 out[6][0] - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - - "vlw.v v19, (%1)\n\t" // r05 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 4 element addr ************ - - "vfmacc.vv v26, v3, v17\n\t" // k02 * r04 out[2][2] - "vfmacc.vv v27, v2, v17\n\t" // k01 * r04 out[3][1] - "vfmacc.vv v28, v3, v16\n\t " // k02 * r12 out[4][2] - - "vlw.v v10, (%2)\n\t" // r14 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v4, v14\n\t" // k10 * r11 out[1][3] - "vfmacc.vv v29, v3, v18\n\t" // k02 * r13 out[5][2] - "vfmacc.vv v30, v2, v18\n\t" // k01 * r13 out[6][1] - "vfmacc.vv v31, v1, v18\n\t" // k00 * r13 out[7][0] - - "vlw.v v11, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v27, v4, v18\n\t" // k10 * r13 out[3][3] - "vfmacc.vv v24, v6, v16\n\t" // k12 * r12 out[0][5] - "vfmacc.vv v26, v4, v16\n\t" // k10 * r12 out[2][3] - "vfmacc.vv v25, v5, v16\n\t" // k11 * r12 out[1][4] - - "vlw.v v12, (%2)\n\t" // r15 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 4 element addr ************ - - "vfmacc.vv v30, v3, v10\n\t" // k02 * r14 out[6][2] - 
"vfmacc.vv v31, v2, v10\n\t" // k01 * r14 out[7][1] - "vfmacc.vv v27, v3, v19\n\t" // k02 * r05 out[3][2] - - "vlw.v v13, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v6, v18\n\t" // k12 * r13 out[1][5] - "vfmacc.vv v26, v5, v18\n\t" // k11 * r13 out[2][4] - "vfmacc.vv v28, v4, v11\n\t" // k10 * r20 out[4][3] - - "vlw.v v14, (%4)\n\t" // r30 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v27, v5, v10\n\t" // k11 * r14 out[3][4] - "vfmacc.vv v31, v3, v12\n\t" // k02 * r15 out[7][2] - "vfmacc.vv v24, v7, v11\n\t" // k20 * r20 out[0][6] - - "vlw.v v15, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v7, v13\n\t" // k20 * r21 out[1][6] - "vfmacc.vv v26, v6, v10\n\t" // k12 * r14 out[2][5] - "vfmacc.vv v29, v4, v13\n\t" // k10 * r21 out[5][3] - - "vlw.v v16, (%4)\n\t" // r31 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v27, v6, v12\n\t" // k12 * r15 out[3][5] - "vfmacc.vv v28, v5, v13\n\t" // k11 * r21 out[4][4] - "vfmacc.vv v30, v4, v15\n\t" // k10 * r22 out[6][3] - - "vlw.v v17, (%3)\n\t" // r23 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v8, v13\n\t" // k21 * r21 out[0][7] - "vfmacc.vv v25, v8, v15\n\t" // k21 * r22 out[1][7] - "vfmacc.vv v29, v5, v15\n\t" // k11 * r22 out[5][5] - - "vlw.v v18, (%4)\n\t" // r32 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v26, v7, v15\n\t" // k20 * r22 out[2][6] - "vfmacc.vv v28, v6, v15\n\t" // k12 * r22 out[4][5] - "vfmacc.vv v24, v9, v15\n\t" // k22 * r22 out[0][8] - - "vlw.v v19, (%3)\n\t" // r24 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v30, v5, v17\n\t" // k11 * r23 out[6][4] - "vfmacc.vv v29, v6, v17\n\t" // k12 * r23 out[5][5] - - "vfadd.vv v24, v24, v0\n\t" // out0 += bias - - "vfmacc.vv v27, v7, v17\n\t" // k20 * r23 out[3][6] - "vfmacc.vv v31, v4, v17\n\t" // k10 * r23 out[7][3] - - "vlw.v v13, (%4)\n\t" // r33 - "addi %4, %4, 16\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" // store out0 - "addi %6, %6, 16\n\t" - - "vfmacc.vv v26, 
v8, v17\n\t" // k21 * r23 out[2][7] - "vfmacc.vv v28, v7, v14\n\t" // k20 * r30 out[4][6] - "vfmacc.vv v29, v7, v16\n\t" // k20 * r31 out[5][6] - "vfmacc.vv v30, v6, v19\n\t" // k12 * r24 out[6][5] - - "vlw.v v14, (%3)\n\t" // r25 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 4 element addr ************ - - "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] - "vfmacc.vv v27, v8, v19\n\t" // k21 * r24 out[3][7] - "vfmacc.vv v28, v8, v16\n\t" // k21 * r31 out[4][7] - "vfmacc.vv v31, v5, v19\n\t" // k11 * r24 out[7][4] - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vfadd.vv v25, v25, v0\n\t" // out1 += bias - - "vfmacc.vv v26, v9, v19\n\t" // k22 * r24 out[2][8] - "vfmacc.vv v29, v8, v18\n\t" // k21 * r32 out[5][7] - "vfmacc.vv v30, v7, v18\n\t" // k20 * r32 out[6][6] - - "vlw.v v15, (%4)\n\t" // r34 - "addi %4, %4, 16\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v25, v25, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v25, (%6)\n\t" // store out1 - "addi %6, %6, 16\n\t" - - "vfadd.vv v26, v26, v0\n\t" // out2 += bias - - "vfmacc.vv v27, v9, v14\n\t" // k22 * r25 out[3][8] - "vfmacc.vv v28, v9, v18\n\t" // k22 * r32 out[4][8] - "vfmacc.vv v31, v6, v14\n\t" // k12 * r25 out[7][5] - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v26, v26, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v26, (%6)\n\t" // store out2 - "addi %6, %6, 16\n\t" - - "vfadd.vv v27, v27, v0\n\t" // out3 += bias - - "vfmacc.vv v29, v9, v13\n\t" // k22 * r33 out[5][8] - "vfmacc.vv v30, v8, v13\n\t" // k21 * r33 out[6][7] - "vfmacc.vv v31, v7, v13\n\t" // k20 * r33 out[7][6] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v27, v27, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v27, (%6)\n\t" // store out3 - "addi %6, %6, 16\n\t" - - "vfadd.vv v28, v28, v0\n\t" // out4 += bias - - "vlw.v v16, (%4)\n\t" // r35 - "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 4 element addr 
************ - - "vfmacc.vv v30, v9, v15\n\t" // k22 * r34 out[6][8] - "vfmacc.vv v31, v8, v15\n\t" // k21 * r34 out[7][7] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%7)\n\t" // store out4 - "addi %7, %7, 16\n\t" - - "vfadd.vv v29, v29, v0\n\t" // out5 += bias - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v31, v9, v16\n\t" // k22 * r35 out[7][8] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v29, v29, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v29, (%7)\n\t" // store out5 - "addi %7, %7, 16\n\t" - - "vfadd.vv v30, v30, v0\n\t" // out6 += bias - "vfadd.vv v31, v31, v0\n\t" // out7 += bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v30, v30, ft0\n\t" // **** relu **** - "vfmax.vf v31, v31, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v30, (%7)\n\t" // store out6 - "addi %7, %7, 16\n\t" - - "vsw.v v31, (%7)\n\t" // store out7 - "addi %7, %7, 16\n\t" - - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" - - "addi %1, %1, -48\n\t" // r0 -= 12 ********* bump r0 to origin addr ************ - - "3:\n\t" // out_w2 - "andi t1, %9, 3\n\t" // t1 = out_w & 3 - "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 - "beqz t2, 4f\n\t" - - // load 16 times, mac 36 times - "vmv.v.x v24, zero\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v25, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vmv.v.x v28, zero\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v12, (%4)\n\t" // r30 - "addi %4, %4, 16\n\t" - - "vmv.v.x v29, zero\n\t" - - "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] - - "vlw.v v13, (%4)\n\t" // r31 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v28, v7, v12\n\t" // k20 * r30 out[2][6] - - "vlw.v v14, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - "vfmacc.vv v29, v7, v13\n\t" // k20 * r31 out[3][6] - - "vlw.v v15, (%4)\n\t" // r32 - "addi %4, 
%4, 16\n\t" - - "vfmacc.vv v28, v8, v13\n\t" // k21 * r31 out[2][7] - "vfmacc.vv v25, v2, v14\n\t" // k01 * r02 out[1][1] - - "vlw.v v16, (%1)\n\t" // r03 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 2 element addr ************ - - "vfmacc.vv v24, v3, v14\n\t" // k02 * r02 out[0][2] - "vfmacc.vv v29, v8, v15\n\t" // k21 * r32 out[3][7] - - "vlw.v v17, (%4)\n\t" // r33 - "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 2 element addr ************ - - "vfmacc.vv v28, v9, v15\n\t" // k22 * r32 out[2][8] - "vfmacc.vv v25, v3, v16\n\t" // k02 * r03 out[1][2] - - "vlw.v v10, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v9, v17\n\t" // k22 * r33 out[3][8] - - "vlw.v v11, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v4, v10\n\t" // k10 * r10 out[0][3] - "vfmacc.vv v28, v1, v10\n\t" // k00 * r10 out[2][0] - - "vlw.v v12, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v4, v11\n\t" // k10 * r11 out[1][3] - "vfmacc.vv v29, v1, v11\n\t" // k00 * r11 out[3][0] - "vfmacc.vv v24, v5, v11\n\t" // k11 * r11 out[0][4] - "vfmacc.vv v28, v2, v11\n\t" // k01 * r11 out[2][1] - - "vlw.v v13, (%2)\n\t" // r13 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 2 element addr ************ - - "vfmacc.vv v25, v5, v12\n\t" // k11 * r12 out[1][4] - "vfmacc.vv v29, v2, v12\n\t" // k01 * r12 out[3][1] - "vfmacc.vv v24, v6, v12\n\t" // k12 * r12 out[0][4] - "vfmacc.vv v28, v3, v12\n\t" // k02 * r12 out[2][2] - - "vlw.v v14, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v6, v13\n\t" // k12 * r13 out[1][5] - "vfmacc.vv v29, v3, v13\n\t" // k02 * r13 out[3][2] - - "vlw.v v15, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v7, v14\n\t" // k20 * r20 out[0][6] - "vfmacc.vv v28, v4, v14\n\t" // k10 * r20 out[2][3] - - "vlw.v v16, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v7, v15\n\t" // k20 * r21 out[1][6] - "vfmacc.vv v29, v4, v15\n\t" // k10 * r21 out[3][3] - "vfmacc.vv 
v24, v8, v15\n\t" // k21 * r21 out[0][7] - "vfmacc.vv v28, v5, v15\n\t" // k11 * r21 out[2][4] - - "vlw.v v17, (%3)\n\t" // r23 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 2 element addr ************ - - "vfmacc.vv v25, v8, v16\n\t" // k21 * r22 out[1][7] - "vfmacc.vv v29, v5, v16\n\t" // k11 * r22 out[3][4] - "vfmacc.vv v24, v9, v16\n\t" // k22 * r22 out[0][8] - "vfmacc.vv v28, v6, v16\n\t" // k12 * r22 out[2][5] - - "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] - "vfmacc.vv v29, v6, v17\n\t" // k12 * r23 out[3][5] - - "vfadd.vv v24, v24, v0\n\t" - "vfadd.vv v25, v25, v0\n\t" - "vfadd.vv v28, v28, v0\n\t" - "vfadd.vv v29, v29, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** - "vfmax.vf v25, v25, ft0\n\t" // **** relu **** - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** - "vfmax.vf v29, v29, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" // store outptr[0][0] - "addi %6, %6,16\n\t" - - "vsw.v v25, (%6)\n\t" // store outptr[0][0] - "addi %6, %6, 16\n\t" - - "vsw.v v28, (%7)\n\t" // store outptr[1][0] - "addi %7, %7,16\n\t" - - "vsw.v v29, (%7)\n\t" // store outptr[1][0] - "addi %7, %7, 16\n\t" - - "4:\n\t" // out_w_tail - - "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 - "beqz t2, 5f\n\t" - - // load 12 times, mac 18 times - - "vmv.v.x v24, zero\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v28, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 1 element addr ************ - - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - - "vlw.v v13, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - - "vlw.v v14, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v1, v13\n\t" // k00 * r10 out[1][0] - "vfmacc.vv v24, v4, 
v13\n\t" // k10 * r10 out[0][3] - - "vlw.v v15, (%2)\n\t" // r12 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 1 element addr ************ - - "vfmacc.vv v28, v2, v14\n\t" // k01 * r11 out[1][1] - "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] - - "vlw.v v16, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v3, v15\n\t" // k02 * r12 out[1][2] - "vfmacc.vv v24, v6, v15\n\t" // k12 * r12 out[0][5] - - "vlw.v v17, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v4, v16\n\t" // k10 * r20 out[1][3] - "vfmacc.vv v24, v7, v16\n\t" // k20 * r20 out[0][6] - - "vlw.v v18, (%3)\n\t" // r22 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 1 element addr ************ - - "vfmacc.vv v28, v5, v17\n\t" // k11 * r21 out[1][4] - "vfmacc.vv v24, v8, v17\n\t" // k21 * r21 out[0][7] - - "vlw.v v10, (%4)\n\t" // r30 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v28, v6, v18\n\t" // k12 * r22 out[1][5] - "vfmacc.vv v24, v9, v18\n\t" // k22 * r22 out[0][8] - - "vlw.v v11, (%4)\n\t" // r31 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v28, v7, v10\n\t" // k20 * r30 out[1][6] - "vfadd.vv v24, v24, v0\n\t" // add bias - - "vlw.v v12, (%4)\n\t" // r32 - "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 1 element addr ************ - - "vfmacc.vv v28, v8, v11\n\t" // k21 * r31 out[1][7] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" // store outptr[0][0] - "addi %6, %6, 16\n\t" - - "vfmacc.vv v28, v9, v12\n\t" // k22 * r32 out[1][8] - "vfadd.vv v28, v28, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%7)\n\t" // store outptr[1][0] - "addi %7, %7, 16\n\t" - - "5:\n\t" // out_h2_loop cnt - "addi t2, %10, 2\n\t" // in_w + 2 - "slli t2, t2, 4\n\t" // (in_w + 2) * 4 * 4 - "slli t3, %9, 4\n\t" // out_w * 4 * 4 - - "add %1, %1, t2\n\t" - "add %2, %2, t2\n\t" - "add %3, %3, t2\n\t" 
- "add %4, %4, t2\n\t" // r0/r1/r2/r3 += (in_w + 2) * 4 - - "add %6, %6, t3\n\t" - "add %7, %7, t3\n\t" // outprt0/outptr1 += out_w * 4 - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "6:\n\t" // out_h_tail : can only be executed once - - "andi t0, %8, 1\n\t" // t0 = out_h & 1 - "beqz t0, 10f\n\t" - - "srai t1, %9, 2\n\t" // t1 = out_w >> 2 - "beqz t1, 8f\n\t" - - // 在这里先载入第一次执行的rxx, 减少内循环依赖,便于指令流水 - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - // load 18 times, mac 36 次 - "7:\n\t" // out_w4_loop - - "vmv.v.x v24, zero\n\t" - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - "vmv.v.x v25, zero\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v13, (%1)\n\t" // r03 - "addi %1, %1, 16\n\t" - - "vmv.v.x v26, zero\n\t" - - "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] - - "vlw.v v14, (%1)\n\t" // r04 - "addi %1, %1, 16\n\t" - "vmv.v.x v27, zero\n\t" - - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - "vfmacc.vv v26, v1, v12\n\t" // k00 * r02 out[2][0] - - "vlw.v v15, (%1)\n\t" // r05 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 4 elements addr ************ - - "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] - "vfmacc.vv v27, v1, v13\n\t" // k00 * r03 out[3][0] - - "vlw.v v16, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v26, v2, v13\n\t" // k01 * r03 out[2][1] - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - "vfmacc.vv v25, v3, v13\n\t" // k02 * r03 out[1][2] - - "vlw.v v17, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v27, v2, v14\n\t" // k01 * r04 out[3][1] - "vfmacc.vv v26, v3, v14\n\t" // k02 * r04 out[2][2] - - "vlw.v v18, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v4, v16\n\t" // k10 * r10 out[0][3] - "vfmacc.vv v27, v3, v15\n\t" // k02 * r05 out[3][2] - - "vlw.v v19, (%2)\n\t" // r13 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v4, v17\n\t" // k10 * r11 out[1][3] - "vfmacc.vv v24, v5, 
v17\n\t" // k11 * r11 out[0][4] - - "vlw.v v12, (%2)\n\t" // r14 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v26, v4, v18\n\t" // k10 * r12 out[2][3] - "vfmacc.vv v25, v5, v18\n\t" // k12 * r13 out[1][4] - - "vlw.v v13, (%2)\n\t" // r15 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 4 elements addr ************ - - "vfmacc.vv v27, v4, v19\n\t" // k10 * r13 out[3][3] - "vfmacc.vv v24, v6, v18\n\t" // k12 * r12 out[0][5] - - "vlw.v v14, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v6, v19\n\t" // k12 * r13 out[1][5] - "vfmacc.vv v26, v5, v19\n\t" // k11 * r13 out[2][4] - "vfmacc.vv v27, v5, v12\n\t" // k11 * r14 out[3][4] - - "vlw.v v15, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v7, v14\n\t" // k20 * r20 out[0][6] - "vfmacc.vv v26, v6, v12\n\t" // k12 * r14 out[2][5] - - "vlw.v v16, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v27, v6, v13\n\t" // k12 * r15 out[3][5] - "vfmacc.vv v25, v7, v15\n\t" // k20 * r21 out[1][6] - - "vlw.v v17, (%3)\n\t" // r23 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v8, v15\n\t" // k21 * r21 out[0][7] - "vfmacc.vv v26, v7, v16\n\t" // k20 * r22 out[2][6] - - "vlw.v v18, (%3)\n\t" // r24 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v8, v16\n\t" // k21 * r22 out[1][7] - "vfmacc.vv v27, v7, v17\n\t" // k20 * r23 out[3][6] - - "vlw.v v19, (%3)\n\t" // r25 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 4 elements addr ************ - - "vfmacc.vv v24, v9, v16\n\t" // k22 * r22 out[0][8] - "vfmacc.vv v26, v8, v17\n\t" // k21 * r23 out[2][7] - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vfadd.vv v24, v24, v0\n\t" - - "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] - "vfmacc.vv v27, v8, v18\n\t" // k21 * r24 out[3][7] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" - "addi %6, %6, 16\n\t" // store out0 - - "vfadd.vv v25, v25, v0\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 
16\n\t" - - "vfmacc.vv v26, v9, v18\n\t" // k22 * r24 out[2][8] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v25, v25, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v25, (%6)\n\t" - "addi %6, %6, 16\n\t" // store out1 - - "vfmacc.vv v27, v9, v19\n\t" // k22 * r25 out[3][8] - - "vfadd.vv v26, v26, v0\n\t" - "vfadd.vv v27, v27, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v26, v26, ft0\n\t" // **** relu **** - "vfmax.vf v27, v27, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v26, (%6)\n\t" - "addi %6, %6, 16\n\t" // store out2 - - "vsw.v v27, (%6)\n\t" - "addi %6, %6, 16\n\t" // store out3 - - "addi t1, t1, -1\n\t" - "bnez t1, 7b\n\t" - - "addi %1, %1, -32\n\t" // r0 -= 8 ********* bump r0 to origin addr ************ - - "8:\n\t" // out_w2 - - "andi t1, %9, 3\n\t" // t1 = out_w & 3 - "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 - "beqz t2, 9f\n\t" - - // load 12 times, mac 18 times - - "vmv.v.x v24, zero\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v25, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - - "vlw.v v13, (%1)\n\t" // r03 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 2 elements addr ************ - - "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - - "vlw.v v14, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v3, v13\n\t" // k02 * r03 out[1][2] - - "vlw.v v15, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v4, v14\n\t" // k10 * r10 out[0][3] - - "vlw.v v16, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v4, v15\n\t" // k10 * r11 out[1][3] - "vfmacc.vv v24, v5, v15\n\t" // k11 * r11 out[0][4] - - "vlw.v v17, (%2)\n\t" // r13 - "addi %2, 
%2, -16\n\t" // r1 -= 4 ********* bump r1 to next 2 elements addr ************ - - "vfmacc.vv v25, v5, v16\n\t" // k11 * r12 out[1][4] - "vfmacc.vv v24, v6, v16\n\t" // k12 * r12 out[0][5] - - "vlw.v v10, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v6, v17\n\t" // k12 * r13 out[1][5] - - "vlw.v v11, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v7, v10\n\t" // k20 * r20 out[0][6] - - "vlw.v v12, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v7, v11\n\t" // k20 * r21 out[1][6] - "vfmacc.vv v24, v8, v11\n\t" // k21 * r21 out[0][7] - - "vlw.v v13, (%3)\n\t" // r23 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 2 elements addr ************ - - "vfmacc.vv v25, v8, v12\n\t" // k21 * r22 out[1][7] - "vfmacc.vv v24, v9, v12\n\t" // k22 * r22 out[0][8] - - "vfmacc.vv v25, v9, v13\n\t" // k22 * r23 out[1][8] - - "vfadd.vv v24, v24, v0\n\t" - "vfadd.vv v25, v25, v0\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** - "vfmax.vf v25, v25, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" - "addi %6, %6, 16\n\t" - - "vsw.v v25, (%6)\n\t" - "addi %6, %6, 16\n\t" - - "9:\n\t" // out_w_tail - "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 - "beqz t2, 10f\n\t" - - // load 9 times, mac 9 times - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v24, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 1 elements addr ************ - - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - - "vlw.v v13, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - - "vlw.v v14, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v4, v13\n\t" // k10 * r10 out[0][3] - - "vlw.v v15, (%2)\n\t" // r12 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 
to next 1 elements addr ************ - - "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] - - "vlw.v v16, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v6, v15\n\t" // k12 * r12 out[0][5] - - "vlw.v v17, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v7, v16\n\t" // k20 * r20 out[0][6] - - "vlw.v v18, (%3)\n\t" // r22 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 1 elements addr ************ - - "vfmacc.vv v24, v8, v17\n\t" // k21 * r21 out[0][7] - "vfmacc.vv v24, v9, v18\n\t" // k22 * r22 out[0][8] - - "vfadd.vv v24, v24, v0\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" - "addi %6, %6, 16\n\t" - - "10:\n\t" - // updata addr - "addi %1, %1, 32\n\t" // r0 += 2 * 4 * 4 - "addi %2, %2, 32\n\t" // r1 += 2 * 4 * 4 - "addi %3, %3, 32\n\t" // r2 += 2 * 4 * 4 - - :"=r"(kernel0), // %0 - "=r"(r0), // %1 - "=r"(r1), // %2 - "=r"(r2), // %3 - "=r"(r3), // %4 - "=r"(bias0), // %5 - "=r"(outptr0), // %6 - "=r"(outptr1), // %7 - "=r"(out_h), // %8 - "=r"(out_w), // %9 - "=r"(in_w) // %10 - :"0"(kernel0), - "1"(r0), - "2"(r1), - "3"(r2), - "4"(r3), - "5"(bias0), - "6"(outptr0), - "7"(outptr1), - "8"(out_h), - "9"(out_w), - "10"(in_w) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "ft0", "t0", "t1", "t2", "t3" - ); - } - return CSINN_TRUE; -} - - -/* - (1) Algorithm works as follows: - out_h1_loop: out_h1_w4_loop --> out_h1_wtail - - (2) register definition: - t0: i_out_h - t1: i_out_w - v0: bias_data - v1-v9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] - v10-v20: r00-r08 / r10-r18 / r20-r28 - v28-v31: output_data - - Due to pack4, both kxx and rxx actually occupy a v register - - TODO: how to pack for input / kernel / bias / output - padding -*/ - -int DWCONV3X3S2_PACK4(struct 
csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - - int32_t out_c = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int tailstep = (in_w - 2 * out_w + in_w) * 4; - - - for (int c = 0; c < in_c / 4; c++) { - - float *out = output_data + c * out_h * out_w * 4; - float *outptr0 = out; - - const float *img0 = input_data + c * in_h * in_w * 4; - const float *r0 = img0; - const float *r1 = r0 + in_w * 4; - const float *r2 = r1 + in_w * 4; - - const float *kernel0 = kernel_data + c * 9 * 4; - - const float *bias0 = NULL; - if (bias_data && bias->dim_count != 0) { - bias0 = bias_data + c * 4; - } - - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 - -#ifdef FUSE_CONV_RELU - "fmv.w.x ft0, zero\n\t" -#endif // FUSE_CONV_RELU - - "vmv.v.x v0, zero\n\t" // clear v0 - "beqz %4, 0f\n\t" // if bias_data = NULL clear v0 - "vlw.v v0, (%4)\n\t" - - "0:\n\t" - - "vlw.v v1, (%0)\n\t" // k00 - "addi %0, %0, 16\n\t" // kernel += 4 - "vlw.v v2, (%0)\n\t" // k01 - "addi %0, %0, 16\n\t" - "vlw.v v3, (%0)\n\t" // k02 - "addi %0, %0, 16\n\t" - "vlw.v v4, (%0)\n\t" // k10 - "addi %0, %0, 16\n\t" - "vlw.v v5, (%0)\n\t" // k11 - "addi %0, %0, 16\n\t" - "vlw.v v6, (%0)\n\t" // k12 - "addi %0, %0, 16\n\t" - "vlw.v v7, (%0)\n\t" // k20 - "addi %0, %0, 16\n\t" - "vlw.v v8, (%0)\n\t" // k21 - "addi %0, %0, 16\n\t" - "vlw.v v9, (%0)\n\t" // k22 - - "mv t0, %6\n\t" // i_out_h = out_h - - "1:\n\t" // out_h1_loop - - "srai t1, %7, 2\n\t" // t1 = out_w >> 2 - "beqz t1, 3f\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" // 
r0 += 4 - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "2:\n\t" // out_w4_loop - - "vmv.v.x v28, zero\n\t" - "vmv.v.x v29, zero\n\t" - "vmv.v.x v30, zero\n\t" - "vmv.v.x v31, zero\n\t" - - "vlw.v v13, (%1)\n\t" // r03 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 out0 - - "vlw.v v14, (%1)\n\t" // r04 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v29, v1, v12\n\t" // k00 * r02 out1 - - "vlw.v v15, (%1)\n\t" // r05 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v2, v11\n\t" // k01 * r01 out0 - - "vlw.v v16, (%1)\n\t" // r06 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v29, v2, v13\n\t" // k01 * r03 out1 - - "vlw.v v17, (%1)\n\t" // r07 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v3, v12\n\t" // k02 * r02 out0 - - "vlw.v v18, (%1)\n\t" // r08 - // "addi %1, %1, 16\n\t" - - "vfmacc.vv v29, v3, v14\n\t" // k02 * r04 out1 - - "vlw.v v10, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v30, v1, v14\n\t" // k00 * r04 out2 - - "vlw.v v11, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v31, v1, v16\n\t" // k00 * r06 out3 - - "vlw.v v12, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v30, v2, v15\n\t" // k01 * r05 out2 - - "vlw.v v13, (%2)\n\t" // r13 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v31, v2, v17\n\t" // k01 * r07 out3 - - "vlw.v v14, (%2)\n\t" // r14 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v30, v3, v16\n\t" // k02 * r06 out2 - - "vlw.v v15, (%2)\n\t" // r15 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v31, v3, v18\n\t" // k02 * r08 out3 - - "vlw.v v16, (%2)\n\t" // r16 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v4, v10\n\t" // k10 * r10 out0 - - "vlw.v v17, (%2)\n\t" // r17 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v4, v12\n\t" // k10 * r12 out1 - - "vlw.v v18, (%2)\n\t" // r18 - // "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v5, v11\n\t" // k11 * r11 out0 - - "vlw.v v10, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v5, v13\n\t" // k11 * r13 out1 - - 
"vlw.v v11, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v6, v12\n\t" // k12 * r12 out0 - - "vlw.v v12, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v6, v14\n\t" // k12 * r14 out1 - - "vlw.v v13, (%3)\n\t" // r23 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v30, v4, v14\n\t" // k10 * r14 out2 - - "vlw.v v14, (%3)\n\t" // r24 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v31, v4, v16\n\t" // k10 * r16 out3 - - "vlw.v v19, (%3)\n\t" // r25 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v30, v5, v15\n\t" // k11 * r15 out2 - - "vlw.v v20, (%3)\n\t" // r26 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v31, v5, v17\n\t" // k11 * r17 out3 - - "vlw.v v15, (%3)\n\t" // r27 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v30, v6, v16\n\t" // k12 * r16 out2 - - "vlw.v v16, (%3)\n\t" // r28 - // "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v7, v10\n\t" // k20 * r20 out0 - "vfmacc.vv v31, v6, v18\n\t" // k12 * r18 out3 - - "vlw.v v10, (%1)\n\t" // r00 ******** load r00-r02 for next loop ******* - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v8, v11\n\t" // k21 * r21 out0 - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v9, v12\n\t" // k22 * r22 out0 - "vfmacc.vv v29, v7, v12\n\t" // k20 * r22 out1 - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v29, v8, v13\n\t" // k21 * r23 out1 - "vfmacc.vv v29, v9, v14\n\t" // k22 * r24 out1 - "vfmacc.vv v30, v7, v14\n\t" // k20 * r24 out2 - "vfmacc.vv v31, v7, v20\n\t" // k20 * r26 out3 - "vfmacc.vv v30, v8, v19\n\t" // k21 * r25 out2 - "vfmacc.vv v31, v8, v15\n\t" // k21 * r27 out3 - "vfmacc.vv v30, v9, v20\n\t" // k22 * r26 out2 - "vfmacc.vv v31, v9, v16\n\t" // k22 * r28 out3 - - "vfadd.vv v28, v28, v0\n\t" - "vfadd.vv v29, v29, v0\n\t" - "vfadd.vv v30, v30, v0\n\t" - "vfadd.vv v31, v31, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** - "vfmax.vf v29, v29, ft0\n\t" // **** relu **** - "vfmax.vf v30, v30, ft0\n\t" // **** relu **** - "vfmax.vf v31, v31, 
ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "vsw.v v29, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "vsw.v v30, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "vsw.v v31, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "addi t1, t1, -1\n\t" // loop cnt - "bnez t1, 2b\n\t" - - "addi %1, %1, -48\n\t" // r0 -= 12 ********* bump r0 to origin addr ************ - - "3:\n\t" // out_w2 : can only be executed once - - "andi t1, %7, 3\n\t" // t1 = out_w & 3 - "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 - "beqz t2, 4f\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v28, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vmv.v.x v29, zero\n\t" - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 out0 - - "vlw.v v13, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v1, v12\n\t" // k00 * r02 out1 - - "vlw.v v14, (%1)\n\t" // r03 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v2, v11\n\t" // k01 * r01 out0 - - "vlw.v v15, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v2, v14\n\t" // k01 * r03 out1 - - "vlw.v v16, (%1)\n\t" // r04 - - "vfmacc.vv v28, v3, v12\n\t" // k02 * r02 out0 - - "vlw.v v17, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v3, v16\n\t" // k02 * r04 out1 - - "vlw.v v18, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v4, v13\n\t" // k10 * r10 out0 - - "vlw.v v19, (%2)\n\t" // r13 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v4, v17\n\t" // k10 * r12 out1 - "vfmacc.vv v28, v6, v17\n\t" // k12 * r12 out0 - - "vlw.v v20, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v5, v19\n\t" // k11 * r13 out1 - "vfmacc.vv v28, v5, v15\n\t" // k11 * r11 out0 - - "vlw.v v10, (%2)\n\t" // r14 - // "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v7, v18\n\t" // k20 * r20 out0 - - "vlw.v v11, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v6, v10\n\t" // k12 * r14 out1 - - 
"vlw.v v12, (%3)\n\t" // r23 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v8, v20\n\t" // k21 * r21 out0 - "vfmacc.vv v29, v7, v11\n\t" // k20 * r22 out1 - - "vlw.v v13, (%3)\n\t" // r24 - // "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v8, v12\n\t" // k21 * r23 out1 - "vfmacc.vv v28, v9, v11\n\t" // k22 * r22 out0 - "vfmacc.vv v29, v9, v13\n\t" // k22 * r24 out1 - - "vfadd.vv v28, v28, v0\n\t" - "vfadd.vv v29, v29, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** - "vfmax.vf v29, v29, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "vsw.v v29, (%5)\n\t" - "addi %5, %5, 16\n\t" - - - "4:\n\t" // out_w_tail : can only be executed once - "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 - "beqz t2, 5f\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v28, zero\n\t" - - "vlw.v v11, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 - - "vlw.v v12, (%3)\n\t" - "addi %3, %3, 16\n\t" // r20 - - "vfmacc.vv v28, v4, v11\n\t" // k10 * r10 - - "vlw.v v13, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v7, v12\n\t" // k20 * r20 - - "vlw.v v14, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v2, v13\n\t" // k01 * r01 - - "vlw.v v15, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v5, v14\n\t" // k11 * r11 - - "vlw.v v16, (%1)\n\t" // r02 - - "vfmacc.vv v28, v8, v15\n\t" // k21 * r21 - - "vlw.v v17, (%2)\n\t" // r12 - - "vfmacc.vv v28, v3, v16\n\t" // k02 * r02 - - "vlw.v v18, (%3)\n\t" // r22 - - "vfmacc.vv v28, v6, v17\n\t" // k12 * r12 - "vfmacc.vv v28, v9, v18\n\t" // k22 * r22 - - "vfadd.vv v28, v28, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "5:\n\t" - - "slli t2, %8, 2\n\t" // t2 = tailstep * 4 - "add %1, %1, t2\n\t" - "add %2, %2, t2\n\t" - "add %3, 
%3, t2\n\t" // r0/r1/r2 += tailstep - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - :"=r"(kernel0), // %0 - "=r"(r0), // %1 - "=r"(r1), // %2 - "=r"(r2), // %3 - "=r"(bias0), // %4 - "=r"(outptr0), // %5 - "=r"(out_h), // %6 - "=r"(out_w), // %7 - "=r"(tailstep) // %8 - :"0"(kernel0), - "1"(r0), - "2"(r1), - "3"(r2), - "4"(bias0), - "5"(outptr0), - "6"(out_h), - "7"(out_w), - "8"(tailstep) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v28", "v29", "v30", "v31", "ft0", "t0", "t1", "t2" - - ); - } - return CSINN_TRUE; -} diff --git a/source/c906_opt/depthwise_convolution_3x3_pack4_fp32.c b/source/c906_opt/depthwise_convolution_3x3_pack4_fp32.c new file mode 100644 index 00000000..b6d2e22b --- /dev/null +++ b/source/c906_opt/depthwise_convolution_3x3_pack4_fp32.c @@ -0,0 +1,1487 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c906.h" + +#ifndef DWCONV3X3S1_PACK4 +#define DWCONV3X3S1_PACK4 shl_c906_dwconv3x3s1_pack4 +#endif + +#ifndef DWCONV3X3S2_PACK4 +#define DWCONV3X3S2_PACK4 shl_c906_dwconv3x3s2_pack4 +#endif + +/************************************************************************************************************ + c906 vlen = 128, 128/32 = 4 --> pack4, if vlen = 256 256/32 = 8 --> pack8 + input, kernel, bias, output layout: + input: [c/4, in_h, in_w, 4] + kernel: [c/4, k_h*k_w, 4] + bias: [c/4, 4] + output: [c/4, out_h, out_w, 4] + + constraint: in_channel = out_channel and is a multiple of 4 + No reference implementation +**************************************************************************************************************/ + +/* + (1) Algorithm works as follows: + out_h2: out_h2_w4_loop --> out_h2_wtail + out_h_tail: out_h1_w4_loop --> out_h1_wtail + + (2) register definition: + t0: i_out_h + t1: i_out_w + v0: bias_data + v1-v9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] + v10-v19: r00-r05 / r10-r15 / r20-r25 / r30-r35 + v24-v27: outptr0[0-3] line0 + v28-v31: outptr1[0-3] line1 + + Due to pack4, both kxx and rxx actually occupy a v register + + TODO: how to pack for input / kernel / bias / output + padding +*/ + +int DWCONV3X3S1_PACK4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + for (int c = 0; c < in_c / 4; c++) { + float *out = output_data + c * out_h * out_w * 4; + 
float *outptr0 = out; + float *outptr1 = outptr0 + out_w * 4; + + const float *img0 = input_data + c * in_h * in_w * 4; + const float *r0 = img0; + const float *r1 = r0 + in_w * 4; + const float *r2 = r1 + in_w * 4; + const float *r3 = r2 + in_w * 4; + + const float *kernel0 = kernel_data + c * 9 * 4; + + const float *bias0 = NULL; + if (bias_data && bias->dim_count != 0) { + bias0 = bias_data + c * 4; + } + + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" + +#ifdef FUSE_CONV_RELU + "fmv.w.x ft0, zero\n\t" +#endif // FUSE_CONV_RELU + + "vmv.v.x v0, zero\n\t" // clear v0 + "beqz %5, 0f\n\t" // if bias_data = NULL clear v0 + "vlw.v v0, (%5)\n\t" + + "0:\n\t" + + "vlw.v v1, (%0)\n\t" // k00 + "addi %0, %0, 16\n\t" // kernel += 4 + "vlw.v v2, (%0)\n\t" // k01 + "addi %0, %0, 16\n\t" + "vlw.v v3, (%0)\n\t" // k02 + "addi %0, %0, 16\n\t" + "vlw.v v4, (%0)\n\t" // k10 + "addi %0, %0, 16\n\t" + "vlw.v v5, (%0)\n\t" // k11 + "addi %0, %0, 16\n\t" + "vlw.v v6, (%0)\n\t" // k12 + "addi %0, %0, 16\n\t" + "vlw.v v7, (%0)\n\t" // k20 + "addi %0, %0, 16\n\t" + "vlw.v v8, (%0)\n\t" // k21 + "addi %0, %0, 16\n\t" + "vlw.v v9, (%0)\n\t" // k22 + + "srai t0, %8, 1\n\t" // t0 = out_h >> 1 + "beqz t0, 6f\n\t" + + "1:\n\t" // out_h2_loop + + "srai t1, %9, 2\n\t" // t1 = out_w >> 2 + "beqz t1, 3f\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + // load 24 times, mac 72 times + "2:\n\t" // out_w4_loop + + "vmv.v.x v24, zero\n\t" + + "vlw.v v13, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vmv.v.x v25, zero\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vmv.v.x v26, zero\n\t" + + "vlw.v v14, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] + "vmv.v.x v27, zero\n\t" + "vfmacc.vv v26, v1, v12\n\t" // k00 * r02 out[2][0] + "vfmacc.vv v24, v4, v13\n\t" // k10 * r10 out[0][3] + + "vmv.v.x v28, 
zero\n\t" + + "vlw.v v15, (%1)\n\t" // r03 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] + "vmv.v.x v29, zero\n\t" + "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] + "vfmacc.vv v28, v1, v13\n\t" // k00 * r10 out[4][0] + + "vlw.v v16, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v26, v2, v15\n\t" // k01 * r03 out[2][1] + "vmv.v.x v30, zero\n\t" + "vfmacc.vv v25, v3, v15\n\t" // k02 * r03 out[1][2] + "vfmacc.vv v29, v1, v14\n\t" // k01 * r11 out[5][0] + + "vlw.v v17, (%1)\n\t" // r04 + "addi %1, %1, 16\n\t" + + "vmv.v.x v31, zero\n\t" + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + "vfmacc.vv v27, v1, v15\n\t" // k00 * r03 out[3][0] + "vfmacc.vv v28, v2, v14\n\t" // k01 * r11 out[4][1] + + "vlw.v v18, (%2)\n\t" // r13 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v2, v16\n\t" // k01 * r12 out[5][1] + "vfmacc.vv v30, v1, v16\n\t" // k00 * r12 out[6][0] + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + + "vlw.v v19, (%1)\n\t" // r05 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 4 element addr + // ************ + + "vfmacc.vv v26, v3, v17\n\t" // k02 * r04 out[2][2] + "vfmacc.vv v27, v2, v17\n\t" // k01 * r04 out[3][1] + "vfmacc.vv v28, v3, v16\n\t " // k02 * r12 out[4][2] + + "vlw.v v10, (%2)\n\t" // r14 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v4, v14\n\t" // k10 * r11 out[1][3] + "vfmacc.vv v29, v3, v18\n\t" // k02 * r13 out[5][2] + "vfmacc.vv v30, v2, v18\n\t" // k01 * r13 out[6][1] + "vfmacc.vv v31, v1, v18\n\t" // k00 * r13 out[7][0] + + "vlw.v v11, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v27, v4, v18\n\t" // k10 * r13 out[3][3] + "vfmacc.vv v24, v6, v16\n\t" // k12 * r12 out[0][5] + "vfmacc.vv v26, v4, v16\n\t" // k10 * r12 out[2][3] + "vfmacc.vv v25, v5, v16\n\t" // k11 * r12 out[1][4] + + "vlw.v v12, (%2)\n\t" // r15 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 4 element addr + // ************ + + "vfmacc.vv v30, v3, v10\n\t" // k02 * r14 out[6][2] 
+ "vfmacc.vv v31, v2, v10\n\t" // k01 * r14 out[7][1] + "vfmacc.vv v27, v3, v19\n\t" // k02 * r05 out[3][2] + + "vlw.v v13, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v6, v18\n\t" // k12 * r13 out[1][5] + "vfmacc.vv v26, v5, v18\n\t" // k11 * r13 out[2][4] + "vfmacc.vv v28, v4, v11\n\t" // k10 * r20 out[4][3] + + "vlw.v v14, (%4)\n\t" // r30 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v27, v5, v10\n\t" // k11 * r14 out[3][4] + "vfmacc.vv v31, v3, v12\n\t" // k02 * r15 out[7][2] + "vfmacc.vv v24, v7, v11\n\t" // k20 * r20 out[0][6] + + "vlw.v v15, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v7, v13\n\t" // k20 * r21 out[1][6] + "vfmacc.vv v26, v6, v10\n\t" // k12 * r14 out[2][5] + "vfmacc.vv v29, v4, v13\n\t" // k10 * r21 out[5][3] + + "vlw.v v16, (%4)\n\t" // r31 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v27, v6, v12\n\t" // k12 * r15 out[3][5] + "vfmacc.vv v28, v5, v13\n\t" // k11 * r21 out[4][4] + "vfmacc.vv v30, v4, v15\n\t" // k10 * r22 out[6][3] + + "vlw.v v17, (%3)\n\t" // r23 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v8, v13\n\t" // k21 * r21 out[0][7] + "vfmacc.vv v25, v8, v15\n\t" // k21 * r22 out[1][7] + "vfmacc.vv v29, v5, v15\n\t" // k11 * r22 out[5][5] + + "vlw.v v18, (%4)\n\t" // r32 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v26, v7, v15\n\t" // k20 * r22 out[2][6] + "vfmacc.vv v28, v6, v15\n\t" // k12 * r22 out[4][5] + "vfmacc.vv v24, v9, v15\n\t" // k22 * r22 out[0][8] + + "vlw.v v19, (%3)\n\t" // r24 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v30, v5, v17\n\t" // k11 * r23 out[6][4] + "vfmacc.vv v29, v6, v17\n\t" // k12 * r23 out[5][5] + + "vfadd.vv v24, v24, v0\n\t" // out0 += bias + + "vfmacc.vv v27, v7, v17\n\t" // k20 * r23 out[3][6] + "vfmacc.vv v31, v4, v17\n\t" // k10 * r23 out[7][3] + + "vlw.v v13, (%4)\n\t" // r33 + "addi %4, %4, 16\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" // store out0 + "addi %6, %6, 16\n\t" + + "vfmacc.vv v26, 
v8, v17\n\t" // k21 * r23 out[2][7] + "vfmacc.vv v28, v7, v14\n\t" // k20 * r30 out[4][6] + "vfmacc.vv v29, v7, v16\n\t" // k20 * r31 out[5][6] + "vfmacc.vv v30, v6, v19\n\t" // k12 * r24 out[6][5] + + "vlw.v v14, (%3)\n\t" // r25 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 4 element addr + // ************ + + "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] + "vfmacc.vv v27, v8, v19\n\t" // k21 * r24 out[3][7] + "vfmacc.vv v28, v8, v16\n\t" // k21 * r31 out[4][7] + "vfmacc.vv v31, v5, v19\n\t" // k11 * r24 out[7][4] + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vfadd.vv v25, v25, v0\n\t" // out1 += bias + + "vfmacc.vv v26, v9, v19\n\t" // k22 * r24 out[2][8] + "vfmacc.vv v29, v8, v18\n\t" // k21 * r32 out[5][7] + "vfmacc.vv v30, v7, v18\n\t" // k20 * r32 out[6][6] + + "vlw.v v15, (%4)\n\t" // r34 + "addi %4, %4, 16\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v25, v25, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v25, (%6)\n\t" // store out1 + "addi %6, %6, 16\n\t" + + "vfadd.vv v26, v26, v0\n\t" // out2 += bias + + "vfmacc.vv v27, v9, v14\n\t" // k22 * r25 out[3][8] + "vfmacc.vv v28, v9, v18\n\t" // k22 * r32 out[4][8] + "vfmacc.vv v31, v6, v14\n\t" // k12 * r25 out[7][5] + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v26, v26, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v26, (%6)\n\t" // store out2 + "addi %6, %6, 16\n\t" + + "vfadd.vv v27, v27, v0\n\t" // out3 += bias + + "vfmacc.vv v29, v9, v13\n\t" // k22 * r33 out[5][8] + "vfmacc.vv v30, v8, v13\n\t" // k21 * r33 out[6][7] + "vfmacc.vv v31, v7, v13\n\t" // k20 * r33 out[7][6] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v27, v27, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v27, (%6)\n\t" // store out3 + "addi %6, %6, 16\n\t" + + "vfadd.vv v28, v28, v0\n\t" // out4 += bias + + "vlw.v v16, (%4)\n\t" // r35 + "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 4 element 
addr + // ************ + + "vfmacc.vv v30, v9, v15\n\t" // k22 * r34 out[6][8] + "vfmacc.vv v31, v8, v15\n\t" // k21 * r34 out[7][7] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%7)\n\t" // store out4 + "addi %7, %7, 16\n\t" + + "vfadd.vv v29, v29, v0\n\t" // out5 += bias + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v31, v9, v16\n\t" // k22 * r35 out[7][8] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v29, v29, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v29, (%7)\n\t" // store out5 + "addi %7, %7, 16\n\t" + + "vfadd.vv v30, v30, v0\n\t" // out6 += bias + "vfadd.vv v31, v31, v0\n\t" // out7 += bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v30, v30, ft0\n\t" // **** relu **** + "vfmax.vf v31, v31, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v30, (%7)\n\t" // store out6 + "addi %7, %7, 16\n\t" + + "vsw.v v31, (%7)\n\t" // store out7 + "addi %7, %7, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + "addi %1, %1, -48\n\t" // r0 -= 12 ********* bump r0 to origin addr + // ************ + + "3:\n\t" // out_w2 + "andi t1, %9, 3\n\t" // t1 = out_w & 3 + "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 + "beqz t2, 4f\n\t" + + // load 16 times, mac 36 times + "vmv.v.x v24, zero\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v25, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vmv.v.x v28, zero\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v12, (%4)\n\t" // r30 + "addi %4, %4, 16\n\t" + + "vmv.v.x v29, zero\n\t" + + "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] + + "vlw.v v13, (%4)\n\t" // r31 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v28, v7, v12\n\t" // k20 * r30 out[2][6] + + "vlw.v v14, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + "vfmacc.vv v29, v7, v13\n\t" // k20 * r31 out[3][6] + + "vlw.v v15, (%4)\n\t" // 
r32 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v28, v8, v13\n\t" // k21 * r31 out[2][7] + "vfmacc.vv v25, v2, v14\n\t" // k01 * r02 out[1][1] + + "vlw.v v16, (%1)\n\t" // r03 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 2 element addr + // ************ + + "vfmacc.vv v24, v3, v14\n\t" // k02 * r02 out[0][2] + "vfmacc.vv v29, v8, v15\n\t" // k21 * r32 out[3][7] + + "vlw.v v17, (%4)\n\t" // r33 + "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 2 element addr + // ************ + + "vfmacc.vv v28, v9, v15\n\t" // k22 * r32 out[2][8] + "vfmacc.vv v25, v3, v16\n\t" // k02 * r03 out[1][2] + + "vlw.v v10, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v9, v17\n\t" // k22 * r33 out[3][8] + + "vlw.v v11, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v4, v10\n\t" // k10 * r10 out[0][3] + "vfmacc.vv v28, v1, v10\n\t" // k00 * r10 out[2][0] + + "vlw.v v12, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v4, v11\n\t" // k10 * r11 out[1][3] + "vfmacc.vv v29, v1, v11\n\t" // k00 * r11 out[3][0] + "vfmacc.vv v24, v5, v11\n\t" // k11 * r11 out[0][4] + "vfmacc.vv v28, v2, v11\n\t" // k01 * r11 out[2][1] + + "vlw.v v13, (%2)\n\t" // r13 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 2 element addr + // ************ + + "vfmacc.vv v25, v5, v12\n\t" // k11 * r12 out[1][4] + "vfmacc.vv v29, v2, v12\n\t" // k01 * r12 out[3][1] + "vfmacc.vv v24, v6, v12\n\t" // k12 * r12 out[0][4] + "vfmacc.vv v28, v3, v12\n\t" // k02 * r12 out[2][2] + + "vlw.v v14, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v6, v13\n\t" // k12 * r13 out[1][5] + "vfmacc.vv v29, v3, v13\n\t" // k02 * r13 out[3][2] + + "vlw.v v15, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v7, v14\n\t" // k20 * r20 out[0][6] + "vfmacc.vv v28, v4, v14\n\t" // k10 * r20 out[2][3] + + "vlw.v v16, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v7, v15\n\t" // k20 * r21 out[1][6] + "vfmacc.vv v29, v4, v15\n\t" // k10 
* r21 out[3][3] + "vfmacc.vv v24, v8, v15\n\t" // k21 * r21 out[0][7] + "vfmacc.vv v28, v5, v15\n\t" // k11 * r21 out[2][4] + + "vlw.v v17, (%3)\n\t" // r23 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 2 element addr + // ************ + + "vfmacc.vv v25, v8, v16\n\t" // k21 * r22 out[1][7] + "vfmacc.vv v29, v5, v16\n\t" // k11 * r22 out[3][4] + "vfmacc.vv v24, v9, v16\n\t" // k22 * r22 out[0][8] + "vfmacc.vv v28, v6, v16\n\t" // k12 * r22 out[2][5] + + "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] + "vfmacc.vv v29, v6, v17\n\t" // k12 * r23 out[3][5] + + "vfadd.vv v24, v24, v0\n\t" + "vfadd.vv v25, v25, v0\n\t" + "vfadd.vv v28, v28, v0\n\t" + "vfadd.vv v29, v29, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** + "vfmax.vf v25, v25, ft0\n\t" // **** relu **** + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** + "vfmax.vf v29, v29, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" // store outptr[0][0] + "addi %6, %6,16\n\t" + + "vsw.v v25, (%6)\n\t" // store outptr[0][0] + "addi %6, %6, 16\n\t" + + "vsw.v v28, (%7)\n\t" // store outptr[1][0] + "addi %7, %7,16\n\t" + + "vsw.v v29, (%7)\n\t" // store outptr[1][0] + "addi %7, %7, 16\n\t" + + "4:\n\t" // out_w_tail + + "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 + "beqz t2, 5f\n\t" + + // load 12 times, mac 18 times + + "vmv.v.x v24, zero\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v28, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 1 element addr + // ************ + + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + + "vlw.v v13, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + + "vlw.v v14, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v1, v13\n\t" // k00 * 
r10 out[1][0] + "vfmacc.vv v24, v4, v13\n\t" // k10 * r10 out[0][3] + + "vlw.v v15, (%2)\n\t" // r12 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 1 element addr + // ************ + + "vfmacc.vv v28, v2, v14\n\t" // k01 * r11 out[1][1] + "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] + + "vlw.v v16, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v3, v15\n\t" // k02 * r12 out[1][2] + "vfmacc.vv v24, v6, v15\n\t" // k12 * r12 out[0][5] + + "vlw.v v17, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v4, v16\n\t" // k10 * r20 out[1][3] + "vfmacc.vv v24, v7, v16\n\t" // k20 * r20 out[0][6] + + "vlw.v v18, (%3)\n\t" // r22 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 1 element addr + // ************ + + "vfmacc.vv v28, v5, v17\n\t" // k11 * r21 out[1][4] + "vfmacc.vv v24, v8, v17\n\t" // k21 * r21 out[0][7] + + "vlw.v v10, (%4)\n\t" // r30 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v28, v6, v18\n\t" // k12 * r22 out[1][5] + "vfmacc.vv v24, v9, v18\n\t" // k22 * r22 out[0][8] + + "vlw.v v11, (%4)\n\t" // r31 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v28, v7, v10\n\t" // k20 * r30 out[1][6] + "vfadd.vv v24, v24, v0\n\t" // add bias + + "vlw.v v12, (%4)\n\t" // r32 + "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 1 element addr + // ************ + + "vfmacc.vv v28, v8, v11\n\t" // k21 * r31 out[1][7] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" // store outptr[0][0] + "addi %6, %6, 16\n\t" + + "vfmacc.vv v28, v9, v12\n\t" // k22 * r32 out[1][8] + "vfadd.vv v28, v28, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%7)\n\t" // store outptr[1][0] + "addi %7, %7, 16\n\t" + + "5:\n\t" // out_h2_loop cnt + "addi t2, %10, 2\n\t" // in_w + 2 + "slli t2, t2, 4\n\t" // (in_w + 2) * 4 * 4 + "slli t3, %9, 4\n\t" // out_w * 4 * 4 + + "add %1, %1, 
t2\n\t" + "add %2, %2, t2\n\t" + "add %3, %3, t2\n\t" + "add %4, %4, t2\n\t" // r0/r1/r2/r3 += (in_w + 2) * 4 + + "add %6, %6, t3\n\t" + "add %7, %7, t3\n\t" // outprt0/outptr1 += out_w * 4 + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "6:\n\t" // out_h_tail : can only be executed once + + "andi t0, %8, 1\n\t" // t0 = out_h & 1 + "beqz t0, 10f\n\t" + + "srai t1, %9, 2\n\t" // t1 = out_w >> 2 + "beqz t1, 8f\n\t" + + // 在这里先载入第一次执行的rxx, 减少内循环依赖,便于指令流水 + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + // load 18 times, mac 36 次 + "7:\n\t" // out_w4_loop + + "vmv.v.x v24, zero\n\t" + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + "vmv.v.x v25, zero\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v13, (%1)\n\t" // r03 + "addi %1, %1, 16\n\t" + + "vmv.v.x v26, zero\n\t" + + "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] + + "vlw.v v14, (%1)\n\t" // r04 + "addi %1, %1, 16\n\t" + "vmv.v.x v27, zero\n\t" + + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + "vfmacc.vv v26, v1, v12\n\t" // k00 * r02 out[2][0] + + "vlw.v v15, (%1)\n\t" // r05 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 4 elements addr + // ************ + + "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] + "vfmacc.vv v27, v1, v13\n\t" // k00 * r03 out[3][0] + + "vlw.v v16, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v26, v2, v13\n\t" // k01 * r03 out[2][1] + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + "vfmacc.vv v25, v3, v13\n\t" // k02 * r03 out[1][2] + + "vlw.v v17, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v27, v2, v14\n\t" // k01 * r04 out[3][1] + "vfmacc.vv v26, v3, v14\n\t" // k02 * r04 out[2][2] + + "vlw.v v18, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v4, v16\n\t" // k10 * r10 out[0][3] + "vfmacc.vv v27, v3, v15\n\t" // k02 * r05 out[3][2] + + "vlw.v v19, (%2)\n\t" // r13 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, 
v4, v17\n\t" // k10 * r11 out[1][3] + "vfmacc.vv v24, v5, v17\n\t" // k11 * r11 out[0][4] + + "vlw.v v12, (%2)\n\t" // r14 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v26, v4, v18\n\t" // k10 * r12 out[2][3] + "vfmacc.vv v25, v5, v18\n\t" // k12 * r13 out[1][4] + + "vlw.v v13, (%2)\n\t" // r15 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 4 elements addr + // ************ + + "vfmacc.vv v27, v4, v19\n\t" // k10 * r13 out[3][3] + "vfmacc.vv v24, v6, v18\n\t" // k12 * r12 out[0][5] + + "vlw.v v14, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v6, v19\n\t" // k12 * r13 out[1][5] + "vfmacc.vv v26, v5, v19\n\t" // k11 * r13 out[2][4] + "vfmacc.vv v27, v5, v12\n\t" // k11 * r14 out[3][4] + + "vlw.v v15, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v7, v14\n\t" // k20 * r20 out[0][6] + "vfmacc.vv v26, v6, v12\n\t" // k12 * r14 out[2][5] + + "vlw.v v16, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v27, v6, v13\n\t" // k12 * r15 out[3][5] + "vfmacc.vv v25, v7, v15\n\t" // k20 * r21 out[1][6] + + "vlw.v v17, (%3)\n\t" // r23 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v8, v15\n\t" // k21 * r21 out[0][7] + "vfmacc.vv v26, v7, v16\n\t" // k20 * r22 out[2][6] + + "vlw.v v18, (%3)\n\t" // r24 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v8, v16\n\t" // k21 * r22 out[1][7] + "vfmacc.vv v27, v7, v17\n\t" // k20 * r23 out[3][6] + + "vlw.v v19, (%3)\n\t" // r25 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 4 elements addr + // ************ + + "vfmacc.vv v24, v9, v16\n\t" // k22 * r22 out[0][8] + "vfmacc.vv v26, v8, v17\n\t" // k21 * r23 out[2][7] + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vfadd.vv v24, v24, v0\n\t" + + "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] + "vfmacc.vv v27, v8, v18\n\t" // k21 * r24 out[3][7] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" + "addi %6, %6, 16\n\t" // store out0 + + 
"vfadd.vv v25, v25, v0\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v26, v9, v18\n\t" // k22 * r24 out[2][8] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v25, v25, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v25, (%6)\n\t" + "addi %6, %6, 16\n\t" // store out1 + + "vfmacc.vv v27, v9, v19\n\t" // k22 * r25 out[3][8] + + "vfadd.vv v26, v26, v0\n\t" + "vfadd.vv v27, v27, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v26, v26, ft0\n\t" // **** relu **** + "vfmax.vf v27, v27, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v26, (%6)\n\t" + "addi %6, %6, 16\n\t" // store out2 + + "vsw.v v27, (%6)\n\t" + "addi %6, %6, 16\n\t" // store out3 + + "addi t1, t1, -1\n\t" + "bnez t1, 7b\n\t" + + "addi %1, %1, -32\n\t" // r0 -= 8 ********* bump r0 to origin addr + // ************ + + "8:\n\t" // out_w2 + + "andi t1, %9, 3\n\t" // t1 = out_w & 3 + "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 + "beqz t2, 9f\n\t" + + // load 12 times, mac 18 times + + "vmv.v.x v24, zero\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v25, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + + "vlw.v v13, (%1)\n\t" // r03 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 2 elements addr + // ************ + + "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + + "vlw.v v14, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v3, v13\n\t" // k02 * r03 out[1][2] + + "vlw.v v15, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v4, v14\n\t" // k10 * r10 out[0][3] + + "vlw.v v16, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v4, v15\n\t" // k10 * r11 out[1][3] + "vfmacc.vv 
v24, v5, v15\n\t" // k11 * r11 out[0][4] + + "vlw.v v17, (%2)\n\t" // r13 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 2 elements addr + // ************ + + "vfmacc.vv v25, v5, v16\n\t" // k11 * r12 out[1][4] + "vfmacc.vv v24, v6, v16\n\t" // k12 * r12 out[0][5] + + "vlw.v v10, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v6, v17\n\t" // k12 * r13 out[1][5] + + "vlw.v v11, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v7, v10\n\t" // k20 * r20 out[0][6] + + "vlw.v v12, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v7, v11\n\t" // k20 * r21 out[1][6] + "vfmacc.vv v24, v8, v11\n\t" // k21 * r21 out[0][7] + + "vlw.v v13, (%3)\n\t" // r23 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 2 elements addr + // ************ + + "vfmacc.vv v25, v8, v12\n\t" // k21 * r22 out[1][7] + "vfmacc.vv v24, v9, v12\n\t" // k22 * r22 out[0][8] + + "vfmacc.vv v25, v9, v13\n\t" // k22 * r23 out[1][8] + + "vfadd.vv v24, v24, v0\n\t" + "vfadd.vv v25, v25, v0\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** + "vfmax.vf v25, v25, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" + "addi %6, %6, 16\n\t" + + "vsw.v v25, (%6)\n\t" + "addi %6, %6, 16\n\t" + + "9:\n\t" // out_w_tail + "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 + "beqz t2, 10f\n\t" + + // load 9 times, mac 9 times + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v24, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 1 elements addr + // ************ + + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + + "vlw.v v13, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + + "vlw.v v14, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v4, v13\n\t" // k10 * 
r10 out[0][3] + + "vlw.v v15, (%2)\n\t" // r12 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 1 elements addr + // ************ + + "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] + + "vlw.v v16, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v6, v15\n\t" // k12 * r12 out[0][5] + + "vlw.v v17, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v7, v16\n\t" // k20 * r20 out[0][6] + + "vlw.v v18, (%3)\n\t" // r22 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 1 elements addr + // ************ + + "vfmacc.vv v24, v8, v17\n\t" // k21 * r21 out[0][7] + "vfmacc.vv v24, v9, v18\n\t" // k22 * r22 out[0][8] + + "vfadd.vv v24, v24, v0\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" + "addi %6, %6, 16\n\t" + + "10:\n\t" + // update addr + "addi %1, %1, 32\n\t" // r0 += 2 * 4 * 4 + "addi %2, %2, 32\n\t" // r1 += 2 * 4 * 4 + "addi %3, %3, 32\n\t" // r2 += 2 * 4 * 4 + + : "=r"(kernel0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(r3), // %4 + "=r"(bias0), // %5 + "=r"(outptr0), // %6 + "=r"(outptr1), // %7 + "=r"(out_h), // %8 + "=r"(out_w), // %9 + "=r"(in_w) // %10 + : "0"(kernel0), "1"(r0), "2"(r1), "3"(r2), "4"(r3), "5"(bias0), "6"(outptr0), + "7"(outptr1), "8"(out_h), "9"(out_w), "10"(in_w) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31", "ft0", "t0", "t1", "t2", "t3"); + } + return CSINN_TRUE; +} + +/* + (1) Algorithm works as follows: + out_h1_loop: out_h1_w4_loop --> out_h1_wtail + + (2) register definition: + t0: i_out_h + t1: i_out_w + v0: bias_data + v1-v9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] + v10-v20: r00-r08 / r10-r18 / r20-r28 + v28-v31: output_data + + Due to pack4, both kxx and rxx actually occupy a v register + + TODO: how
to pack for input / kernel / bias / output + padding +*/ + +int DWCONV3X3S2_PACK4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int tailstep = (in_w - 2 * out_w + in_w) * 4; + + for (int c = 0; c < in_c / 4; c++) { + float *out = output_data + c * out_h * out_w * 4; + float *outptr0 = out; + + const float *img0 = input_data + c * in_h * in_w * 4; + const float *r0 = img0; + const float *r1 = r0 + in_w * 4; + const float *r2 = r1 + in_w * 4; + + const float *kernel0 = kernel_data + c * 9 * 4; + + const float *bias0 = NULL; + if (bias_data && bias->dim_count != 0) { + bias0 = bias_data + c * 4; + } + + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 + +#ifdef FUSE_CONV_RELU + "fmv.w.x ft0, zero\n\t" +#endif // FUSE_CONV_RELU + + "vmv.v.x v0, zero\n\t" // clear v0 + "beqz %4, 0f\n\t" // if bias_data = NULL clear v0 + "vlw.v v0, (%4)\n\t" + + "0:\n\t" + + "vlw.v v1, (%0)\n\t" // k00 + "addi %0, %0, 16\n\t" // kernel += 4 + "vlw.v v2, (%0)\n\t" // k01 + "addi %0, %0, 16\n\t" + "vlw.v v3, (%0)\n\t" // k02 + "addi %0, %0, 16\n\t" + "vlw.v v4, (%0)\n\t" // k10 + "addi %0, %0, 16\n\t" + "vlw.v v5, (%0)\n\t" // k11 + "addi %0, %0, 16\n\t" + "vlw.v v6, (%0)\n\t" // k12 + "addi %0, %0, 16\n\t" + "vlw.v v7, (%0)\n\t" // k20 + "addi %0, %0, 16\n\t" + "vlw.v v8, (%0)\n\t" // k21 + "addi %0, %0, 16\n\t" + "vlw.v v9, (%0)\n\t" // k22 + + "mv t0, %6\n\t" // i_out_h = out_h + + "1:\n\t" // out_h1_loop + + "srai t1, %7, 2\n\t" // 
t1 = out_w >> 2 + "beqz t1, 3f\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" // r0 += 4 + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "2:\n\t" // out_w4_loop + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" + + "vlw.v v13, (%1)\n\t" // r03 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 out0 + + "vlw.v v14, (%1)\n\t" // r04 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v29, v1, v12\n\t" // k00 * r02 out1 + + "vlw.v v15, (%1)\n\t" // r05 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v2, v11\n\t" // k01 * r01 out0 + + "vlw.v v16, (%1)\n\t" // r06 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v29, v2, v13\n\t" // k01 * r03 out1 + + "vlw.v v17, (%1)\n\t" // r07 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v3, v12\n\t" // k02 * r02 out0 + + "vlw.v v18, (%1)\n\t" // r08 + // "addi %1, %1, 16\n\t" + + "vfmacc.vv v29, v3, v14\n\t" // k02 * r04 out1 + + "vlw.v v10, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v30, v1, v14\n\t" // k00 * r04 out2 + + "vlw.v v11, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v31, v1, v16\n\t" // k00 * r06 out3 + + "vlw.v v12, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v30, v2, v15\n\t" // k01 * r05 out2 + + "vlw.v v13, (%2)\n\t" // r13 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v31, v2, v17\n\t" // k01 * r07 out3 + + "vlw.v v14, (%2)\n\t" // r14 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v30, v3, v16\n\t" // k02 * r06 out2 + + "vlw.v v15, (%2)\n\t" // r15 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v31, v3, v18\n\t" // k02 * r08 out3 + + "vlw.v v16, (%2)\n\t" // r16 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v4, v10\n\t" // k10 * r10 out0 + + "vlw.v v17, (%2)\n\t" // r17 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v4, v12\n\t" // k10 * r12 out1 + + "vlw.v v18, (%2)\n\t" // r18 + // "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v5, v11\n\t" // k11 * r11 out0 + + "vlw.v v10, 
(%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v5, v13\n\t" // k11 * r13 out1 + + "vlw.v v11, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v6, v12\n\t" // k12 * r12 out0 + + "vlw.v v12, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v6, v14\n\t" // k12 * r14 out1 + + "vlw.v v13, (%3)\n\t" // r23 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v30, v4, v14\n\t" // k10 * r14 out2 + + "vlw.v v14, (%3)\n\t" // r24 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v31, v4, v16\n\t" // k10 * r16 out3 + + "vlw.v v19, (%3)\n\t" // r25 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v30, v5, v15\n\t" // k11 * r15 out2 + + "vlw.v v20, (%3)\n\t" // r26 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v31, v5, v17\n\t" // k11 * r17 out3 + + "vlw.v v15, (%3)\n\t" // r27 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v30, v6, v16\n\t" // k12 * r16 out2 + + "vlw.v v16, (%3)\n\t" // r28 + // "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v7, v10\n\t" // k20 * r20 out0 + "vfmacc.vv v31, v6, v18\n\t" // k12 * r18 out3 + + "vlw.v v10, (%1)\n\t" // r00 ******** load r00-r02 for next loop ******* + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v8, v11\n\t" // k21 * r21 out0 + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v9, v12\n\t" // k22 * r22 out0 + "vfmacc.vv v29, v7, v12\n\t" // k20 * r22 out1 + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v29, v8, v13\n\t" // k21 * r23 out1 + "vfmacc.vv v29, v9, v14\n\t" // k22 * r24 out1 + "vfmacc.vv v30, v7, v14\n\t" // k20 * r24 out2 + "vfmacc.vv v31, v7, v20\n\t" // k20 * r26 out3 + "vfmacc.vv v30, v8, v19\n\t" // k21 * r25 out2 + "vfmacc.vv v31, v8, v15\n\t" // k21 * r27 out3 + "vfmacc.vv v30, v9, v20\n\t" // k22 * r26 out2 + "vfmacc.vv v31, v9, v16\n\t" // k22 * r28 out3 + + "vfadd.vv v28, v28, v0\n\t" + "vfadd.vv v29, v29, v0\n\t" + "vfadd.vv v30, v30, v0\n\t" + "vfadd.vv v31, v31, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** + "vfmax.vf v29, v29, 
ft0\n\t" // **** relu **** + "vfmax.vf v30, v30, ft0\n\t" // **** relu **** + "vfmax.vf v31, v31, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "vsw.v v29, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "vsw.v v30, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "vsw.v v31, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "addi t1, t1, -1\n\t" // loop cnt + "bnez t1, 2b\n\t" + + "addi %1, %1, -48\n\t" // r0 -= 12 ********* bump r0 to origin addr + // ************ + + "3:\n\t" // out_w2 : can only be executed once + + "andi t1, %7, 3\n\t" // t1 = out_w & 3 + "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 + "beqz t2, 4f\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v28, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vmv.v.x v29, zero\n\t" + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 out0 + + "vlw.v v13, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v1, v12\n\t" // k00 * r02 out1 + + "vlw.v v14, (%1)\n\t" // r03 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v2, v11\n\t" // k01 * r01 out0 + + "vlw.v v15, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v2, v14\n\t" // k01 * r03 out1 + + "vlw.v v16, (%1)\n\t" // r04 + + "vfmacc.vv v28, v3, v12\n\t" // k02 * r02 out0 + + "vlw.v v17, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v3, v16\n\t" // k02 * r04 out1 + + "vlw.v v18, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v4, v13\n\t" // k10 * r10 out0 + + "vlw.v v19, (%2)\n\t" // r13 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v4, v17\n\t" // k10 * r12 out1 + "vfmacc.vv v28, v6, v17\n\t" // k12 * r12 out0 + + "vlw.v v20, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v5, v19\n\t" // k11 * r13 out1 + "vfmacc.vv v28, v5, v15\n\t" // k11 * r11 out0 + + "vlw.v v10, (%2)\n\t" // r14 + // "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v7, v18\n\t" // k20 * r20 out0 + + "vlw.v 
v11, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v6, v10\n\t" // k12 * r14 out1 + + "vlw.v v12, (%3)\n\t" // r23 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v8, v20\n\t" // k21 * r21 out0 + "vfmacc.vv v29, v7, v11\n\t" // k20 * r22 out1 + + "vlw.v v13, (%3)\n\t" // r24 + // "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v8, v12\n\t" // k21 * r23 out1 + "vfmacc.vv v28, v9, v11\n\t" // k22 * r22 out0 + "vfmacc.vv v29, v9, v13\n\t" // k22 * r24 out1 + + "vfadd.vv v28, v28, v0\n\t" + "vfadd.vv v29, v29, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** + "vfmax.vf v29, v29, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "vsw.v v29, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "4:\n\t" // out_w_tail : can only be executed once + "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 + "beqz t2, 5f\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v28, zero\n\t" + + "vlw.v v11, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 + + "vlw.v v12, (%3)\n\t" + "addi %3, %3, 16\n\t" // r20 + + "vfmacc.vv v28, v4, v11\n\t" // k10 * r10 + + "vlw.v v13, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v7, v12\n\t" // k20 * r20 + + "vlw.v v14, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v2, v13\n\t" // k01 * r01 + + "vlw.v v15, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v5, v14\n\t" // k11 * r11 + + "vlw.v v16, (%1)\n\t" // r02 + + "vfmacc.vv v28, v8, v15\n\t" // k21 * r21 + + "vlw.v v17, (%2)\n\t" // r12 + + "vfmacc.vv v28, v3, v16\n\t" // k02 * r02 + + "vlw.v v18, (%3)\n\t" // r22 + + "vfmacc.vv v28, v6, v17\n\t" // k12 * r12 + "vfmacc.vv v28, v9, v18\n\t" // k22 * r22 + + "vfadd.vv v28, v28, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "5:\n\t" + + 
"slli t2, %8, 2\n\t" // t2 = tailstep * 4 + "add %1, %1, t2\n\t" + "add %2, %2, t2\n\t" + "add %3, %3, t2\n\t" // r0/r1/r2 += tailstep + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + : "=r"(kernel0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(bias0), // %4 + "=r"(outptr0), // %5 + "=r"(out_h), // %6 + "=r"(out_w), // %7 + "=r"(tailstep) // %8 + : "0"(kernel0), "1"(r0), "2"(r1), "3"(r2), "4"(bias0), "5"(outptr0), "6"(out_h), + "7"(out_w), "8"(tailstep) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v28", "v29", + "v30", "v31", "ft0", "t0", "t1", "t2" + + ); + } + return CSINN_TRUE; +} diff --git a/source/c906_opt/depthwise_convolution_3x3_pack8_fp16.c b/source/c906_opt/depthwise_convolution_3x3_pack8_fp16.c index a638ee78..68296948 100644 --- a/source/c906_opt/depthwise_convolution_3x3_pack8_fp16.c +++ b/source/c906_opt/depthwise_convolution_3x3_pack8_fp16.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" /************************************************************************************************************ c906 vlen = 128, 128/16 = 8 --> pack8, if vlen = 256 256/16 = 16 --> pack16 @@ -55,11 +54,9 @@ */ -int csi_c906_dwconv3x3s1_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_dwconv3x3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -953,11 +950,9 @@ int csi_c906_dwconv3x3s1_pack8_fp16(struct csi_tensor *input, TODO: how to pack for input / kernel / bias / output padding */ -int csi_c906_dwconv3x3s2_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_dwconv3x3s2_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/depthwise_convolution_5x5.c b/source/c906_opt/depthwise_convolution_5x5_fp32.c similarity index 56% rename from source/c906_opt/depthwise_convolution_5x5.c rename to source/c906_opt/depthwise_convolution_5x5_fp32.c index 6805d967..bb0fa483 100644 --- a/source/c906_opt/depthwise_convolution_5x5.c +++ b/source/c906_opt/depthwise_convolution_5x5_fp32.c @@ -16,28 +16,25 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" #ifndef DWCONV5X5S1 -#define DWCONV5X5S1 csi_c906_dwconv5x5s1 +#define DWCONV5X5S1 shl_c906_dwconv5x5s1 #endif #ifndef DWCONV5X5S2 -#define DWCONV5X5S2 csi_c906_dwconv5x5s2 +#define DWCONV5X5S2 shl_c906_dwconv5x5s2 #endif - /* TODO: support channel mult ?? rvv optimization */ -int DWCONV5X5S1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int DWCONV5X5S1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -45,7 +42,7 @@ int DWCONV5X5S1(struct csi_tensor *input, float *bias_data = (float *)bias->data; int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_c = input->dim[1]; // group = in_channel int32_t in_h = input->dim[2]; int32_t in_w = input->dim[3]; @@ -53,9 +50,13 @@ int DWCONV5X5S1(struct csi_tensor *input, int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - float *input_padd_buf = (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + shl_c906_pad_input( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); in_h = in_h + params->pad_top + params->pad_down; 
in_w = in_w + params->pad_left + params->pad_right; @@ -83,31 +84,39 @@ int DWCONV5X5S1(struct csi_tensor *input, const float *k3 = k2 + 5; const float *k4 = k3 + 5; - int h = 0; - for (; h + 1 < out_h; h += 2) - { + for (; h + 1 < out_h; h += 2) { for (int w = 0; w < out_w; w++) { float sum0 = bias0; float sum1 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; + sum0 += + r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; - sum1 += r1[0] * k0[0] + r1[1] * k0[1] + r1[2] * k0[2] + r1[3] * k0[3] + r1[4] * k0[4]; + sum0 += + r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; + sum1 += + r1[0] * k0[0] + r1[1] * k0[1] + r1[2] * k0[2] + r1[3] * k0[3] + r1[4] * k0[4]; - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; - sum1 += r2[0] * k1[0] + r2[1] * k1[1] + r2[2] * k1[2] + r2[3] * k1[3] + r2[4] * k1[4]; + sum0 += + r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; + sum1 += + r2[0] * k1[0] + r2[1] * k1[1] + r2[2] * k1[2] + r2[3] * k1[3] + r2[4] * k1[4]; - sum0 += r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; - sum1 += r3[0] * k2[0] + r3[1] * k2[1] + r3[2] * k2[2] + r3[3] * k2[3] + r3[4] * k2[4]; + sum0 += + r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; + sum1 += + r3[0] * k2[0] + r3[1] * k2[1] + r3[2] * k2[2] + r3[3] * k2[3] + r3[4] * k2[4]; - sum0 += r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; - sum1 += r4[0] * k3[0] + r4[1] * k3[1] + r4[2] * k3[2] + r4[3] * k3[3] + r4[4] * k3[4]; + sum0 += + r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; + sum1 += + r4[0] * k3[0] + r4[1] * k3[1] + r4[2] * k3[2] + r4[3] * k3[3] + r4[4] * k3[4]; - sum1 += r5[0] * k4[0] + r5[1] * k4[1] + r5[2] * k4[2] 
+ r5[3] * k4[3] + r5[4] * k4[4]; + sum1 += + r5[0] * k4[0] + r5[1] * k4[1] + r5[2] * k4[2] + r5[3] * k4[3] + r5[4] * k4[4]; -#ifdef FUSE_CONV_RELU +#ifdef FUSE_CONV_RELU sum0 = sum0 > 0 ? sum0 : 0; sum1 = sum1 > 0 ? sum1 : 0; #endif // FUSE_CONV_RELU @@ -124,7 +133,7 @@ int DWCONV5X5S1(struct csi_tensor *input, outptr0++; outptr1++; } - r0 += 4 + in_w; // jump to next line + r0 += 4 + in_w; // jump to next line r1 += 4 + in_w; r2 += 4 + in_w; r3 += 4 + in_w; @@ -138,13 +147,18 @@ int DWCONV5X5S1(struct csi_tensor *input, for (; h < out_h; h++) { for (int w = 0; w < out_w; w++) { float sum0 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; - sum0 += r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; - sum0 += r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; - -#ifdef FUSE_CONV_RELU + sum0 += + r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; + sum0 += + r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; + sum0 += + r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; + sum0 += + r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; + sum0 += + r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; + +#ifdef FUSE_CONV_RELU sum0 = sum0 > 0 ? sum0 : 0; #endif // FUSE_CONV_RELU @@ -165,21 +179,18 @@ int DWCONV5X5S1(struct csi_tensor *input, } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } - /* TODO: support channel mult ?? 
rvv optimization */ -int DWCONV5X5S2(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int DWCONV5X5S2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -187,7 +198,7 @@ int DWCONV5X5S2(struct csi_tensor *input, float *bias_data = (float *)bias->data; int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_c = input->dim[1]; // group = in_channel int32_t in_h = input->dim[2]; int32_t in_w = input->dim[3]; @@ -195,9 +206,13 @@ int DWCONV5X5S2(struct csi_tensor *input, int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - float *input_padd_buf = (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + shl_c906_pad_input( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -231,13 +246,18 @@ int DWCONV5X5S2(struct csi_tensor *input, for (int w = 0; w < out_w; w++) { float sum0 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; - sum0 += 
r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; - sum0 += r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; - sum0 += r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; - -#ifdef FUSE_CONV_RELU + sum0 += + r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; + sum0 += + r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; + sum0 += + r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; + sum0 += + r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; + sum0 += + r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; + +#ifdef FUSE_CONV_RELU sum0 = sum0 > 0 ? sum0 : 0; #endif // FUSE_CONV_RELU @@ -258,6 +278,6 @@ int DWCONV5X5S2(struct csi_tensor *input, } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/c906_opt/depthwise_convolution_fp16.c b/source/c906_opt/depthwise_convolution_fp16.c index c591db60..fc673f74 100644 --- a/source/c906_opt/depthwise_convolution_fp16.c +++ b/source/c906_opt/depthwise_convolution_fp16.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_dwconv2d_s1_pad0_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_dwconv2d_s1_pad0_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -36,10 +36,10 @@ int csi_c906_dwconv2d_s1_pad0_fp16(struct csi_tensor *input, struct csi_tensor * const int32_t output_depth = output->dim[1]; const int32_t input_height = input->dim[2]; const int32_t input_width = input->dim[3]; - const int32_t filter_height = kernel->dim[2]; + const int32_t filter_height = kernel->dim[2]; const int32_t filter_width = kernel->dim[3]; - const int32_t output_height = output->dim[2]; - const int32_t output_width = output->dim[3]; // input_depth = output_depth; + const int32_t output_height = output->dim[2]; + const int32_t output_width = output->dim[3]; // input_depth = output_depth; for (int32_t b = 0; b < batches; ++b) { int output_dim_pos = 0; diff --git a/source/c906_opt/depthwise_convolution_relu_5x5.c b/source/c906_opt/depthwise_convolution_relu_3x3_fp32.c similarity index 79% rename from source/c906_opt/depthwise_convolution_relu_5x5.c rename to source/c906_opt/depthwise_convolution_relu_3x3_fp32.c index 106becd9..ca3c5d1d 100644 --- a/source/c906_opt/depthwise_convolution_relu_5x5.c +++ b/source/c906_opt/depthwise_convolution_relu_3x3_fp32.c @@ -16,11 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#define DWCONV5X5S1 csi_c906_dwconv5x5s1_fuse_relu -#define DWCONV5X5S2 csi_c906_dwconv5x5s2_fuse_relu +#define DWCONV3X3S1 shl_c906_dwconv3x3s1_fuse_relu +#define DWCONV3X3S2 shl_c906_dwconv3x3s2_fuse_relu #define FUSE_CONV_RELU -#include "./depthwise_convolution_5x5.c" +#include "./depthwise_convolution_3x3_fp32.c" diff --git a/source/c906_opt/depthwise_convolution_relu_3x3_pack4.c b/source/c906_opt/depthwise_convolution_relu_3x3_pack4_fp32.c similarity index 77% rename from source/c906_opt/depthwise_convolution_relu_3x3_pack4.c rename to source/c906_opt/depthwise_convolution_relu_3x3_pack4_fp32.c index 64001ad2..3ab9dd69 100644 --- a/source/c906_opt/depthwise_convolution_relu_3x3_pack4.c +++ b/source/c906_opt/depthwise_convolution_relu_3x3_pack4_fp32.c @@ -16,12 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#define DWCONV3X3S1_PACK4 csi_c906_dwconv3x3s1_pack4_fuse_relu -#define DWCONV3X3S2_PACK4 csi_c906_dwconv3x3s2_pack4_fuse_relu +#define DWCONV3X3S1_PACK4 shl_c906_dwconv3x3s1_pack4_fuse_relu +#define DWCONV3X3S2_PACK4 shl_c906_dwconv3x3s2_pack4_fuse_relu #define FUSE_CONV_RELU - -#include "./depthwise_convolution_3x3_pack4.c" +#include "./depthwise_convolution_3x3_pack4_fp32.c" diff --git a/source/c906_opt/depthwise_convolution_relu_3x3.c b/source/c906_opt/depthwise_convolution_relu_5x5_fp32.c similarity index 79% rename from source/c906_opt/depthwise_convolution_relu_3x3.c rename to source/c906_opt/depthwise_convolution_relu_5x5_fp32.c index 8f7ba794..1cea21d4 100644 --- a/source/c906_opt/depthwise_convolution_relu_3x3.c +++ b/source/c906_opt/depthwise_convolution_relu_5x5_fp32.c @@ -16,12 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#define DWCONV3X3S1 csi_c906_dwconv3x3s1_fuse_relu -#define DWCONV3X3S2 csi_c906_dwconv3x3s2_fuse_relu +#define DWCONV5X5S1 shl_c906_dwconv5x5s1_fuse_relu +#define DWCONV5X5S2 shl_c906_dwconv5x5s2_fuse_relu #define FUSE_CONV_RELU - -#include "./depthwise_convolution_3x3.c" +#include "./depthwise_convolution_5x5_fp32.c" diff --git a/source/c906_opt/div.c b/source/c906_opt/div.c index bbfd9fbe..7eec6604 100644 --- a/source/c906_opt/div.c +++ b/source/c906_opt/div.c @@ -16,26 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" -int csi_c906_div_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +#include "shl_c906.h" +int shl_c906_div_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { + struct csinn_callback *cb = params->base.cb; if (input1->dtype == CSINN_DTYPE_FLOAT32) { float *ptr = input1->data; - size_t tensor_size = csi_tensor_size(input1); + size_t tensor_size = csinn_tensor_size(input1); for (size_t i = 0; i < tensor_size; i++) { ptr[i] = 1.f / ptr[i]; } - params->base.bc = csi_c906_mul_f32; + cb->exec = shl_c906_mul_f32; } else if (input1->dtype == CSINN_DTYPE_FLOAT16) { __fp16 *ptr = input1->data; - size_t tensor_size = csi_tensor_size(input1); + size_t tensor_size = csinn_tensor_size(input1); for (size_t i = 0; i < tensor_size; i++) { ptr[i] = 1.f / ptr[i]; } - params->base.bc = csi_c906_mul_fp16; + cb->exec = shl_c906_mul_fp16; } return CSINN_TRUE; } diff --git a/source/c906_opt/fullyconnected.c b/source/c906_opt/fullyconnected.c index 51345b0a..0dd88356 100644 --- a/source/c906_opt/fullyconnected.c +++ b/source/c906_opt/fullyconnected.c @@ -16,14 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* change memory layout for weight matrix [out_nodes * in_nodes] by N(8) shape */ -void csi_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) +void shl_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) { int i = 0; for (; i + 7 < m; i += 8) { @@ -41,13 +41,13 @@ void csi_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int dst += i * k; src += i * k; for (; i < m; i++) { - csi_c906_memcpy(dst, src, sizeof(__fp16) * ldx); + shl_c906_memcpy(dst, src, sizeof(__fp16) * ldx); dst += k; src += k; } } -void csi_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) +void shl_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) { int i = 0; for (; i + 15 < m; i += 16) { @@ -74,32 +74,28 @@ void csi_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, in dst += i * k; src += i * k; for (; i < m; i++) { - csi_c906_memcpy(dst, src, sizeof(__fp16) * ldx); + shl_c906_memcpy(dst, src, sizeof(__fp16) * ldx); dst += k; src += k; } } - -void csi_c906_fc_gemv_transform_weight_fp16(struct csi_tensor *weights) +void shl_c906_fc_gemv_transform_weight_fp16(struct csinn_tensor *weights) { __fp16 *weight_data = (__fp16 *)weights->data; int n = weights->dim[0]; // out_nodes int k = weights->dim[1]; // in_nodes - __fp16* pa_reorder = (__fp16 *)csi_mem_alloc(n * k * sizeof(__fp16)); - csi_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(n * k * sizeof(__fp16)); + shl_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } - -int csi_c906_fullyconnected_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - 
struct fc_params *params) +int shl_c906_fullyconnected_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -178,11 +174,9 @@ int csi_c906_fullyconnected_f32(struct csi_tensor *input, return CSINN_TRUE; } -int csi_c906_fullyconnected_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -436,11 +430,9 @@ int csi_c906_fullyconnected_fp16(struct csi_tensor *input, best implementation from the software perspective loop unroll: k = 8 */ -int csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -460,7 +452,7 @@ int csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, bool flag_bias = 1; // default: fc layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(output_depth * 2); + bias_data = (__fp16 *)shl_mem_alloc(output_depth * 2); } for (int b = 0; b < batches; b++) { @@ -686,7 +678,7 @@ int csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } @@ -696,11 +688,9 @@ int 
csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, /* loop unroll: k = 1 */ -int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_pack8_fp16_1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -720,7 +710,7 @@ int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, bool flag_bias = 1; // default: fc layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(output_depth * 2); + bias_data = (__fp16 *)shl_mem_alloc(output_depth * 2); } for (int b = 0; b < batches; b++) { @@ -834,7 +824,7 @@ int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } @@ -846,11 +836,9 @@ int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, best performance measured on D1 loop unroll: k = 1 && pack16 */ -int csi_c906_fullyconnected_pack16_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -870,7 +858,7 @@ int csi_c906_fullyconnected_pack16_fp16(struct csi_tensor *input, bool flag_bias = 1; // default: fc layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(output_depth * 2); + bias_data = (__fp16 *)shl_mem_alloc(output_depth * 2); } for (int b = 0; b < 
batches; b++) { @@ -983,16 +971,17 @@ int csi_c906_fullyconnected_pack16_fp16(struct csi_tensor *input, } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } -int csi_c906_fullyconnected_pack16_output16_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, struct fc_params *params) +int shl_c906_fullyconnected_pack16_output16_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *weights, + struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -1121,33 +1110,32 @@ int csi_c906_fullyconnected_pack16_output16_fp16(struct csi_tensor *input, return CSINN_TRUE; } -int csi_c906_fullyconnected_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { + struct csinn_callback *cb = params->base.cb; if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_nn_rvv_fc_gemv_transform_weight_fp32(weights); - params->base.bc = csi_nn_rvv_fullyconnected_packn_fp32; + shl_rvv_fc_gemv_transform_weight_fp32(weights); + cb->exec = shl_rvv_fullyconnected_packn_fp32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_c906_fc_gemv_transform_weight_fp16(weights); + shl_c906_fc_gemv_transform_weight_fp16(weights); int output_depth = weights->dim[weights->dim_count - 2]; if (bias != NULL && output_depth % 16 == 0) { - params->base.bc = csi_c906_fullyconnected_pack16_output16_fp16; + cb->exec = shl_c906_fullyconnected_pack16_output16_fp16; } else { - params->base.bc = csi_c906_fullyconnected_pack16_fp16; + cb->exec = shl_c906_fullyconnected_pack16_fp16; } - // 
params->base.bc = csi_c906_fullyconnected_fp16; + // cb->exec = shl_c906_fullyconnected_fp16; } else if (input->dtype == CSINN_DTYPE_INT8) { - csi_nn_rvv_fc_gemv_transform_weight_int8(weights); + shl_rvv_fc_gemv_transform_weight_int8(weights); // support channel quantization for (int i = 0; i < weights->quant_channel; i++) { float real_scale = input->qinfo->scale * weights->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), + shl_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), &(weights->qinfo[i].shift)); } - params->base.bc = csi_nn_rvv_fullyconnected_packn_int8; + cb->exec = shl_rvv_fullyconnected_packn_int8; } return CSINN_TRUE; } diff --git a/source/c906_opt/gather.c b/source/c906_opt/gather.c index 77791cf2..50582cc2 100644 --- a/source/c906_opt/gather.c +++ b/source/c906_opt/gather.c @@ -16,14 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_gather_fp16(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params) +int shl_c906_gather_fp16(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -45,8 +43,8 @@ int csi_c906_gather_fp16(struct csi_tensor *input, for (int i = 0; i < outer_size; i++) { for (int j = 0; j < indices_size; j++) { if (indices_data[j] < input->dim[params->axis]) { - csi_c906_memcpy(output_data, input_data + indices_data[j] * inner_size, - inner_size * sizeof(__fp16)); + shl_c906_memcpy(output_data, input_data + indices_data[j] * inner_size, + inner_size * sizeof(__fp16)); } else { memset(output_data, 0, inner_size * sizeof(__fp16)); } @@ -56,4 +54,3 @@ int csi_c906_gather_fp16(struct csi_tensor *input, } return CSINN_TRUE; } - diff 
--git a/source/c906_opt/gemm_fp16.c b/source/c906_opt/gemm_fp16.c index 1ae23b82..ec4493cf 100644 --- a/source/c906_opt/gemm_fp16.c +++ b/source/c906_opt/gemm_fp16.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* (1) Algorithm works as follows: @@ -50,10 +50,10 @@ a0-a7: 8 rows addr for load v0-v14: memcpy load / store v reg - notice: called in the initialization function (csi_c906_conv2d_init) + notice: called in the initialization function (shl_c906_conv2d_init) */ -void csi_c906_reorder_kernel_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) +void shl_c906_reorder_kernel_fp16(__fp16* a, __fp16* sa, int m, int k, int ldx) { asm volatile( @@ -382,7 +382,7 @@ void csi_c906_reorder_kernel_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) */ -void csi_c906_reorder_input_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +void shl_c906_reorder_input_fp16(__fp16* b, __fp16* sb, int k, int n, int ldx) { asm volatile( @@ -553,8 +553,7 @@ void csi_c906_reorder_input_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) ); } - -void csi_c906_reorder_input_fp16_1(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +void shl_c906_reorder_input_fp16_1(__fp16* b, __fp16* sb, int k, int n, int ldx) { asm volatile( "vsetvli zero, zero, e16, m1\n\t" // set vl = 8 @@ -662,7 +661,8 @@ void csi_c906_reorder_input_fp16_1(__fp16 *b, __fp16 *sb, int k, int n, int ldx) TODO: if bias == NULL */ -static void kernel_m1_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void kernel_m1_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( @@ -1069,7 +1069,8 @@ static void kernel_m1_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, in TODO: if bias == NULL */ -static void kernel_m2_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void 
kernel_m2_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( @@ -1598,7 +1599,8 @@ static void kernel_m2_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, in TODO: if bias == NULL */ -static void kernel_m4_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void kernel_m4_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( @@ -2460,7 +2462,8 @@ static void kernel_m4_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, in TODO: if bias == NULL */ -static void kernel_m8_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void kernel_m8_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( @@ -3436,8 +3439,8 @@ static void kernel_m8_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, in } - -static void kernel_m8_fp16_1(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void kernel_m8_fp16_1(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( "vsetvli zero, zero, e16, m1\n\t" // set vl = 8 @@ -3689,8 +3692,8 @@ static void kernel_m8_fp16_1(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, } - -void csi_c906_sgemm_kernel_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +void shl_c906_sgemm_kernel_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int m, int k, + int n, int ldc, __fp16* bias) { __fp16* pa = (__fp16 *)sa; __fp16* pb = (__fp16 *)sb; @@ -3699,7 +3702,7 @@ void csi_c906_sgemm_kernel_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, bool flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (__fp16 *)csi_mem_alloc(m * 2); + bias = (__fp16*)shl_mem_alloc(m * 2); } __fp16 *bias_tmp = bias; @@ -3768,7 +3771,7 @@ void 
csi_c906_sgemm_kernel_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, break; } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } diff --git a/source/c906_opt/gemm_fp32.c b/source/c906_opt/gemm_fp32.c new file mode 100644 index 00000000..c0a4d543 --- /dev/null +++ b/source/c906_opt/gemm_fp32.c @@ -0,0 +1,3459 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c906.h" + +/* The matrices are stored in row-major order */ +#define A(i, j) a[(i)*lda + (j)] +#define B(i, j) b[(i)*ldb + (j)] +#define C(i, j) c[(i)*ldc + (j)] + +#define DECOMPOSE_K \ + int ktmp = k; \ + int k8 = k >> 3; \ + k -= (k8 << 3); \ + int k4 = k >> 2; \ + k -= (k4 << 2); \ + int k2 = k >> 1; \ + k -= (k2 << 1); \ + int k1 = k; \ + k = ktmp; + +#define DECOMPOSE_N \ + int ntmp = n; \ + int n4 = n >> 2; \ + n -= (n4 << 2); \ + int n2 = n >> 1; \ + n -= (n2 << 1); \ + int n1 = n; \ + n = ntmp; + +#define DECOMPOSE_M \ + int mtmp = m; \ + int m4 = m >> 2; \ + m -= (m4 << 2); \ + int m2 = m >> 1; \ + m -= (m2 << 1); \ + int m1 = m; \ + m = mtmp; + +/* + change memory layout for matrix A (kernel matrix) + memory index from ------> to + 0 1 2 3 0 4 8 12 + 4 5 6 7 1 5 9 13 + 8 9 10 11 2 6 10 14 + 12 13 14 15 3 7 11 15 + 16 17 18 19 16 18 20 22 + 20 21 22 23 17 19 21 23 + 24 25 26 27 24 25 26 27 + + notice: called in the initialization function (shl_c906_conv2d_init) +*/ +void shl_c906_reorder_kernel(float *a, float *sa, int m, int k, int ldx) +{ +#if __riscv_vector == 128 + DECOMPOSE_M + DECOMPOSE_K + /* + Execution delay cycles: vlsw + vsw = 6 + 1 + vlw + vssw = 4 + 2 ✔ + */ + if (m4 > 0) { + float *a0 = a; + float *a1 = a0 + ldx; + float *a2 = a1 + ldx; + float *a3 = a2 + ldx; + int k_tail = k & 7; + int store_stride = 16; + asm volatile( + "slli t3, %10, 2\n\t" // t3 = ldx * 4 + "slli t4, t3, 2\n\t" // t4 = 4 * ldx * 4 + "mv t2, %5\n\t" // t2 = m4 + "slli t0, %7, 2\n\t" // t0 = k_tail * 4 + "slli t1, t0, 2\n\t" // t1 = t0 * 4 + + "1:\n\t" + // start packm4 + "mv %0, %9\n\t" // a0 = a + "add %1, %0, t3\n\t" // a1 = a0 + 4 * ldx + "add %2, %1, t3\n\t" // a2 = a1 + 4 * ldx + "add %3, %2, t3\n\t" // a3 = a2 + 4 * ldx + "mv t6, %6\n\t" // t6 = k8 + "beqz t6, 3f\n\t" // k8 == 0 ? 
+ "vsetvli zero, zero, e32, m2\n\t" + + "2:\n\t" + // start subpack_m4k8 + "vlw.v v0, (%0)\n\t" + "addi %0, %0, 32\n\t" + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 32\n\t" + "vlw.v v4, (%2)\n\t" + "addi %2, %2, 32\n\t" + "vlw.v v6, (%3)\n\t" + "addi %3, %3, 32\n\t" + + "vssw.v v0, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v2, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v4, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v6, (%4), %8\n\t" + "addi %4, %4, 116\n\t" // sa += 32 ele * 4 + + "addi t6, t6, -1\n\t" // k8-- + "bnez t6, 2b\n\t" + + "3:\n\t" + "beqz %7, 4f\n\t" // k_tail == 0 ? + // Processing k_tail + "vsetvli zero, %7, e32, m2\n\t" + "vlw.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + "vlw.v v2, (%1)\n\t" + "add %1, %1, t0\n\t" + "vlw.v v4, (%2)\n\t" + "add %2, %2, t0\n\t" + "vlw.v v6, (%3)\n\t" + "add %3, %3, t0\n\t" + + "vssw.v v0, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v2, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v4, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v6, (%4), %8\n\t" + "addi %4, %4, -12\n\t" + "add %4, %4, t1\n\t" // sa += 4 * k_tail * 4 + + "4:\n\t" + // end packm4 + "add %9, %9, t4\n\t" // a += 4 * ldx * 4 + "addi t2, t2, -1\n\t" // m4-- + "bnez t2, 1b\n\t" + + : "=r"(a0), // %0 + "=r"(a1), // %1 + "=r"(a2), // %2 + "=r"(a3), // %3 + "=r"(sa), // %4 + "=r"(m4), // %5 + "=r"(k8), // %6 + "=r"(k_tail), // %7 + "=r"(store_stride), // %8 + "=r"(a), // %9 + "=r"(ldx) // %10 + : "0"(a0), "1"(a1), "2"(a2), "3"(a3), "4"(sa), "5"(m4), "6"(k8), "7"(k_tail), + "8"(store_stride), "9"(a), "10"(ldx) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "t0", "t1", "t2", "t3", "t4", "t6"); + } + if (m2 > 0) { + float *a0 = a; + float *a1 = a0 + ldx; + int k8 = k >> 3; + int k_tail = k & 7; + int store_stride = 8; + + asm volatile( + "slli t2, %7, 3\n\t" // t2 = ldx * 2 * 4 + "slli t0, %4, 2\n\t" // t0 = k_tail * 4 + "slli t1, t0, 1\n\t" // t1 = t0 * 2 + "beqz %3, 2f\n\t" // k8 == 0 ? 
+ "vsetvli zero, zero, e32, m2\n\t" + + "1:\n\t" + // start subpack_m2k8 + "vlw.v v0, (%0)\n\t" + "addi %0, %0, 32\n\t" + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 32\n\t" + + "vssw.v v0, (%2), %5\n\t" + "addi %2, %2, 4\n\t" + "vssw.v v2, (%2), %5\n\t" + "addi %2, %2, -4\n\t" + "addi %2, %2, 64\n\t" // sa += 16 ele * 4 + + "addi %3, %3, -1\n\t" + "bnez %3, 1b\n\t" + + "2:\n\t" + "beqz %4, 3f\n\t" // k_tail == 0 ? + // Processing k_tail + "vsetvli zero, %4, e32, m2\n\t" + "vlw.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + "vlw.v v2, (%1)\n\t" + "add %1, %1, t0\n\t" + + "vssw.v v0, (%2), %5\n\t" + "addi %2, %2, 4\n\t" + "vssw.v v2, (%2), %5\n\t" + "addi %2, %2, -4\n\t" + "add %2, %2, t1\n\t" // sa += k_tail * 2 * 4 + + "3:\n\t" + // end packm2 + "add %6, %6, t2\n\t" + + : "=r"(a0), // %0 + "=r"(a1), // %1 + "=r"(sa), // %2 + "=r"(k8), // %3 + "=r"(k_tail), // %4 + "=r"(store_stride), // %5 + "=r"(a), // %6 + "=r"(ldx) // %7 + : "0"(a0), "1"(a1), "2"(sa), "3"(k8), "4"(k_tail), "5"(store_stride), "6"(a), "7"(ldx) + : "v0", "v1", "v2", "v3", "t0", "t1", "t2"); + } + if (m1 > 0) { + memcpy(sa, a, sizeof(float) * ldx); + } +#else + int i = 0; + for (; i + 3 < m; i += 4) { + float *p0 = a; + float *p1 = a + ldx; + float *p2 = a + 2 * ldx; + float *p3 = a + 3 * ldx; + int j = 0; + for (; j + 7 < k; j += 8) { + sa[0] = p0[0]; + sa[16] = p0[4]; + sa[1] = p1[0]; + sa[17] = p1[4]; + sa[2] = p2[0]; + sa[18] = p2[4]; + sa[3] = p3[0]; + sa[19] = p3[4]; + + sa[4] = p0[1]; + sa[20] = p0[5]; + sa[5] = p1[1]; + sa[21] = p1[5]; + sa[6] = p2[1]; + sa[22] = p2[5]; + sa[7] = p3[1]; + sa[23] = p3[5]; + + sa[8] = p0[2]; + sa[24] = p0[6]; + sa[9] = p1[2]; + sa[25] = p1[6]; + sa[10] = p2[2]; + sa[26] = p2[6]; + sa[11] = p3[2]; + sa[27] = p3[6]; + + sa[12] = p0[3]; + sa[28] = p0[7]; + sa[13] = p1[3]; + sa[29] = p1[7]; + sa[14] = p2[3]; + sa[30] = p2[7]; + sa[15] = p3[3]; + sa[31] = p3[7]; + + sa += 32; + p0 += 8; + p1 += 8; + p2 += 8; + p3 += 8; + } + if (j + 3 < k) { + j += 4; + sa[0] = p0[0]; + sa[8] 
= p0[2]; + sa[1] = p1[0]; + sa[9] = p1[2]; + sa[2] = p2[0]; + sa[10] = p2[2]; + sa[3] = p3[0]; + sa[11] = p3[2]; + + sa[4] = p0[1]; + sa[12] = p0[3]; + sa[5] = p1[1]; + sa[13] = p1[3]; + sa[6] = p2[1]; + sa[14] = p2[3]; + sa[7] = p3[1]; + sa[15] = p3[3]; + + sa += 16; + p0 += 4; + p1 += 4; + p2 += 4; + p3 += 4; + } + if (j + 1 < k) { + j += 2; + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p2[0]; + sa[3] = p3[0]; + + sa[4] = p0[1]; + sa[5] = p1[1]; + sa[6] = p2[1]; + sa[7] = p3[1]; + + sa += 8; + p0 += 2; + p1 += 2; + p2 += 2; + p3 += 2; + } + if (j < k) { + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p2[0]; + sa[3] = p3[0]; + + sa += 4; + } + a += 4 * ldx; + } + if (i + 1 < m) { + i += 2; + float *p0 = a; + float *p1 = a + ldx; + + int j = 0; + for (; j + 7 < k; j += 8) { + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p0[1]; + sa[3] = p1[1]; + sa[4] = p0[2]; + sa[5] = p1[2]; + sa[6] = p0[3]; + sa[7] = p1[3]; + sa[8] = p0[4]; + sa[9] = p1[4]; + sa[10] = p0[5]; + sa[11] = p1[5]; + sa[12] = p0[6]; + sa[13] = p1[6]; + sa[14] = p0[7]; + sa[15] = p1[7]; + + sa += 16; + p0 += 8; + p1 += 8; + } + if (j + 3 < k) { + j += 4; + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p0[1]; + sa[3] = p1[1]; + sa[4] = p0[2]; + sa[5] = p1[2]; + sa[6] = p0[3]; + sa[7] = p1[3]; + + sa += 8; + p0 += 4; + p1 += 4; + } + if (j + 1 < k) { + j += 2; + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p0[1]; + sa[3] = p1[1]; + + sa += 4; + p0 += 2; + p1 += 2; + } + if (j < k) { + sa[0] = p0[0]; + sa[1] = p1[0]; + + sa += 2; + } + a += 2 * ldx; + } + if (i < m) { + memcpy(sa, a, sizeof(float) * ldx); + } +#endif // __riscv_vector +} + +void shl_c906_reorder_input(float *b, float *sb, int k, int n, int ldx) +{ +#if __riscv_vector == 128 + DECOMPOSE_N + DECOMPOSE_K + if (n4 > 0) { + float *b0 = b; + float *b1 = b0 + 1; + float *b2 = b1 + 1; + float *b3 = b2 + 1; + int k_tail = k & 7; + int load_stride = 4 * ldx; + int store_stride = 16; + asm volatile( + "slli t0, %11, 5\n\t" // t0 = 8 * ldx * 4 + "slli t1, %7, 4\n\t" 
// t1 = 4 * k_tail * 4 + + "1:\n\t" + // start packn4 + "mv %0, %10\n\t" // b0 = b + "addi %1, %0, 4\n\t" // b1 = b0 + 1 + "addi %2, %1, 4\n\t" // b2 = b1 + 1 + "addi %3, %2, 4\n\t" // b3 = b2 + 1 + "mv t6, %6\n\t" // t6 = k8 + "beqz t6, 3f\n\t" // k8 == 0 ? + "vsetvli zero, zero, e32, m2\n\t" + + "2:\n\t" + // start subpack_n4k8 + "vlsw.v v0, (%0), %8\n\t" + "vlsw.v v2, (%1), %8\n\t" + "vlsw.v v4, (%2), %8\n\t" + "vlsw.v v6, (%3), %8\n\t" + "add %0, %0, t0\n\t" + "add %1, %1, t0\n\t" + "add %2, %2, t0\n\t" + "add %3, %3, t0\n\t" + + "vssw.v v0, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v2, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v4, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v6, (%4), %9\n\t" + "addi %4, %4, -12\n\t" + "addi %4, %4, 128\n\t" // sb += 32 * 4 + + "addi t6, t6, -1\n\t" // k8-- + "bnez t6, 2b\n\t" + + "3:\n\t" + "beqz %7, 4f\n\t" // k_tail == 0 ? + // Processing k_tail + "vsetvli zero, %7, e32, m2\n\t" + "vlsw.v v0, (%0), %8\n\t" + "vlsw.v v2, (%1), %8\n\t" + "vlsw.v v4, (%2), %8\n\t" + "vlsw.v v6, (%3), %8\n\t" + + "vssw.v v0, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v2, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v4, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v6, (%4), %9\n\t" + "addi %4, %4, -12\n\t" + "add %4, %4, t1\n\t" // sb += k_tail * 4 * 4 + + "4:\n\t" + // end packn4 + "addi %10, %10, 16\n\t" // b += 4 * 4 + "addi %5, %5, -1\n\t" // n4-- + "bnez %5, 1b\n\t" + + : "=r"(b0), // %0 + "=r"(b1), // %1 + "=r"(b2), // %2 + "=r"(b3), // %3 + "=r"(sb), // %4 + "=r"(n4), // %5 + "=r"(k8), // %6 + "=r"(k_tail), // %7 + "=r"(load_stride), // %8 + "=r"(store_stride), // %9 + "=r"(b), // %10 + "=r"(ldx) // %11 + : "0"(b0), "1"(b1), "2"(b2), "3"(b3), "4"(sb), "5"(n4), "6"(k8), "7"(k_tail), + "8"(load_stride), "9"(store_stride), "10"(b), "11"(ldx) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "t0", "t1", "t6"); + } + int n_tail = n & 3; + if (n_tail > 0) { + float *b0 = b; + int k_tail = k & 7; + int load_stride = 4 * 
ldx; + asm volatile( + "slli t0, %7, 5\n\t" // t0 = 8 * ldx * 4 + "slli t1, %4, 2\n\t" // t1 = k_tail * 4 + + "1:\n\t" + // pack remain n_tail cols one by one + "mv %0, %6\n\t" // b0 = b + "mv t3, %3\n\t" // t3 = k8 + "beqz t3, 3f\n\t" // k8 == 0 ? + "vsetvli zero, zero, e32, m2\n\t" + + "2:\n\t" + // start subpack_n1k8 + "vlsw.v v0, (%0), %5\n\t" + "add %0, %0, t0\n\t" + "vsw.v v0, (%1)\n\t" + "addi %1, %1, 32\n\t" // sb += 8 * 4 + + "addi t3, t3, -1\n\t" // k8-- + "bnez t3, 2b\n\t" + + "3:\n\t" + "beqz %4, 4f\n\t" // k_tail == 0 ? + // Processing k_tail + "vsetvli zero, %4, e32, m2\n\t" + "vlsw.v v0, (%0), %5\n\t" + "vsw.v v0, (%1)\n\t" + "add %1, %1, t1\n\t" + + "4:\n\t" + // end packn1 + "addi %6, %6, 4\n\t" // b += 1 * 4 + "addi %2, %2, -1\n\t" + "bnez %2, 1b\n\t" + + : "=r"(b0), // %0 + "=r"(sb), // %1 + "=r"(n_tail), // %2 + "=r"(k8), // %3 + "=r"(k_tail), // %4 + "=r"(load_stride), // %5 + "=r"(b), // %6 + "=r"(ldx) // %7 + : "0"(b0), "1"(sb), "2"(n_tail), "3"(k8), "4"(k_tail), "5"(load_stride), "6"(b), + "7"(ldx) + : "v0", "v1", "t0", "t1", "t3"); + } +#else + int i = 0; + for (; i + 3 < n; i += 4) { + const float *p0 = b + i; + const float *p1 = b + 1 * ldx + i; + const float *p2 = b + 2 * ldx + i; + const float *p3 = b + 3 * ldx + i; + + const float *p4 = b + 4 * ldx + i; + const float *p5 = b + 5 * ldx + i; + const float *p6 = b + 6 * ldx + i; + const float *p7 = b + 7 * ldx + i; + + int j = 0; + for (; j + 7 < k; j += 8) { + sb[0] = p0[0]; + sb[4] = p1[0]; + sb[1] = p0[1]; + sb[5] = p1[1]; + sb[2] = p0[2]; + sb[6] = p1[2]; + sb[3] = p0[3]; + sb[7] = p1[3]; + + sb[8] = p2[0]; + sb[12] = p3[0]; + sb[9] = p2[1]; + sb[13] = p3[1]; + sb[10] = p2[2]; + sb[14] = p3[2]; + sb[11] = p2[3]; + sb[15] = p3[3]; + + sb[16] = p4[0]; + sb[20] = p5[0]; + sb[17] = p4[1]; + sb[21] = p5[1]; + sb[18] = p4[2]; + sb[22] = p5[2]; + sb[19] = p4[3]; + sb[23] = p5[3]; + + sb[24] = p6[0]; + sb[28] = p7[0]; + sb[25] = p6[1]; + sb[29] = p7[1]; + sb[26] = p6[2]; + sb[30] = p7[2]; + 
sb[27] = p6[3]; + sb[31] = p7[3]; + + sb += 32; + p0 += 8 * ldx; + p1 += 8 * ldx; + p2 += 8 * ldx; + p3 += 8 * ldx; + p4 += 8 * ldx; + p5 += 8 * ldx; + p6 += 8 * ldx; + p7 += 8 * ldx; + } + if (j + 3 < k) { + j += 4; + sb[0] = p0[0]; + sb[1] = p0[1]; + sb[2] = p0[2]; + sb[3] = p0[3]; + + sb[4] = p1[0]; + sb[5] = p1[1]; + sb[6] = p1[2]; + sb[7] = p1[3]; + + sb[8] = p2[0]; + sb[9] = p2[1]; + sb[10] = p2[2]; + sb[11] = p2[3]; + + sb[12] = p3[0]; + sb[13] = p3[1]; + sb[14] = p3[2]; + sb[15] = p3[3]; + + sb += 16; + p0 += 4 * ldx; + p1 += 4 * ldx; + p2 += 4 * ldx; + p3 += 4 * ldx; + } + if (j + 1 < k) { + j += 2; + sb[0] = p0[0]; + sb[1] = p0[1]; + sb[2] = p0[2]; + sb[3] = p0[3]; + + sb[4] = p1[0]; + sb[5] = p1[1]; + sb[6] = p1[2]; + sb[7] = p1[3]; + + sb += 8; + p0 += 2 * ldx; + p1 += 2 * ldx; + } + if (j < k) { + sb[0] = p0[0]; + sb[1] = p0[1]; + sb[2] = p0[2]; + sb[3] = p0[3]; + + sb += 4; + p0 += ldx; + } + } + while (i < n) { + const float *p = b + i; + for (int j = 0; j < k; j++) { + *sb = *p; + sb++; + p += ldx; + } + i++; + } + +#endif // __riscv_vector +} + +void shl_c906_reorder_input_1(float *b, float *sb, int k, int n, int ldx) +{ + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 (4 x fp32 = the 16-byte steps below) + + "slli t2, %4, 2\n\t" // t2 = ldx * 4 (line stride) + + "srai t0, %3, 2\n\t" // t0 = n4 + "beqz t0, 3f\n\t" // jump to packn_tail + + "1:\n\t" // n4 + "mv a0, %0\n\t" + "addi %0, %0, 16\n\t" + "mv t1, %2\n\t" // k + + "2:\n\t" + // start packn4k1 + "vle.v v2, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n_tail + "andi t0, %3, 3\n\t" // n & 3u + "beqz t0, 8f\n\t" + + "srai t3, %2, 2\n\t" // k4 + "slli t5, %4, 4\n\t" // t5 = ldx * 4 * 4 (4 lines) + "andi t6, %2, 3\n\t" // k_tail + "slli t4, t6, 2\n\t" // k_tail * 4 + + "4:\n\t" + "mv a0, %0\n\t" + "addi %0, %0, 4\n\t" + "mv t1, t3\n\t" // t1 = k4 + "beqz t3, 6f\n\t" + + 
"5:\n\t" + "vsetvli zero, zero, e32, m1\n\t" + "vlse.v v2, (a0), t2\n\t" + "add a0, a0, t5\n\t" + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 5b\n\t" + + "6:\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vlse.v v2, (a0), t2\n\t" + "vse.v v2, (%1)\n\t" + "add %1, %1, t4\n\t" + + "7:\n\t" + "addi t0, t0, -1\n\t" + "bnez t0, 4b\n\t" + + "8:\n\t" // ending + + : "=r"(b), // %0 + "=r"(sb), // %1 + "=r"(k), // %2 + "=r"(n), // %3 + "=r"(ldx) // %4 + : "0"(b), "1"(sb), "2"(k), "3"(n), "4"(ldx) + : "v0", "v2", "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); +} + +static inline void kernel_m1_f32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias, bool fuse_relu) +{ + float *pa = sa; + float *pb = sb; + float *pc = dst; + DECOMPOSE_K + DECOMPOSE_N + +#if __riscv_vector == 128 + if (n4 > 0) { + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" + "flw ft0, (%8)\n\t" // bias + + "beqz %9, 1f\n\t" // if fuse_relu == 0 + "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu + + "1:\n\t" + // start kernel_m1n4 + "vfmv.v.f v24, ft0\n\t" // v24[0..3] = *bias + // "vlw.v v24, (%8)\n\t" // v24[0..3] = bias[0..3] + // "addi %8, %8, 16\n\t" + + "mv a1, %0\n\t" // a1 = pa + "mv t0, %3\n\t" // t0 = k8 + "beqz t0, 3f\n\t" // k8 == 0 ? 
+ + "2:\n\t" + // start subkernel_m1n4k8 + "vlw.v v1, (%1)\n\t" // load pb + "flw ft1, 0(a1)\n\t" // load pa + "vfmv.v.f v2, ft1\n\t" + "addi %1, %1, 16\n\t" // pb += 4 * 4 + "vfmacc.vv v24, v1, v2\n\t" // 0 + + "vlw.v v3, (%1)\n\t" + "flw ft2, 4(a1)\n\t" + "vfmv.v.f v4, ft2\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v3, v4\n\t" // 1 + + "vlw.v v5, (%1)\n\t" + "flw ft3, 8(a1)\n\t" + "vfmv.v.f v6, ft3\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v5, v6\n\t" // 2 + + "vlw.v v7, (%1)\n\t" + "flw ft4, 12(a1)\n\t" + "vfmv.v.f v8, ft4\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v7, v8\n\t" // 3 + + "vlw.v v9, (%1)\n\t" + "flw ft5, 16(a1)\n\t" + "vfmv.v.f v10, ft5\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v9, v10\n\t" // 4 + + "vlw.v v11, (%1)\n\t" + "flw ft6, 20(a1)\n\t" + "vfmv.v.f v12, ft6\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v11, v12\n\t" // 5 + + "vlw.v v13, (%1)\n\t" + "flw ft7, 24(a1)\n\t" + "vfmv.v.f v14, ft7\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v13, v14\n\t" // 6 + + "vlw.v v15, (%1)\n\t" + "flw ft8, 28(a1)\n\t" + "vfmv.v.f v16, ft8\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v15, v16\n\t" // 7 + "addi a1, a1, 32\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 2b\n\t" + + "3:\n\t" + "beqz %4, 4f\n\t" // k4 == 0 ? + // start subkernel_m1n4k4 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + + "vlw.v v3, (%1)\n\t" + "flw ft2, 4(a1)\n\t" + "vfmv.v.f v4, ft2\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v3, v4\n\t" // 1 + + "vlw.v v5, (%1)\n\t" + "flw ft3, 8(a1)\n\t" + "vfmv.v.f v6, ft3\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v5, v6\n\t" // 2 + + "vlw.v v7, (%1)\n\t" + "flw ft4, 12(a1)\n\t" + "vfmv.v.f v8, ft4\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v7, v8\n\t" // 3 + "addi a1, a1, 16\n\t" + + "4:\n\t" + "beqz %5, 5f\n\t" // k2 == 0 ? 
+ // start subkernel_m1n4k2 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + + "vlw.v v3, (%1)\n\t" + "flw ft2, 4(a1)\n\t" + "vfmv.v.f v4, ft2\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v3, v4\n\t" // 1 + "addi a1, a1, 8\n\t" + + "5:\n\t" + "beqz %6, 6f\n\t" // k1 == 0 ? + // start subkernel_m1n4k1 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + "addi a1, a1, 4\n\t" + + "6:\n\t" + "beqz %9, 7f\n\t" + // fused relu + "vfmax.vv v24, v24, v0\n\t" // **** relu **** + + "7:\n\t" + // end kernel_m1n4 + "vsw.v v24, (%2)\n\t" + "addi %2, %2, 16\n\t" // pc += 4 * 4 + + "addi %7, %7, -1\n\t" + "bnez %7, 1b\n\t" + + : "=r"(pa), // %0 + "=r"(pb), // %1 + "=r"(pc), // %2 + "=r"(k8), // %3 + "=r"(k4), // %4 + "=r"(k2), // %5 + "=r"(k1), // %6 + "=r"(n4), // %7 + "=r"(bias), // %8 + "=r"(fuse_relu) // %9 + : "0"(pa), "1"(pb), "2"(pc), "3"(k8), "4"(k4), "5"(k2), "6"(k1), "7"(n4), "8"(bias), + "9"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v24", "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", + "ft5", "ft6", "ft7", "ft8"); + } + if (n2 > 0) { + int k_tail = k & 7; + float *pb0 = pb; + float *pb1 = pb0 + k; + + asm volatile( + "fmv.w.x ft4, zero\n\t" // for fuse relu + "mv t4, %4\n\t" // t4 = k8 + "vsetvli zero, zero, e32, m2\n\t" + "vxor.vv v6, v6, v6\n\t" // clear + "vxor.vv v8, v8, v8\n\t" // clear + "flw ft0, 0(%6)\n\t" // ft0 = *bias + // "flw ft3, 4(%6)\n\t" // ft3 = *(bias + 1) + // "addi %6, %6, 8\n\t" + "vfmv.s.f v10, ft0\n\t" // v10[0] = ft0 + "vfmv.s.f v12, ft0\n\t" // v12[0] = ft0 + // "vfmv.s.f v12, ft3\n\t" // v12[0] = ft3 + + "beqz %5, 1f\n\t" // k_tail == 0 ? 
+ // Processing k_tail + "slli t0, %5, 2\n\t" // t0 = k_tail * 4 + "vsetvli zero, %5, e32, m2\n\t" + "vlw.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + "vlw.v v2, (%1)\n\t" + "add %1, %1, t0\n\t" + "vlw.v v4, (%2)\n\t" + "add %2, %2, t0\n\t" + "vfmacc.vv v6, v0, v2\n\t" + "vfmacc.vv v8, v0, v4\n\t" + "beqz t4, 2f\n\t" // k8 == 0 ? + "vsetvli zero, zero, e32, m2\n\t" + + "1:\n\t" + // start subkernel_m1n2k8 + "vlw.v v0, (%0)\n\t" + "addi %0, %0, 32\n\t" + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 32\n\t" + "vlw.v v4, (%2)\n\t" + "addi %2, %2, 32\n\t" + "vfmacc.vv v6, v0, v2\n\t" + "vfmacc.vv v8, v0, v4\n\t" + "addi t4, t4, -1\n\t" + "bnez t4, 1b\n\t" + + "2:\n\t" + // end kernel_m1n2 + "vfredsum.vs v10, v6, v10\n\t" // v10[0] = v10[0] + sum(v6[0..i]) + "vfredsum.vs v12, v8, v12\n\t" // v12[0] = v12[0] + sum(v8[0..i]) + "vfmv.f.s ft1, v10\n\t" + "vfmv.f.s ft2, v12\n\t" + + "beqz %7, 3f\n\t" + // fuse relu + "fmax.s ft1, ft1, ft4\n\t" // **** relu **** + "fmax.s ft2, ft2, ft4\n\t" // **** relu **** + + "3:\n\t" + + "fsw ft1, 0(%3)\n\t" + "fsw ft2, 4(%3)\n\t" + + : "=r"(pa), // %0 + "=r"(pb0), // %1 + "=r"(pb1), // %2 + "=r"(pc), // %3 + "=r"(k8), // %4 + "=r"(k_tail), // %5 + "=r"(bias), // %6 + "=r"(fuse_relu) // %7 + : "0"(pa), "1"(pb0), "2"(pb1), "3"(pc), "4"(k8), "5"(k_tail), "6"(bias), "7"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "ft0", "ft1", "ft2", "ft3", "ft4", "t0", "t4"); + pb += 2 * k; + pc += 2; + } + if (n1 > 0) { + pa = sa; + int k_tail = k & 7; + asm volatile( + "fmv.w.x ft2, zero\n\t" // for fuse relu + "vsetvli zero, zero, e32, m2\n\t" + "vxor.vv v4, v4, v4\n\t" // clear + + "flw ft0, 0(%5)\n\t" // ft0 = *bias + "vfmv.s.f v6, ft0\n\t" // v6[0] = ft0 + + "beqz %4, 1f\n\t" // k_tail == 0 ? 
+ // Processing k_tail + "slli t0, %4, 2\n\t" // t0 = k_tail * 4 + "vsetvli zero, %4, e32, m2\n\t" + "vlw.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + "vlw.v v2, (%1)\n\t" + "add %1, %1, t0\n\t" + "vfmacc.vv v4, v0, v2\n\t" + "beqz %3, 2f\n\t" // k8 == 0 ? + "vsetvli zero, zero, e32, m2\n\t" + + "1:\n\t" + // start subkernel_m1n1k8 + "vlw.v v0, (%0)\n\t" + "addi %0, %0, 32\n\t" + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 32\n\t" + "vfmacc.vv v4, v0, v2\n\t" + "addi %3, %3, -1\n\t" + "bnez %3, 1b\n\t" + + "2:\n\t" + // end kernel_m1n1 + "vfredsum.vs v6, v4, v6\n\t" // v6[0] = v6[0] + sum(v4[0..i]) + "vfmv.f.s ft1, v6\n\t" + + "beqz %6, 3f\n\t" + // fused relu + "fmax.s ft1, ft1, ft2\n\t" // **** relu **** + + "3:\n\t" + "fsw ft1, 0(%2)\n\t" + + : "=r"(pa), // %0 + "=r"(pb), // %1 + "=r"(pc), // %2 + "=r"(k8), // %3 + "=r"(k_tail), // %4 + "=r"(bias), // %5 + "=r"(fuse_relu) // %6 + : "0"(pa), "1"(pb), "2"(pc), "3"(k8), "4"(k_tail), "5"(bias), "6"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ft0", "ft1", "ft2", "t0"); + } +#else + for (int i = 0; i < n4; i++) { + int j = 0; + pa = sa; + pc[0] = pc[1] = pc[2] = pc[3] = *bias; + for (; j + 7 < k; j += 8) { + pc[0] += pa[0] * pb[0]; + pc[1] += pa[0] * pb[1]; + pc[2] += pa[0] * pb[2]; + pc[3] += pa[0] * pb[3]; + + pc[0] += pa[1] * pb[4]; + pc[1] += pa[1] * pb[5]; + pc[2] += pa[1] * pb[6]; + pc[3] += pa[1] * pb[7]; + + pc[0] += pa[2] * pb[8]; + pc[1] += pa[2] * pb[9]; + pc[2] += pa[2] * pb[10]; + pc[3] += pa[2] * pb[11]; + + pc[0] += pa[3] * pb[12]; + pc[1] += pa[3] * pb[13]; + pc[2] += pa[3] * pb[14]; + pc[3] += pa[3] * pb[15]; + + pc[0] += pa[4] * pb[16]; + pc[1] += pa[4] * pb[17]; + pc[2] += pa[4] * pb[18]; + pc[3] += pa[4] * pb[19]; + + pc[0] += pa[5] * pb[20]; + pc[1] += pa[5] * pb[21]; + pc[2] += pa[5] * pb[22]; + pc[3] += pa[5] * pb[23]; + + pc[0] += pa[6] * pb[24]; + pc[1] += pa[6] * pb[25]; + pc[2] += pa[6] * pb[26]; + pc[3] += pa[6] * pb[27]; + + pc[0] += pa[7] * pb[28]; + pc[1] += pa[7] * pb[29]; + 
pc[2] += pa[7] * pb[30]; + pc[3] += pa[7] * pb[31]; + + pa += 8; + pb += 32; + } + if (j + 3 < k) { + j += 4; + pc[0] += pa[0] * pb[0]; + pc[1] += pa[0] * pb[1]; + pc[2] += pa[0] * pb[2]; + pc[3] += pa[0] * pb[3]; + + pc[0] += pa[1] * pb[4]; + pc[1] += pa[1] * pb[5]; + pc[2] += pa[1] * pb[6]; + pc[3] += pa[1] * pb[7]; + + pc[0] += pa[2] * pb[8]; + pc[1] += pa[2] * pb[9]; + pc[2] += pa[2] * pb[10]; + pc[3] += pa[2] * pb[11]; + + pc[0] += pa[3] * pb[12]; + pc[1] += pa[3] * pb[13]; + pc[2] += pa[3] * pb[14]; + pc[3] += pa[3] * pb[15]; + + pa += 4; + pb += 16; + } + if (j + 1 < k) { + j += 2; + pc[0] += pa[0] * pb[0]; + pc[1] += pa[0] * pb[1]; + pc[2] += pa[0] * pb[2]; + pc[3] += pa[0] * pb[3]; + + pc[0] += pa[1] * pb[4]; + pc[1] += pa[1] * pb[5]; + pc[2] += pa[1] * pb[6]; + pc[3] += pa[1] * pb[7]; + + pa += 2; + pb += 8; + } + if (j < k) { + pc[0] += pa[0] * pb[0]; + pc[1] += pa[0] * pb[1]; + pc[2] += pa[0] * pb[2]; + pc[3] += pa[0] * pb[3]; + + pa += 1; + pb += 4; + } + if (fuse_relu) { + pc[0] = pc[0] > 0 ? pc[0] : 0; + pc[1] = pc[1] > 0 ? pc[1] : 0; + pc[2] = pc[2] > 0 ? pc[2] : 0; + pc[3] = pc[3] > 0 ? 
pc[3] : 0; + } + pc += 4; + } + if (n2 > 0) { + pa = sa; + pc[0] = pc[1] = *bias; + float *pb0 = pb; + float *pb1 = pb0 + k; + int j = 0; + for (; j + 7 < k; j += 8) { + pc[0] += pa[0] * pb0[0]; + pc[1] += pa[0] * pb1[0]; + + pc[0] += pa[1] * pb0[1]; + pc[1] += pa[1] * pb1[1]; + + pc[0] += pa[2] * pb0[2]; + pc[1] += pa[2] * pb1[2]; + + pc[0] += pa[3] * pb0[3]; + pc[1] += pa[3] * pb1[3]; + + pc[0] += pa[4] * pb0[4]; + pc[1] += pa[4] * pb1[4]; + + pc[0] += pa[5] * pb0[5]; + pc[1] += pa[5] * pb1[5]; + + pc[0] += pa[6] * pb0[6]; + pc[1] += pa[6] * pb1[6]; + + pc[0] += pa[7] * pb0[7]; + pc[1] += pa[7] * pb1[7]; + + pa += 8; + pb0 += 8; + pb1 += 8; + } + if (j + 3 < k) { + j += 4; + pc[0] += pa[0] * pb0[0]; + pc[1] += pa[0] * pb1[0]; + + pc[0] += pa[1] * pb0[1]; + pc[1] += pa[1] * pb1[1]; + + pc[0] += pa[2] * pb0[2]; + pc[1] += pa[2] * pb1[2]; + + pc[0] += pa[3] * pb0[3]; + pc[1] += pa[3] * pb1[3]; + + pa += 4; + pb0 += 4; + pb1 += 4; + } + if (j + 1 < k) { + j += 2; + pc[0] += pa[0] * pb0[0]; + pc[1] += pa[0] * pb1[0]; + + pc[0] += pa[1] * pb0[1]; + pc[1] += pa[1] * pb1[1]; + + pa += 2; + pb0 += 2; + pb1 += 2; + } + if (j < k) { + pc[0] += pa[0] * pb0[0]; + pc[1] += pa[0] * pb1[0]; + + pa += 1; + pb0 += 1; + pb1 += 1; + } + if (fuse_relu) { + pc[0] = pc[0] > 0 ? pc[0] : 0; + pc[1] = pc[1] > 0 ? 
pc[1] : 0; + } + pc += 2; + pb += 2 * k; + } + if (n1 > 0) { + pa = sa; + pc[0] = *bias; + int j = 0; + for (; j + 7 < k; j += 8) { + pc[0] += pa[0] * pb[0]; + pc[0] += pa[1] * pb[1]; + pc[0] += pa[2] * pb[2]; + pc[0] += pa[3] * pb[3]; + pc[0] += pa[4] * pb[4]; + pc[0] += pa[5] * pb[5]; + pc[0] += pa[6] * pb[6]; + pc[0] += pa[7] * pb[7]; + + pa += 8; + pb += 8; + } + if (j + 3 < k) { + j += 4; + pc[0] += pa[0] * pb[0]; + pc[0] += pa[1] * pb[1]; + pc[0] += pa[2] * pb[2]; + pc[0] += pa[3] * pb[3]; + + pa += 4; + pb += 4; + } + if (j + 1 < k) { + j += 2; + pc[0] += pa[0] * pb[0]; + pc[0] += pa[1] * pb[1]; + + pa += 2; + pb += 2; + } + if (j < k) { + pc[0] += pa[0] * pb[0]; + + pa += 1; + pb += 1; + } + if (fuse_relu) { + pc[0] = pc[0] > 0 ? pc[0] : 0; + } + pc += 1; + } +#endif // __riscv_vector +} +
+/**
+ * Two-row fp32 GEMM micro-kernel: fills two output rows of n columns each.
+ *
+ * For every output column c:
+ *     dst[c]       = bias[0] + sum over j < k of A(row 0, j) * B(j, c)
+ *     dst[ldc + c] = bias[1] + sum over j < k of A(row 1, j) * B(j, c)
+ * and, when fuse_relu is set, every negative result is replaced by 0.
+ *
+ * Operand layout, as read by the code below:
+ *   sa   - the two A rows interleaved per k step: sa[2 * j] is row 0, sa[2 * j + 1] is row 1.
+ *   sb   - column tiles stored back to back. A 4-column tile holds 4 consecutive floats per
+ *          k step (4 * k floats in total); a 2-column tail holds two runs of k floats, one
+ *          per column; a 1-column tail holds k floats.
+ *   bias - two floats, one per output row.
+ *   ldc  - distance, in floats, between the two output rows.
+ *   m, n - not referenced directly in this function body.
+ *
+ * DECOMPOSE_K / DECOMPOSE_N are defined elsewhere in this file; this function relies on them
+ * to declare k8/k4/k2/k1 and n4/n2/n1.
+ * NOTE(review): presumably k8 = k / 8 with k4/k2/k1 flagging the remaining 4/2/1 steps, and
+ * n4 = n / 4 with n2/n1 flagging the column tails - confirm against the macro definitions.
+ */
+static inline void kernel_m2_f32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc,
+                                 float *bias, bool fuse_relu)
+{
+    float *pa = sa;
+    float *pb = sb;
+    float *pc0 = dst;
+    float *pc1 = pc0 + ldc;
+    DECOMPOSE_K
+    DECOMPOSE_N
+#if __riscv_vector == 128
+    if (n4 > 0) {
+        // 4-column tiles: v24 / v25 accumulate row 0 / row 1 of the current tile.
+        asm volatile(
+            "vsetvli zero, zero, e32, m1\n\t"
+            "flw ft0, (%9)\n\t"     // ft0 = *bias
+            "flw ft10, 4(%9)\n\t"   // ft10 = *(bias + 1)
+
+            "beqz %10, 1f\n\t"      // if fuse_relu == 0
+            "vmv.v.x v0, zero\n\t"  // v0 hold const zero, using for relu
+
+            "1:\n\t"  // n4
+            // start kernel_m2n4
+            "vfmv.v.f v24, ft0\n\t"   // v24[0..3] = ft0 = *bias
+            "vfmv.v.f v25, ft10\n\t"  // v25[0..3] = ft10 = *(bias + 1)
+
+            "mv a1, %0\n\t"    // a1 = pa (every tile restarts at the beginning of A)
+            "mv t0, %4\n\t"    // t0 = k8
+            "beqz t0, 3f\n\t"  // k8 == 0 ?
+
+            "2:\n\t"
+            // start subkernel_m2n4k8
+            "vlw.v v1, (%1)\n\t"
+            "flw ft1, 0(a1)\n\t"
+            "vfmv.v.f v2, ft1\n\t"
+            "flw fa1, 4(a1)\n\t"
+            "vfmv.v.f v3, fa1\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v1, v2\n\t"  // 0
+            "vfmacc.vv v25, v1, v3\n\t"
+
+            "vlw.v v4, (%1)\n\t"
+            "flw ft2, 8(a1)\n\t"
+            "vfmv.v.f v5, ft2\n\t"
+            "flw fa2, 12(a1)\n\t"
+            "vfmv.v.f v6, fa2\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v4, v5\n\t"  // 1
+            "vfmacc.vv v25, v4, v6\n\t"
+
+            "vlw.v v7, (%1)\n\t"
+            "flw ft3, 16(a1)\n\t"
+            "vfmv.v.f v8, ft3\n\t"
+            "flw fa3, 20(a1)\n\t"
+            "vfmv.v.f v9, fa3\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v7, v8\n\t"  // 2
+            "vfmacc.vv v25, v7, v9\n\t"
+
+            "vlw.v v10, (%1)\n\t"
+            "flw ft4, 24(a1)\n\t"
+            "vfmv.v.f v11, ft4\n\t"
+            "flw fa4, 28(a1)\n\t"
+            "vfmv.v.f v12, fa4\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v10, v11\n\t"  // 3
+            "vfmacc.vv v25, v10, v12\n\t"
+
+            "vlw.v v13, (%1)\n\t"
+            "flw ft5, 32(a1)\n\t"
+            "vfmv.v.f v14, ft5\n\t"
+            "flw fa5, 36(a1)\n\t"
+            "vfmv.v.f v15, fa5\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v13, v14\n\t"  // 4
+            "vfmacc.vv v25, v13, v15\n\t"
+
+            "vlw.v v16, (%1)\n\t"
+            "flw ft6, 40(a1)\n\t"
+            "vfmv.v.f v17, ft6\n\t"
+            "flw fa6, 44(a1)\n\t"
+            "vfmv.v.f v18, fa6\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v16, v17\n\t"  // 5
+            "vfmacc.vv v25, v16, v18\n\t"
+
+            "vlw.v v19, (%1)\n\t"
+            "flw ft7, 48(a1)\n\t"
+            "vfmv.v.f v20, ft7\n\t"
+            "flw fa7, 52(a1)\n\t"
+            "vfmv.v.f v21, fa7\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v19, v20\n\t"  // 6
+            "vfmacc.vv v25, v19, v21\n\t"
+
+            "vlw.v v28, (%1)\n\t"
+            "flw ft8, 56(a1)\n\t"
+            "vfmv.v.f v29, ft8\n\t"
+            "flw fa0, 60(a1)\n\t"
+            "vfmv.v.f v30, fa0\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v28, v29\n\t"  // 7
+            "vfmacc.vv v25, v28, v30\n\t"
+            "addi a1, a1, 64\n\t"
+
+            "addi t0, t0, -1\n\t"
+            "bnez t0, 2b\n\t"
+
+            "3:\n\t"
+            "beqz %5, 4f\n\t"  // k4 == 0 ?
+            // start subkernel_m2n4k4
+            "vlw.v v1, (%1)\n\t"
+            "flw ft1, 0(a1)\n\t"
+            "vfmv.v.f v2, ft1\n\t"
+            "flw fa1, 4(a1)\n\t"
+            "vfmv.v.f v3, fa1\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v1, v2\n\t"  // 0
+            "vfmacc.vv v25, v1, v3\n\t"
+
+            "vlw.v v4, (%1)\n\t"
+            "flw ft2, 8(a1)\n\t"
+            "vfmv.v.f v5, ft2\n\t"
+            "flw fa2, 12(a1)\n\t"
+            "vfmv.v.f v6, fa2\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v4, v5\n\t"  // 1
+            "vfmacc.vv v25, v4, v6\n\t"
+
+            "vlw.v v7, (%1)\n\t"
+            "flw ft3, 16(a1)\n\t"
+            "vfmv.v.f v8, ft3\n\t"
+            "flw fa3, 20(a1)\n\t"
+            "vfmv.v.f v9, fa3\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v7, v8\n\t"  // 2
+            "vfmacc.vv v25, v7, v9\n\t"
+
+            "vlw.v v10, (%1)\n\t"
+            "flw ft4, 24(a1)\n\t"
+            "vfmv.v.f v11, ft4\n\t"
+            "flw fa4, 28(a1)\n\t"
+            "vfmv.v.f v12, fa4\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v10, v11\n\t"  // 3
+            "vfmacc.vv v25, v10, v12\n\t"
+            "addi a1, a1, 32\n\t"
+
+            "4:\n\t"
+            "beqz %6, 5f\n\t"  // k2 == 0 ?
+            // start subkernel_m2n4k2
+            "vlw.v v1, (%1)\n\t"
+            "flw ft1, 0(a1)\n\t"
+            "vfmv.v.f v2, ft1\n\t"
+            "flw fa1, 4(a1)\n\t"
+            "vfmv.v.f v3, fa1\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v1, v2\n\t"  // 0
+            "vfmacc.vv v25, v1, v3\n\t"
+
+            "vlw.v v4, (%1)\n\t"
+            "flw ft2, 8(a1)\n\t"
+            "vfmv.v.f v5, ft2\n\t"
+            "flw fa2, 12(a1)\n\t"
+            "vfmv.v.f v6, fa2\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v4, v5\n\t"  // 1
+            "vfmacc.vv v25, v4, v6\n\t"
+            "addi a1, a1, 16\n\t"
+
+            "5:\n\t"
+            "beqz %7, 6f\n\t"  // k1 == 0 ?
+            // start subkernel_m2n4k1
+            "vlw.v v1, (%1)\n\t"
+            "flw ft1, 0(a1)\n\t"
+            "vfmv.v.f v2, ft1\n\t"
+            "flw fa1, 4(a1)\n\t"
+            "vfmv.v.f v3, fa1\n\t"
+            "addi %1, %1, 16\n\t"
+            "vfmacc.vv v24, v1, v2\n\t"  // 0
+            "vfmacc.vv v25, v1, v3\n\t"
+            "addi a1, a1, 8\n\t"
+
+            "6:\n\t"
+            "beqz %10, 7f\n\t"
+            // fused relu: clamp BOTH rows (the first vfmax used to target v25 twice,
+            // leaving row 0 unclamped)
+            "vfmax.vv v24, v24, v0\n\t"  // **** relu ****
+            "vfmax.vv v25, v25, v0\n\t"  // **** relu ****
+
+            "7:\n\t"
+            // end kernel_m2n4
+            "vsw.v v24, (%2)\n\t"  // pc0[0..3] = v24
+            "addi %2, %2, 16\n\t"
+            "vsw.v v25, (%3)\n\t"  // pc1[0..3] = v25
+            "addi %3, %3, 16\n\t"
+
+            "addi %8, %8, -1\n\t"
+            "bnez %8, 1b\n\t"
+
+            : "=r"(pa),        // %0
+              "=r"(pb),        // %1
+              "=r"(pc0),       // %2
+              "=r"(pc1),       // %3
+              "=r"(k8),        // %4
+              "=r"(k4),        // %5
+              "=r"(k2),        // %6
+              "=r"(k1),        // %7
+              "=r"(n4),        // %8
+              "=r"(bias),      // %9
+              "=r"(fuse_relu)  // %10
+            : "0"(pa), "1"(pb), "2"(pc0), "3"(pc1), "4"(k8), "5"(k4), "6"(k2), "7"(k1), "8"(n4),
+              "9"(bias), "10"(fuse_relu)
+            // v0 is written when fuse_relu is set, so it must be listed; "memory" because the
+            // asm reads and writes through the pointer operands.
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+              "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v24", "v25", "v28",
+              "v29", "v30", "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7",
+              "ft8", "ft9", "ft10", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7",
+              "memory");
+    }
+    if (n2 > 0) {
+        // 2-column tail: four dot products (2 rows x 2 columns), vectorised along k.
+        int k_tail = k & 7;
+        float *pa0 = sa;       // row 0 elements of the interleaved A panel
+        float *pa1 = pa0 + 1;  // row 1 elements of the interleaved A panel
+        float *pb0 = pb;
+        float *pb1 = pb0 + k;
+        int load_stride = 8;  // bytes between consecutive elements of one A row
+
+        // NOTE(review): when k_tail == 0 control jumps straight into the k8 loop without
+        // testing k8, so k == 0 would wrap the loop counter - this path appears to assume k > 0.
+        asm volatile(
+            "fmv.w.x ft6, zero\n\t"  // for fuse relu
+            "mv t6, %6\n\t"          // t6 = k8
+            "vsetvli zero, zero, e32, m2\n\t"
+            "vxor.vv v8, v8, v8\n\t"     // clear
+            "vxor.vv v10, v10, v10\n\t"  // clear
+            "vxor.vv v12, v12, v12\n\t"  // clear
+            "vxor.vv v14, v14, v14\n\t"  // clear
+            "flw ft0, 0(%8)\n\t"         // ft0 = *bias
+            "flw ft1, 4(%8)\n\t"         // ft1 = *(bias + 1)
+            "vfmv.s.f v16, ft0\n\t"      // v16[0] = ft0
+            "vfmv.s.f v18, ft0\n\t"      // v18[0] = ft0
+            "vfmv.s.f v20, ft1\n\t"      // v20[0] = ft1
+            "vfmv.s.f v22, ft1\n\t"      // v22[0] = ft1
+
+            "beqz %7, 1f\n\t"  // k_tail == 0 ?
+            // Processing k_tail
+            "slli t0, %7, 2\n\t"  // t0 = k_tail * 4
+            "slli t1, t0, 1\n\t"  // t1 = t0 * 2
+            "vsetvli zero, %7, e32, m2\n\t"
+            "vlsw.v v0, (%0), %9\n\t"
+            "add %0, %0, t1\n\t"
+            "vlsw.v v2, (%1), %9\n\t"
+            "addi %1, %0, 4\n\t"
+
+            "vlw.v v4, (%2)\n\t"
+            "add %2, %2, t0\n\t"
+            "vlw.v v6, (%3)\n\t"
+            "add %3, %3, t0\n\t"
+
+            "vfmacc.vv v8, v0, v4\n\t"
+            "vfmacc.vv v10, v0, v6\n\t"
+            "vfmacc.vv v12, v2, v4\n\t"
+            "vfmacc.vv v14, v2, v6\n\t"
+            "beqz t6, 2f\n\t"  // k8 == 0 ?
+            "vsetvli zero, zero, e32, m2\n\t"
+
+            "1:\n\t"
+            // start subkernel_m2n2k8
+            "vlsw.v v0, (%0), %9\n\t"
+            "addi %0, %0, 64\n\t"
+            "vlsw.v v2, (%1), %9\n\t"
+            "addi %1, %0, 4\n\t"
+
+            "vlw.v v4, (%2)\n\t"
+            "addi %2, %2, 32\n\t"
+            "vlw.v v6, (%3)\n\t"
+            "addi %3, %3, 32\n\t"
+
+            "vfmacc.vv v8, v0, v4\n\t"
+            "vfmacc.vv v10, v0, v6\n\t"
+            "vfmacc.vv v12, v2, v4\n\t"
+            "vfmacc.vv v14, v2, v6\n\t"
+            "addi t6, t6, -1\n\t"
+            "bnez t6, 1b\n\t"
+
+            "2:\n\t"
+            // end kernel_m2n2
+            "vfredsum.vs v16, v8, v16\n\t"   // v16[0] = v16[0] + sum(v8[0..i])
+            "vfredsum.vs v18, v10, v18\n\t"  // v18[0] = v18[0] + sum(v10[0..i])
+            "vfredsum.vs v20, v12, v20\n\t"  // v20[0] = v20[0] + sum(v12[0..i])
+            "vfredsum.vs v22, v14, v22\n\t"  // v22[0] = v22[0] + sum(v14[0..i])
+            "vfmv.f.s ft2, v16\n\t"
+            "vfmv.f.s ft3, v18\n\t"
+            "vfmv.f.s ft4, v20\n\t"
+            "vfmv.f.s ft5, v22\n\t"
+
+            "beqz %10, 3f\n\t"
+            // fuse relu
+            "fmax.s ft2, ft2, ft6\n\t"  // **** relu ****
+            "fmax.s ft3, ft3, ft6\n\t"  // **** relu ****
+            "fmax.s ft4, ft4, ft6\n\t"  // **** relu ****
+            "fmax.s ft5, ft5, ft6\n\t"  // **** relu ****
+
+            "3:\n\t"
+
+            "fsw ft2, 0(%4)\n\t"
+            "fsw ft3, 4(%4)\n\t"
+            "fsw ft4, 0(%5)\n\t"
+            "fsw ft5, 4(%5)\n\t"
+
+            : "=r"(pa0),          // %0
+              "=r"(pa1),          // %1
+              "=r"(pb0),          // %2
+              "=r"(pb1),          // %3
+              "=r"(pc0),          // %4
+              "=r"(pc1),          // %5
+              "=r"(k8),           // %6
+              "=r"(k_tail),       // %7
+              "=r"(bias),         // %8
+              "=r"(load_stride),  // %9
+              "=r"(fuse_relu)     // %10
+            : "0"(pa0), "1"(pa1), "2"(pb0), "3"(pb1), "4"(pc0), "5"(pc1), "6"(k8), "7"(k_tail),
+              "8"(bias), "9"(load_stride), "10"(fuse_relu)
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+              "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "ft0",
+              "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "t0", "t1", "t6", "memory");
+        pb += 2 * k;
+        pc0 += 2;
+        pc1 += 2;
+    }
+    if (n1 > 0) {
+        // 1-column tail: two dot products (one per row), vectorised along k.
+        float *pa0 = sa;
+        float *pa1 = pa0 + 1;
+        int k8 = k >> 3;
+        int k_tail = k & 7;
+        int load_stride = 8;  // bytes between consecutive elements of one A row
+
+        // NOTE(review): same k > 0 assumption as the 2-column tail above.
+        asm volatile(
+            "fmv.w.x ft4, zero\n\t"  // for fuse relu
+            "mv t5, %5\n\t"          // t5 = k8
+            "vsetvli zero, zero, e32, m2\n\t"
+            "vxor.vv v6, v6, v6\n\t"  // clear
+            "vxor.vv v8, v8, v8\n\t"  // clear
+            "flw ft0, 0(%7)\n\t"      // ft0 = *bias
+            "flw ft1, 4(%7)\n\t"      // ft1 = *(bias + 1)
+            "vfmv.s.f v10, ft0\n\t"   // v10[0] = ft0
+            "vfmv.s.f v12, ft1\n\t"   // v12[0] = ft1
+
+            "beqz %6, 1f\n\t"  // k_tail == 0 ?
+            // Processing k_tail
+            "slli t0, %6, 2\n\t"  // t0 = k_tail * 4
+            "slli t1, t0, 1\n\t"  // t1 = t0 * 2
+            "vsetvli zero, %6, e32, m2\n\t"
+            "vlsw.v v0, (%0), %8\n\t"
+            "add %0, %0, t1\n\t"
+            "vlsw.v v2, (%1), %8\n\t"
+            "addi %1, %0, 4\n\t"
+
+            "vlw.v v4, (%2)\n\t"
+            "add %2, %2, t0\n\t"
+
+            "vfmacc.vv v6, v0, v4\n\t"
+            "vfmacc.vv v8, v2, v4\n\t"
+            "beqz t5, 2f\n\t"  // k8 == 0 ?
+            "vsetvli zero, zero, e32, m2\n\t"
+
+            "1:\n\t"
+            // start subkernel_m2n1k8
+            "vlsw.v v0, (%0), %8\n\t"
+            "addi %0, %0, 64\n\t"
+            "vlsw.v v2, (%1), %8\n\t"
+            "addi %1, %0, 4\n\t"
+
+            "vlw.v v4, (%2)\n\t"
+            "addi %2, %2, 32\n\t"
+
+            "vfmacc.vv v6, v0, v4\n\t"
+            "vfmacc.vv v8, v2, v4\n\t"
+            "addi t5, t5, -1\n\t"
+            "bnez t5, 1b\n\t"
+
+            "2:\n\t"
+            // end kernel_m2n1
+            "vfredsum.vs v10, v6, v10\n\t"  // v10[0] = v10[0] + sum(v6[0..i])
+            "vfredsum.vs v12, v8, v12\n\t"  // v12[0] = v12[0] + sum(v8[0..i])
+            "vfmv.f.s ft2, v10\n\t"
+            "vfmv.f.s ft3, v12\n\t"
+
+            "beqz %9, 3f\n\t"
+            // fuse relu: clamp each row's own result (both lines used to compute
+            // ft2 = max(ft3, 0), clobbering row 0 and leaving row 1 unclamped)
+            "fmax.s ft2, ft2, ft4\n\t"  // **** relu ****
+            "fmax.s ft3, ft3, ft4\n\t"  // **** relu ****
+
+            "3:\n\t"
+            "fsw ft2, 0(%3)\n\t"
+            "fsw ft3, 0(%4)\n\t"
+
+            : "=r"(pa0),          // %0
+              "=r"(pa1),          // %1
+              "=r"(pb),           // %2
+              "=r"(pc0),          // %3
+              "=r"(pc1),          // %4
+              "=r"(k8),           // %5
+              "=r"(k_tail),       // %6
+              "=r"(bias),         // %7
+              "=r"(load_stride),  // %8
+              "=r"(fuse_relu)     // %9
+            : "0"(pa0), "1"(pa1), "2"(pb), "3"(pc0), "4"(pc1), "5"(k8), "6"(k_tail), "7"(bias),
+              "8"(load_stride), "9"(fuse_relu)
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+              "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "ft0",
+              "ft1", "ft2", "ft3", "ft4", "t0", "t1", "t5", "memory");
+    }
+#else
+    // Portable scalar path. Every output element is accumulated directly in dst, one update
+    // per k step in increasing k order.
+    for (int i = 0; i < n4; i++) {
+        pa = sa;
+        pc0[0] = pc0[1] = pc0[2] = pc0[3] = *bias;
+        pc1[0] = pc1[1] = pc1[2] = pc1[3] = *(bias + 1);
+        for (int j = 0; j < k; j++) {
+            for (int c = 0; c < 4; c++) {
+                pc0[c] += pa[0] * pb[c];
+                pc1[c] += pa[1] * pb[c];
+            }
+            pa += 2;  // next k step of the interleaved A rows
+            pb += 4;  // next k step of the 4-column tile
+        }
+        if (fuse_relu) {
+            for (int c = 0; c < 4; c++) {
+                pc0[c] = pc0[c] > 0 ? pc0[c] : 0;
+            }
+            for (int c = 0; c < 4; c++) {
+                pc1[c] = pc1[c] > 0 ? pc1[c] : 0;
+            }
+        }
+        pc0 += 4;
+        pc1 += 4;
+    }
+    if (n2 > 0) {
+        pa = sa;
+        pc0[0] = pc0[1] = *bias;
+        pc1[0] = pc1[1] = *(bias + 1);
+        float *pb0 = pb;        // first tail column, k floats
+        float *pb1 = pb0 + k;   // second tail column, k floats
+        for (int j = 0; j < k; j++) {
+            pc0[0] += pa[0] * pb0[j];
+            pc1[0] += pa[1] * pb0[j];
+            pc0[1] += pa[0] * pb1[j];
+            pc1[1] += pa[1] * pb1[j];
+            pa += 2;
+        }
+        if (fuse_relu) {
+            pc0[0] = pc0[0] > 0 ? pc0[0] : 0;
+            pc0[1] = pc0[1] > 0 ? pc0[1] : 0;
+            pc1[0] = pc1[0] > 0 ? pc1[0] : 0;
+            pc1[1] = pc1[1] > 0 ? pc1[1] : 0;
+        }
+        pc0 += 2;
+        pc1 += 2;
+        pb += 2 * k;
+    }
+    if (n1 > 0) {
+        pa = sa;
+        pc0[0] = *bias;
+        pc1[0] = *(bias + 1);
+        for (int j = 0; j < k; j++) {
+            pc0[0] += pa[0] * pb[j];
+            pc1[0] += pa[1] * pb[j];
+            pa += 2;
+        }
+        if (fuse_relu) {
+            pc0[0] = pc0[0] > 0 ? pc0[0] : 0;
+            pc1[0] = pc1[0] > 0 ? pc1[0] : 0;
+        }
+    }
+#endif  // __riscv_vector
+}
+
+static inline void kernel_m4_f32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias, bool fuse_relu) +{ + float *pa = sa; + float *pb = sb; + float *pc0 = dst; + float *pc1 = pc0 + ldc; + float *pc2 = pc1 + ldc; + float *pc3 = pc2 + ldc; + DECOMPOSE_K + DECOMPOSE_N + +#if __riscv_vector == 128 + if (n4 > 0) { + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" + "flw ft8, (%11)\n\t" + "flw ft9, 4(%11)\n\t" + "flw ft10, 8(%11)\n\t" + "flw ft11, 12(%11)\n\t" + "beqz %12, 1f\n\t" // if fuse_relu == 0 + "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu + + "1:\n\t" // n4 + // start kernel_m4n4 + "vfmv.v.f v24, ft8\n\t" // v24[0..3] = *bias + "vfmv.v.f v25, ft9\n\t" // v25[0..3] = *(bias + 1) + "vfmv.v.f v26, ft10\n\t" // v26[0..3] = *(bias + 2) + "vfmv.v.f v27, ft11\n\t" // v27[0..3] = *(bias + 3) + // "vlw.v v24, (%11)\n\t" // v24[0..3] = bias[0..3] + // "vlw.v v25, (%11)\n\t" // v25[0..3] = bias[0..3] + // "vlw.v v26, (%11)\n\t" // v26[0..3] = bias[0..3] + // "vlw.v v27, (%11)\n\t" // v27[0..3] = bias[0..3] + // "addi %11, %11, 16\n\t" // bias += 4 * 4 + + "mv a1, %0\n\t" // a1 = pa + "mv t0, %6\n\t" // t0 = k8 + + "flw ft0, (a1)\n\t" + "flw ft1, 4(a1)\n\t" + "flw ft2, 8(a1)\n\t" + "flw ft3, 12(a1)\n\t" // pre load pa + + "beqz t0, 3f\n\t" // k8 == 0 ?
+ + "vlw.v v1, (%1)\n\t" // pre load pb + "addi %1, %1, 16\n\t" + + "2:\n\t" + // start subkernel_m4n4k8 + + "vlw.v v2, (%1)\n\t" // load pb + "addi %1, %1, 16\n\t" + "flw ft4, 16(a1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw ft5, 20(a1)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw ft6, 24(a1)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw ft7, 28(a1)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" // 0 + + "vlw.v v3, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, 32(a1)\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "flw ft1, 36(a1)\n\t" + "vfmacc.vf v25, ft5, v2\n\t" + "flw ft2, 40(a1)\n\t" + "vfmacc.vf v26, ft6, v2\n\t" + "flw ft3, 44(a1)\n\t" + "vfmacc.vf v27, ft7, v2\n\t" // 1 + + "vlw.v v4, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 48(a1)\n\t" + "vfmacc.vf v24, ft0, v3\n\t" + "flw ft5, 52(a1)\n\t" + "vfmacc.vf v25, ft1, v3\n\t" + "flw ft6, 56(a1)\n\t" + "vfmacc.vf v26, ft2, v3\n\t" + "flw ft7, 60(a1)\n\t" + "vfmacc.vf v27, ft3, v3\n\t" // 2 + + "vlw.v v5, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, 64(a1)\n\t" + "vfmacc.vf v24, ft4, v4\n\t" + "flw ft1, 68(a1)\n\t" + "vfmacc.vf v25, ft5, v4\n\t" + "flw ft2, 72(a1)\n\t" + "vfmacc.vf v26, ft6, v4\n\t" + "flw ft3, 76(a1)\n\t" + "vfmacc.vf v27, ft7, v4\n\t" // 3 + + "vlw.v v6, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 80(a1)\n\t" + "vfmacc.vf v24, ft0, v5\n\t" + "flw ft5, 84(a1)\n\t" + "vfmacc.vf v25, ft1, v5\n\t" + "flw ft6, 88(a1)\n\t" + "vfmacc.vf v26, ft2, v5\n\t" + "flw ft7, 92(a1)\n\t" + "vfmacc.vf v27, ft3, v5\n\t" // 4 + + "vlw.v v7, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, 96(a1)\n\t" + "vfmacc.vf v24, ft4, v6\n\t" + "flw ft1, 100(a1)\n\t" + "vfmacc.vf v25, ft5, v6\n\t" + "flw ft2, 104(a1)\n\t" + "vfmacc.vf v26, ft6, v6\n\t" + "flw ft3, 108(a1)\n\t" + "vfmacc.vf v27, ft7, v6\n\t" // 5 + + "vlw.v v8, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 112(a1)\n\t" + "vfmacc.vf v24, ft0, v7\n\t" + "flw ft5, 116(a1)\n\t" + "vfmacc.vf v25, ft1, v7\n\t" + "flw ft6, 120(a1)\n\t" + "vfmacc.vf v26, ft2, v7\n\t" + "flw ft7, 
124(a1)\n\t" + "vfmacc.vf v27, ft3, v7\n\t" // 6 + "addi a1, a1, 128\n\t" // += 32 elements, bump pa to next k8 addr + + "vlw.v v1, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, (a1)\n\t" + "vfmacc.vf v24, ft4, v8\n\t" + "flw ft1, 4(a1)\n\t" + "vfmacc.vf v25, ft5, v8\n\t" + "flw ft2, 8(a1)\n\t" + "vfmacc.vf v26, ft6, v8\n\t" + "flw ft3, 12(a1)\n\t" + "vfmacc.vf v27, ft7, v8\n\t" // 7 + + "addi t0, t0, -1\n\t" // k8 -- + "bnez t0, 2b\n\t" + + "addi %1, %1, -16\n\t" // pb -= 4 ********* bump pb to origin addr + // ************ + + "3:\n\t" + "beqz %7, 4f\n\t" // k4 == 0 ? + // start subkernel_m4n4k4 + "vlw.v v1, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 16(a1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw ft5, 20(a1)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw ft6, 24(a1)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw ft7, 28(a1)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" // 0 + + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, 32(a1)\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "flw ft1, 36(a1)\n\t" + "vfmacc.vf v25, ft5, v2\n\t" + "flw ft2, 40(a1)\n\t" + "vfmacc.vf v26, ft6, v2\n\t" + "flw ft3, 44(a1)\n\t" + "vfmacc.vf v27, ft7, v2\n\t" // 1 + + "vlw.v v3, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 48(a1)\n\t" + "vfmacc.vf v24, ft0, v3\n\t" + "flw ft5, 52(a1)\n\t" + "vfmacc.vf v25, ft1, v3\n\t" + "flw ft6, 56(a1)\n\t" + "vfmacc.vf v26, ft2, v3\n\t" + "flw ft7, 60(a1)\n\t" + "vfmacc.vf v27, ft3, v3\n\t" // 2 + "addi a1, a1, 64\n\t" // += 16 elements, bump pa to next k addr + + "vlw.v v4, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, (a1)\n\t" + "vfmacc.vf v24, ft4, v4\n\t" + "flw ft1, 4(a1)\n\t" + "vfmacc.vf v25, ft5, v4\n\t" + "flw ft2, 8(a1)\n\t" + "vfmacc.vf v26, ft6, v4\n\t" + "flw ft3, 12(a1)\n\t" + "vfmacc.vf v27, ft7, v4\n\t" // 3 + + "4:\n\t" + "beqz %8, 5f\n\t" // k2 == 0 ? 
+ // start subkernel_m4n4k2 + + "vlw.v v1, (%1)\n\t" + "addi %1, %1, 16\n\t" + + "flw ft4, 16(a1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw ft5, 20(a1)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw ft6, 24(a1)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw ft7, 28(a1)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" // 0 + "addi a1, a1, 32\n\t" // += 8 elements, bump pa to next k addr + + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, (a1)\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "flw ft1, 4(a1)\n\t" + "vfmacc.vf v25, ft5, v2\n\t" + "flw ft2, 8(a1)\n\t" + "vfmacc.vf v26, ft6, v2\n\t" + "flw ft3, 12(a1)\n\t" + "vfmacc.vf v27, ft7, v2\n\t" // 1 + + "5:\n\t" + "beqz %9, 6f\n\t" // k1 == 0 ? + // start subkernel_m4n4k1 + "vlw.v v1, (%1)\n\t" + "addi %1, %1, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" // 0 + + "6:\n\t" + "beqz %12, 7f\n\t" + // fused relu + "vfmax.vv v24, v24, v0\n\t" // **** relu **** + "vfmax.vv v25, v25, v0\n\t" // **** relu **** + "vfmax.vv v26, v26, v0\n\t" // **** relu **** + "vfmax.vv v27, v27, v0\n\t" // **** relu **** + + "7:\n\t" + // end kernel_m4n4 + "vsw.v v24, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vsw.v v25, (%3)\n\t" + "addi %3, %3, 16\n\t" + "vsw.v v26, (%4)\n\t" + "addi %4, %4, 16\n\t" + "vsw.v v27, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "addi %10, %10, -1\n\t" + "bnez %10, 1b\n\t" + + : "=r"(pa), // %0 + "=r"(pb), // %1 + "=r"(pc0), // %2 + "=r"(pc1), // %3 + "=r"(pc2), // %4 + "=r"(pc3), // %5 + "=r"(k8), // %6 + "=r"(k4), // %7 + "=r"(k2), // %8 + "=r"(k1), // %9 + "=r"(n4), // %10 + "=r"(bias), // %11 + "=r"(fuse_relu) // %12 + : "0"(pa), "1"(pb), "2"(pc0), "3"(pc1), "4"(pc2), "5"(pc3), "6"(k8), "7"(k4), "8"(k2), + "9"(k1), "10"(n4), "11"(bias), "12"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "v26", "v27", + "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", + "ft10", "ft11"); + } + if 
(n2 > 0) { + float *pa = sa; + float *pb0 = pb; + float *pb1 = pb0 + k; + float *pc00 = pc0; + float *pc11 = pc00 + 1; + asm volatile( + "slli t1, %10, 2\n\t" + "vsetvli zero, zero, e32, m1\n\t" + // "flw ft8, (%9)\n\t" + // "flw ft9, 4(%9)\n\t" + // "addi %9, %9, 8\n\t" + + "vlw.v v24, (%9)\n\t" // v24[0..3] = bias[0]..bias[3] + "vlw.v v25, (%9)\n\t" // v25[0..3] = bias[0]..bias[3] + // "vfmv.v.f v24, ft8\n\t" // v24[0..3] = bias[0]; + // "vfmv.v.f v25, ft9\n\t" // v25[0..3] = bias[1]; + + "flw ft0, (%1)\n\t" // pre load pb0 + "flw fa0, (%2)\n\t" // pre load pb1 + + "beqz %11, 0f\n\t" // if fuse_relu == 0 + "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu + + "0:\n\t" + "mv t0, %5\n\t" // t0 = k8 + "beqz t0, 2f\n\t" // k8 == 0 ? + + "1:\n\t" + // start subkernel_m4n2k8 + "vlw.v v1, (%0)\n\t" // load pa + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa1, 4(%2)\n\t" + "vfmacc.vf v25, fa0, v1\n\t" // 0 + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 8(%1)\n\t" + "vfmacc.vf v24, ft1, v2\n\t" + "flw fa0, 8(%2)\n\t" + "vfmacc.vf v25, fa1, v2\n\t" // 1 + + "vlw.v v3, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 12(%1)\n\t" + "vfmacc.vf v24, ft0, v3\n\t" + "flw fa1, 12(%2)\n\t" + "vfmacc.vf v25, fa0, v3\n\t" // 2 + + "vlw.v v4, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 16(%1)\n\t" + "vfmacc.vf v24, ft1, v4\n\t" + "flw fa0, 16(%2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" // 3 + + "vlw.v v5, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 20(%1)\n\t" + "vfmacc.vf v24, ft0, v5\n\t" + "flw fa1, 20(%2)\n\t" + "vfmacc.vf v25, fa0, v5\n\t" // 4 + + "vlw.v v6, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 24(%1)\n\t" + "vfmacc.vf v24, ft1, v6\n\t" + "flw fa0, 24(%2)\n\t" + "vfmacc.vf v25, fa1, v6\n\t" // 5 + + "vlw.v v7, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 28(%1)\n\t" + "vfmacc.vf v24, ft0, v7\n\t" + "flw fa1, 28(%2)\n\t" + "vfmacc.vf v25, fa0, v7\n\t" // 6 + "addi %1, %1, 32\n\t" // += 8 elements, bump pb0 to 
next k8 addr + "addi %2, %2, 32\n\t" // += 8 elements, bump pb1 to next k8 addr + + "vlw.v v8, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v24, ft1, v8\n\t" + "flw fa0, (%2)\n\t" + "vfmacc.vf v25, fa1, v8\n\t" // 7 + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "2:\n\t" + "beqz %6, 3f\n\t" // k4 == 0 ? + // start subkernel_m4n2k4 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa1, 4(%2)\n\t" + "vfmacc.vf v25, fa0, v1\n\t" // 0 + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 8(%1)\n\t" + "vfmacc.vf v24, ft1, v2\n\t" + "flw fa0, 8(%2)\n\t" + "vfmacc.vf v25, fa1, v2\n\t" // 1 + + "vlw.v v3, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 12(%1)\n\t" + "vfmacc.vf v24, ft0, v3\n\t" + "flw fa1, 12(%2)\n\t" + "vfmacc.vf v25, fa0, v3\n\t" // 2 + "addi %1, %1, 16\n\t" // += 4 elements, bump pb0 to next k addr + "addi %2, %2, 16\n\t" // += 4 elements, bump pb1 to next k addr + + "vlw.v v4, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v24, ft1, v4\n\t" + "flw fa0, (%2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" // 3 + + "3:\n\t" + "beqz %7, 4f\n\t" // k2 == 0 ? + // start subkernel_m4n2k2 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa1, 4(%2)\n\t" + "vfmacc.vf v25, fa0, v1\n\t" // 0 + "addi %1, %1, 8\n\t" // += 2 elements, bump pb0 to next k addr + "addi %2, %2, 8\n\t" // += 2 elements, bump pb1 to next k addr + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v24, ft1, v2\n\t" + "flw fa0, (%2)\n\t" + "vfmacc.vf v25, fa1, v2\n\t" // 1 + + "4:\n\t" + "beqz %8, 5f\n\t" // k1 == 0 ? 
+ // start subkernel_m4n2k1 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, fa0, v1\n\t" // 0 + + "5:\n\t" + "beqz %11, 6f\n\t" + // fused relu + "vfmax.vv v24, v24, v0\n\t" // **** relu **** + "vfmax.vv v25, v25, v0\n\t" // **** relu **** + + "6:\n\t" + "vssw.v v24, (%3), t1\n\t" + "vssw.v v25, (%4), t1\n\t" + + : "=r"(pa), // %0 + "=r"(pb0), // %1 + "=r"(pb1), // %2 + "=r"(pc00), // %3 + "=r"(pc11), // %4 + "=r"(k8), // %5 + "=r"(k4), // %6 + "=r"(k2), // %7 + "=r"(k1), // %8 + "=r"(bias), // %9 + "=r"(ldc), // %10 + "=r"(fuse_relu) // %11 + : "0"(pa), "1"(pb0), "2"(pb1), "3"(pc00), "4"(pc11), "5"(k8), "6"(k4), "7"(k2), "8"(k1), + "9"(bias), "10"(ldc), "11"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "t0", "t1", "ft0", + "ft1", "fa0", "fa1"); + pb += 2 * k; + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + if (n1 > 0) { + pa = sa; + float *pc00 = pc0; + asm volatile( + "slli t1, %8, 2\n\t" // t1 = ldc * 4 + "vsetvli zero, zero, e32, m1\n\t" + // "flw ft8, 0(%7)\n\t" + // "vfmv.v.f v16, ft8\n\t" + "vlw.v v16, (%7)\n\t" // v24[0..3] = bias[0]..bias[3] + "flw ft0, (%1)\n\t" // pre load pb + + "beqz %9, 0f\n\t" // if fuse_relu == 0 + "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu + + "0:\n\t" + "beqz %3, 2f\n\t" // k8 == 0 ? 
+ + "1:\n\t" + // start subkernel_m4n1k8 + "vlw.v v1, (%0)\n\t" // load pa + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v16, ft0, v1\n\t" // 0 + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 8(%1)\n\t" + "vfmacc.vf v16, ft1, v2\n\t" // 1 + + "vlw.v v3, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 12(%1)\n\t" + "vfmacc.vf v16, ft0, v3\n\t" // 2 + + "vlw.v v4, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 16(%1)\n\t" + "vfmacc.vf v16, ft1, v4\n\t" // 3 + + "vlw.v v5, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 20(%1)\n\t" + "vfmacc.vf v16, ft0, v5\n\t" // 4 + + "vlw.v v6, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 24(%1)\n\t" + "vfmacc.vf v16, ft1, v6\n\t" // 5 + + "vlw.v v7, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 28(%1)\n\t" + "vfmacc.vf v16, ft0, v7\n\t" // 6 + "addi %1, %1, 32\n\t" // += 8 elements, bump pb to next k8 addr + + "vlw.v v8, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v16, ft1, v8\n\t" // 7 + + "addi %3, %3, -1\n\t" + "bnez %3, 1b\n\t" + + "2:\n\t" + "beqz %4, 3f\n\t" // k4 == 0 ? + // start subkernel_m4n1k4 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v16, ft0, v1\n\t" // 0 + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 8(%1)\n\t" + "vfmacc.vf v16, ft1, v2\n\t" // 1 + + "vlw.v v3, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 12(%1)\n\t" + "vfmacc.vf v16, ft0, v3\n\t" // 2 + "addi %1, %1, 16\n\t" // += 4 elements, bump pb to next k addr + + "vlw.v v4, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v16, ft1, v4\n\t" // 3 + + "3:\n\t" + "beqz %5, 4f\n\t" // k2 == 0 ? + // start subkernel_m4n1k2 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v16, ft0, v1\n\t" // 0 + "addi %1, %1, 8\n\t" // += 2 elements, bump pb to next k addr + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v16, ft1, v2\n\t" // 1 + + "4:\n\t" + "beqz %6, 5f\n\t" // k1 == 0 ? 
+ // start subkernel_m4n2k1 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" // 0 + + "5:\n\t" + "beqz %9, 6f\n\t" + // fused relu + "vfmax.vv v16, v16, v0\n\t" // **** relu **** + + "6:\n\t" + "vssw.v v16, (%2), t1\n\t" + + : "=r"(pa), // %0 + "=r"(pb), // %1 + "=r"(pc00), // %2 + "=r"(k8), // %3 + "=r"(k4), // %4 + "=r"(k2), // %5 + "=r"(k1), // %6 + "=r"(bias), // %7 + "=r"(ldc), // %8 + "=r"(fuse_relu) // %9 + : "0"(pa), "1"(pb), "2"(pc00), "3"(k8), "4"(k4), "5"(k2), "6"(k1), "7"(bias), "8"(ldc), + "9"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "t0", "t1", "ft0", + "ft1"); + } +#else + for (int i = 0; i < n4; i++) { + pa = sa; + pc0[0] = pc0[1] = pc0[2] = pc0[3] = *bias; + pc1[0] = pc1[1] = pc1[2] = pc1[3] = *(bias + 1); + pc2[0] = pc2[1] = pc2[2] = pc2[3] = *(bias + 2); + pc3[0] = pc3[1] = pc3[2] = pc3[3] = *(bias + 3); + int j = 0; + for (; j + 7 < k; j += 8) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc2[1] += pa[2] * pb[1]; + pc3[1] += pa[3] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc2[2] += pa[2] * pb[2]; + pc3[2] += pa[3] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + pc2[3] += pa[2] * pb[3]; + pc3[3] += pa[3] * pb[3]; + + pc0[0] += pa[4] * pb[4]; + pc1[0] += pa[5] * pb[4]; + pc2[0] += pa[6] * pb[4]; + pc3[0] += pa[7] * pb[4]; + pc0[1] += pa[4] * pb[5]; + pc1[1] += pa[5] * pb[5]; + pc2[1] += pa[6] * pb[5]; + pc3[1] += pa[7] * pb[5]; + pc0[2] += pa[4] * pb[6]; + pc1[2] += pa[5] * pb[6]; + pc2[2] += pa[6] * pb[6]; + pc3[2] += pa[7] * pb[6]; + pc0[3] += pa[4] * pb[7]; + pc1[3] += pa[5] * pb[7]; + pc2[3] += pa[6] * pb[7]; + pc3[3] += pa[7] * pb[7]; + + pc0[0] += pa[8] * pb[8]; + pc1[0] += pa[9] * pb[8]; + pc2[0] += pa[10] * pb[8]; + pc3[0] += pa[11] * pb[8]; + pc0[1] += pa[8] * pb[9]; + pc1[1] += pa[9] * pb[9]; + pc2[1] += pa[10] * 
pb[9]; + pc3[1] += pa[11] * pb[9]; + pc0[2] += pa[8] * pb[10]; + pc1[2] += pa[9] * pb[10]; + pc2[2] += pa[10] * pb[10]; + pc3[2] += pa[11] * pb[10]; + pc0[3] += pa[8] * pb[11]; + pc1[3] += pa[9] * pb[11]; + pc2[3] += pa[10] * pb[11]; + pc3[3] += pa[11] * pb[11]; + + pc0[0] += pa[12] * pb[12]; + pc1[0] += pa[13] * pb[12]; + pc2[0] += pa[14] * pb[12]; + pc3[0] += pa[15] * pb[12]; + pc0[1] += pa[12] * pb[13]; + pc1[1] += pa[13] * pb[13]; + pc2[1] += pa[14] * pb[13]; + pc3[1] += pa[15] * pb[13]; + pc0[2] += pa[12] * pb[14]; + pc1[2] += pa[13] * pb[14]; + pc2[2] += pa[14] * pb[14]; + pc3[2] += pa[15] * pb[14]; + pc0[3] += pa[12] * pb[15]; + pc1[3] += pa[13] * pb[15]; + pc2[3] += pa[14] * pb[15]; + pc3[3] += pa[15] * pb[15]; + + pc0[0] += pa[16] * pb[16]; + pc1[0] += pa[17] * pb[16]; + pc2[0] += pa[18] * pb[16]; + pc3[0] += pa[19] * pb[16]; + pc0[1] += pa[16] * pb[17]; + pc1[1] += pa[17] * pb[17]; + pc2[1] += pa[18] * pb[17]; + pc3[1] += pa[19] * pb[17]; + pc0[2] += pa[16] * pb[18]; + pc1[2] += pa[17] * pb[18]; + pc2[2] += pa[18] * pb[18]; + pc3[2] += pa[19] * pb[18]; + pc0[3] += pa[16] * pb[19]; + pc1[3] += pa[17] * pb[19]; + pc2[3] += pa[18] * pb[19]; + pc3[3] += pa[19] * pb[19]; + + pc0[0] += pa[20] * pb[20]; + pc1[0] += pa[21] * pb[20]; + pc2[0] += pa[22] * pb[20]; + pc3[0] += pa[23] * pb[20]; + pc0[1] += pa[20] * pb[21]; + pc1[1] += pa[21] * pb[21]; + pc2[1] += pa[22] * pb[21]; + pc3[1] += pa[23] * pb[21]; + pc0[2] += pa[20] * pb[22]; + pc1[2] += pa[21] * pb[22]; + pc2[2] += pa[22] * pb[22]; + pc3[2] += pa[23] * pb[22]; + pc0[3] += pa[20] * pb[23]; + pc1[3] += pa[21] * pb[23]; + pc2[3] += pa[22] * pb[23]; + pc3[3] += pa[23] * pb[23]; + + pc0[0] += pa[24] * pb[24]; + pc1[0] += pa[25] * pb[24]; + pc2[0] += pa[26] * pb[24]; + pc3[0] += pa[27] * pb[24]; + pc0[1] += pa[24] * pb[25]; + pc1[1] += pa[25] * pb[25]; + pc2[1] += pa[26] * pb[25]; + pc3[1] += pa[27] * pb[25]; + pc0[2] += pa[24] * pb[26]; + pc1[2] += pa[25] * pb[26]; + pc2[2] += pa[26] * pb[26]; + pc3[2] += 
pa[27] * pb[26]; + pc0[3] += pa[24] * pb[27]; + pc1[3] += pa[25] * pb[27]; + pc2[3] += pa[26] * pb[27]; + pc3[3] += pa[27] * pb[27]; + + pc0[0] += pa[28] * pb[28]; + pc1[0] += pa[29] * pb[28]; + pc2[0] += pa[30] * pb[28]; + pc3[0] += pa[31] * pb[28]; + pc0[1] += pa[28] * pb[29]; + pc1[1] += pa[29] * pb[29]; + pc2[1] += pa[30] * pb[29]; + pc3[1] += pa[31] * pb[29]; + pc0[2] += pa[28] * pb[30]; + pc1[2] += pa[29] * pb[30]; + pc2[2] += pa[30] * pb[30]; + pc3[2] += pa[31] * pb[30]; + pc0[3] += pa[28] * pb[31]; + pc1[3] += pa[29] * pb[31]; + pc2[3] += pa[30] * pb[31]; + pc3[3] += pa[31] * pb[31]; + + pa += 32; + pb += 32; + } + if (j + 3 < k) { + j += 4; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc2[1] += pa[2] * pb[1]; + pc3[1] += pa[3] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc2[2] += pa[2] * pb[2]; + pc3[2] += pa[3] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + pc2[3] += pa[2] * pb[3]; + pc3[3] += pa[3] * pb[3]; + + pc0[0] += pa[4] * pb[4]; + pc1[0] += pa[5] * pb[4]; + pc2[0] += pa[6] * pb[4]; + pc3[0] += pa[7] * pb[4]; + pc0[1] += pa[4] * pb[5]; + pc1[1] += pa[5] * pb[5]; + pc2[1] += pa[6] * pb[5]; + pc3[1] += pa[7] * pb[5]; + pc0[2] += pa[4] * pb[6]; + pc1[2] += pa[5] * pb[6]; + pc2[2] += pa[6] * pb[6]; + pc3[2] += pa[7] * pb[6]; + pc0[3] += pa[4] * pb[7]; + pc1[3] += pa[5] * pb[7]; + pc2[3] += pa[6] * pb[7]; + pc3[3] += pa[7] * pb[7]; + + pc0[0] += pa[8] * pb[8]; + pc1[0] += pa[9] * pb[8]; + pc2[0] += pa[10] * pb[8]; + pc3[0] += pa[11] * pb[8]; + pc0[1] += pa[8] * pb[9]; + pc1[1] += pa[9] * pb[9]; + pc2[1] += pa[10] * pb[9]; + pc3[1] += pa[11] * pb[9]; + pc0[2] += pa[8] * pb[10]; + pc1[2] += pa[9] * pb[10]; + pc2[2] += pa[10] * pb[10]; + pc3[2] += pa[11] * pb[10]; + pc0[3] += pa[8] * pb[11]; + pc1[3] += pa[9] * pb[11]; + pc2[3] += pa[10] * pb[11]; + pc3[3] += pa[11] * pb[11]; + + pc0[0] += pa[12] * 
pb[12]; + pc1[0] += pa[13] * pb[12]; + pc2[0] += pa[14] * pb[12]; + pc3[0] += pa[15] * pb[12]; + pc0[1] += pa[12] * pb[13]; + pc1[1] += pa[13] * pb[13]; + pc2[1] += pa[14] * pb[13]; + pc3[1] += pa[15] * pb[13]; + pc0[2] += pa[12] * pb[14]; + pc1[2] += pa[13] * pb[14]; + pc2[2] += pa[14] * pb[14]; + pc3[2] += pa[15] * pb[14]; + pc0[3] += pa[12] * pb[15]; + pc1[3] += pa[13] * pb[15]; + pc2[3] += pa[14] * pb[15]; + pc3[3] += pa[15] * pb[15]; + + pa += 16; + pb += 16; + } + if (j + 1 < k) { + j += 2; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc2[1] += pa[2] * pb[1]; + pc3[1] += pa[3] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc2[2] += pa[2] * pb[2]; + pc3[2] += pa[3] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + pc2[3] += pa[2] * pb[3]; + pc3[3] += pa[3] * pb[3]; + + pc0[0] += pa[4] * pb[4]; + pc1[0] += pa[5] * pb[4]; + pc2[0] += pa[6] * pb[4]; + pc3[0] += pa[7] * pb[4]; + pc0[1] += pa[4] * pb[5]; + pc1[1] += pa[5] * pb[5]; + pc2[1] += pa[6] * pb[5]; + pc3[1] += pa[7] * pb[5]; + pc0[2] += pa[4] * pb[6]; + pc1[2] += pa[5] * pb[6]; + pc2[2] += pa[6] * pb[6]; + pc3[2] += pa[7] * pb[6]; + pc0[3] += pa[4] * pb[7]; + pc1[3] += pa[5] * pb[7]; + pc2[3] += pa[6] * pb[7]; + pc3[3] += pa[7] * pb[7]; + + pa += 8; + pb += 8; + } + if (j < k) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc2[1] += pa[2] * pb[1]; + pc3[1] += pa[3] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc2[2] += pa[2] * pb[2]; + pc3[2] += pa[3] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + pc2[3] += pa[2] * pb[3]; + pc3[3] += pa[3] * pb[3]; + + pa += 4; + pb += 4; + } + if (fuse_relu) { + pc0[0] = pc0[0] > 0 ? pc0[0] : 0; + pc0[1] = pc0[1] > 0 ? pc0[1] : 0; + pc0[2] = pc0[2] > 0 ? 
pc0[2] : 0; + pc0[3] = pc0[3] > 0 ? pc0[3] : 0; + + pc1[0] = pc1[0] > 0 ? pc1[0] : 0; + pc1[1] = pc1[1] > 0 ? pc1[1] : 0; + pc1[2] = pc1[2] > 0 ? pc1[2] : 0; + pc1[3] = pc1[3] > 0 ? pc1[3] : 0; + + pc2[0] = pc2[0] > 0 ? pc2[0] : 0; + pc2[1] = pc2[1] > 0 ? pc2[1] : 0; + pc2[2] = pc2[2] > 0 ? pc2[2] : 0; + pc2[3] = pc2[3] > 0 ? pc2[3] : 0; + + pc3[0] = pc3[0] > 0 ? pc3[0] : 0; + pc3[1] = pc3[1] > 0 ? pc3[1] : 0; + pc3[2] = pc3[2] > 0 ? pc3[2] : 0; + pc3[3] = pc3[3] > 0 ? pc3[3] : 0; + } + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + } + if (n2 > 0) { + pa = sa; + pc0[0] = pc0[1] = *bias; + pc1[0] = pc1[1] = *(bias + 1); + pc2[0] = pc2[1] = *(bias + 2); + pc3[0] = pc3[1] = *(bias + 3); + float *pb0 = pb; + float *pb1 = pb0 + k; + int j = 0; + for (; j + 7 < k; j += 8) { + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc2[0] += pa[2] * pb0[0]; + pc3[0] += pa[3] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + pc2[1] += pa[2] * pb1[0]; + pc3[1] += pa[3] * pb1[0]; + + pc0[0] += pa[4] * pb0[1]; + pc1[0] += pa[5] * pb0[1]; + pc2[0] += pa[6] * pb0[1]; + pc3[0] += pa[7] * pb0[1]; + pc0[1] += pa[4] * pb1[1]; + pc1[1] += pa[5] * pb1[1]; + pc2[1] += pa[6] * pb1[1]; + pc3[1] += pa[7] * pb1[1]; + + pc0[0] += pa[8] * pb0[2]; + pc1[0] += pa[9] * pb0[2]; + pc2[0] += pa[10] * pb0[2]; + pc3[0] += pa[11] * pb0[2]; + pc0[1] += pa[8] * pb1[2]; + pc1[1] += pa[9] * pb1[2]; + pc2[1] += pa[10] * pb1[2]; + pc3[1] += pa[11] * pb1[2]; + + pc0[0] += pa[12] * pb0[3]; + pc1[0] += pa[13] * pb0[3]; + pc2[0] += pa[14] * pb0[3]; + pc3[0] += pa[15] * pb0[3]; + pc0[1] += pa[12] * pb1[3]; + pc1[1] += pa[13] * pb1[3]; + pc2[1] += pa[14] * pb1[3]; + pc3[1] += pa[15] * pb1[3]; + + pc0[0] += pa[16] * pb0[4]; + pc1[0] += pa[17] * pb0[4]; + pc2[0] += pa[18] * pb0[4]; + pc3[0] += pa[19] * pb0[4]; + pc0[1] += pa[16] * pb1[4]; + pc1[1] += pa[17] * pb1[4]; + pc2[1] += pa[18] * pb1[4]; + pc3[1] += pa[19] * pb1[4]; + + pc0[0] += pa[20] * pb0[5]; + pc1[0] += pa[21] * pb0[5]; + pc2[0] 
+= pa[22] * pb0[5]; + pc3[0] += pa[23] * pb0[5]; + pc0[1] += pa[20] * pb1[5]; + pc1[1] += pa[21] * pb1[5]; + pc2[1] += pa[22] * pb1[5]; + pc3[1] += pa[23] * pb1[5]; + + pc0[0] += pa[24] * pb0[6]; + pc1[0] += pa[25] * pb0[6]; + pc2[0] += pa[26] * pb0[6]; + pc3[0] += pa[27] * pb0[6]; + pc0[1] += pa[24] * pb1[6]; + pc1[1] += pa[25] * pb1[6]; + pc2[1] += pa[26] * pb1[6]; + pc3[1] += pa[27] * pb1[6]; + + pc0[0] += pa[28] * pb0[7]; + pc1[0] += pa[29] * pb0[7]; + pc2[0] += pa[30] * pb0[7]; + pc3[0] += pa[31] * pb0[7]; + pc0[1] += pa[28] * pb1[7]; + pc1[1] += pa[29] * pb1[7]; + pc2[1] += pa[30] * pb1[7]; + pc3[1] += pa[31] * pb1[7]; + + pa += 32; + pb0 += 8; + pb1 += 8; + } + if (j + 3 < k) { + j += 4; + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc2[0] += pa[2] * pb0[0]; + pc3[0] += pa[3] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + pc2[1] += pa[2] * pb1[0]; + pc3[1] += pa[3] * pb1[0]; + + pc0[0] += pa[4] * pb0[1]; + pc1[0] += pa[5] * pb0[1]; + pc2[0] += pa[6] * pb0[1]; + pc3[0] += pa[7] * pb0[1]; + pc0[1] += pa[4] * pb1[1]; + pc1[1] += pa[5] * pb1[1]; + pc2[1] += pa[6] * pb1[1]; + pc3[1] += pa[7] * pb1[1]; + + pc0[0] += pa[8] * pb0[2]; + pc1[0] += pa[9] * pb0[2]; + pc2[0] += pa[10] * pb0[2]; + pc3[0] += pa[11] * pb0[2]; + pc0[1] += pa[8] * pb1[2]; + pc1[1] += pa[9] * pb1[2]; + pc2[1] += pa[10] * pb1[2]; + pc3[1] += pa[11] * pb1[2]; + + pc0[0] += pa[12] * pb0[3]; + pc1[0] += pa[13] * pb0[3]; + pc2[0] += pa[14] * pb0[3]; + pc3[0] += pa[15] * pb0[3]; + pc0[1] += pa[12] * pb1[3]; + pc1[1] += pa[13] * pb1[3]; + pc2[1] += pa[14] * pb1[3]; + pc3[1] += pa[15] * pb1[3]; + + pa += 16; + pb0 += 4; + pb1 += 4; + } + if (j + 1 < k) { + j += 2; + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc2[0] += pa[2] * pb0[0]; + pc3[0] += pa[3] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + pc2[1] += pa[2] * pb1[0]; + pc3[1] += pa[3] * pb1[0]; + + pc0[0] += pa[4] * pb0[1]; + pc1[0] += pa[5] * pb0[1]; + pc2[0] += pa[6] * pb0[1]; 
+ pc3[0] += pa[7] * pb0[1]; + pc0[1] += pa[4] * pb1[1]; + pc1[1] += pa[5] * pb1[1]; + pc2[1] += pa[6] * pb1[1]; + pc3[1] += pa[7] * pb1[1]; + + pa += 8; + pb0 += 2; + pb1 += 2; + } + if (j < k) { + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc2[0] += pa[2] * pb0[0]; + pc3[0] += pa[3] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + pc2[1] += pa[2] * pb1[0]; + pc3[1] += pa[3] * pb1[0]; + + pa += 4; + pb0 += 1; + pb1 += 1; + } + if (fuse_relu) { + pc0[0] = pc0[0] > 0 ? pc0[0] : 0; + pc0[1] = pc0[1] > 0 ? pc0[1] : 0; + + pc1[0] = pc1[0] > 0 ? pc1[0] : 0; + pc1[1] = pc1[1] > 0 ? pc1[1] : 0; + + pc2[0] = pc2[0] > 0 ? pc2[0] : 0; + pc2[1] = pc2[1] > 0 ? pc2[1] : 0; + + pc3[0] = pc3[0] > 0 ? pc3[0] : 0; + pc3[1] = pc3[1] > 0 ? pc3[1] : 0; + } + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + pb += 2 * k; + } + if (n1 > 0) { + pa = sa; + pc0[0] = *bias; + pc1[0] = *(bias + 1); + pc2[0] = *(bias + 2); + pc3[0] = *(bias + 3); + int j = 0; + for (; j + 7 < k; j += 8) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + + pc0[0] += pa[4] * pb[1]; + pc1[0] += pa[5] * pb[1]; + pc2[0] += pa[6] * pb[1]; + pc3[0] += pa[7] * pb[1]; + + pc0[0] += pa[8] * pb[2]; + pc1[0] += pa[9] * pb[2]; + pc2[0] += pa[10] * pb[2]; + pc3[0] += pa[11] * pb[2]; + + pc0[0] += pa[12] * pb[3]; + pc1[0] += pa[13] * pb[3]; + pc2[0] += pa[14] * pb[3]; + pc3[0] += pa[15] * pb[3]; + + pc0[0] += pa[16] * pb[4]; + pc1[0] += pa[17] * pb[4]; + pc2[0] += pa[18] * pb[4]; + pc3[0] += pa[19] * pb[4]; + + pc0[0] += pa[20] * pb[5]; + pc1[0] += pa[21] * pb[5]; + pc2[0] += pa[22] * pb[5]; + pc3[0] += pa[23] * pb[5]; + + pc0[0] += pa[24] * pb[6]; + pc1[0] += pa[25] * pb[6]; + pc2[0] += pa[26] * pb[6]; + pc3[0] += pa[27] * pb[6]; + + pc0[0] += pa[28] * pb[7]; + pc1[0] += pa[29] * pb[7]; + pc2[0] += pa[30] * pb[7]; + pc3[0] += pa[31] * pb[7]; + + pa += 32; + pb += 8; + } + if (j + 3 < k) { + j += 4; + pc0[0] += pa[0] * pb[0]; + pc1[0] 
+= pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + + pc0[0] += pa[4] * pb[1]; + pc1[0] += pa[5] * pb[1]; + pc2[0] += pa[6] * pb[1]; + pc3[0] += pa[7] * pb[1]; + + pc0[0] += pa[8] * pb[2]; + pc1[0] += pa[9] * pb[2]; + pc2[0] += pa[10] * pb[2]; + pc3[0] += pa[11] * pb[2]; + + pc0[0] += pa[12] * pb[3]; + pc1[0] += pa[13] * pb[3]; + pc2[0] += pa[14] * pb[3]; + pc3[0] += pa[15] * pb[3]; + + pa += 16; + pb += 4; + } + if (j + 1 < k) { + j += 2; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + + pc0[0] += pa[4] * pb[1]; + pc1[0] += pa[5] * pb[1]; + pc2[0] += pa[6] * pb[1]; + pc3[0] += pa[7] * pb[1]; + + pa += 8; + pb += 2; + } + if (j < k) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + + pa += 4; + pb += 1; + } + if (fuse_relu) { + pc0[0] = pc0[0] > 0 ? pc0[0] : 0; + + pc1[0] = pc1[0] > 0 ? pc1[0] : 0; + + pc2[0] = pc2[0] > 0 ? pc2[0] : 0; + + pc3[0] = pc3[0] > 0 ? 
pc3[0] : 0;
+        }
+        pc0 += 1;
+        pc1 += 1;
+        pc2 += 1;
+        pc3 += 1;
+    }
+#endif  // __riscv_vector
+}
+
+/*
+ * GEMM micro-kernel for one tile of 4 output rows, written as C906 vector inline assembly.
+ *
+ *   dst:       first of the 4 output rows; consecutive rows are ldc floats apart
+ *   sa:        packed kernel panel, consumed 4 floats per k step (one value per output row)
+ *   sb:        packed input. Blocks of 4 columns are consumed 4 floats per k step; the trailing
+ *              2-column block and the final single column are read as runs of k floats per column
+ *   m:         unused here, the tile height is fixed at 4
+ *   k:         reduction length. The inner loops test the counter after the first iteration,
+ *              so k must be > 0
+ *   n:         number of output columns
+ *   ldc:       row stride of dst, in floats
+ *   bias:      4 floats, one per output row, used to initialise the accumulators
+ *   fuse_relu: when true, negative results are replaced by 0 after the tile has been stored
+ */
+static inline void kernel_m4_f32_1(float *dst, float *sa, float *sb, int m, int k, int n, int ldc,
+                                   float *bias, bool fuse_relu)
+{
+    // Every asm operand below is declared in/out, so keep plain copies for the C relu pass.
+    float *const tile = dst;
+    const int cols = n;
+    const int stride = ldc;
+
+    asm volatile(
+        // e32/m1 with vl = VLMAX; the 16-byte pointer bumps below assume 4 fp32 lanes
+        // (VLEN = 128 -- TODO confirm for other targets)
+        "vsetvli zero, zero, e32, m1\n\t"
+
+        "flw fs0, 0(%2)\n\t"
+        "flw fs1, 4(%2)\n\t"
+        "flw fs2, 8(%2)\n\t"
+        "flw fs3, 12(%2)\n\t"  // fs0-fs3 = bias of output rows 0-3
+
+        // init output addr
+        "slli t5, %6, 2\n\t"  // t5_tmp = ldc * 4 (row stride in bytes)
+        "mv a0, %3\n\t"
+        "add a1, a0, t5\n\t"
+        "add a2, a1, t5\n\t"
+        "add a3, a2, t5\n\t"  // a0-a3 hold the 4 output row addrs
+
+        "srai t0, %5, 2\n\t"  // t0 = n >> 2 (n4)
+        "beqz t0, 4f\n\t"
+
+        "1:\n\t"  // m4n4
+        // start kernel_m4n4
+        "vfmv.v.f v24, fs0\n\t"
+        "vfmv.v.f v25, fs1\n\t"
+        "vfmv.v.f v26, fs2\n\t"
+        "vfmv.v.f v27, fs3\n\t"  // init acc = bias (one vector per output row)
+
+        "mv t6, %0\n\t"  // t6 = start of the packed kernel (pa) for these 4 rows
+        "mv t5, %4\n\t"  // t5 = k (k > 0)
+
+        "2:\n\t"
+        // start subkernel_m4n4k1
+        "vle.v v1, (%1)\n\t"  // 4 input values (one per column) of this k step
+        "addi %1, %1, 16\n\t"
+        "flw fa0, 0(t6)\n\t"
+        "flw fa1, 4(t6)\n\t"
+        "flw fa2, 8(t6)\n\t"
+        "flw fa3, 12(t6)\n\t"  // 4 kernel values (one per row) of this k step
+        "addi t6, t6, 16\n\t"
+
+        "vfmacc.vf v24, fa0, v1\n\t"
+        "vfmacc.vf v25, fa1, v1\n\t"
+        "vfmacc.vf v26, fa2, v1\n\t"
+        "vfmacc.vf v27, fa3, v1\n\t"
+
+        "addi t5, t5, -1\n\t"
+        "bnez t5, 2b\n\t"
+
+        "3:\n\t"  // end kernel_m4n4
+
+        "vse.v v24, (a0)\n\t"
+        "addi a0, a0, 16\n\t"
+        "vse.v v25, (a1)\n\t"
+        "addi a1, a1, 16\n\t"
+        "vse.v v26, (a2)\n\t"
+        "addi a2, a2, 16\n\t"
+        "vse.v v27, (a3)\n\t"
+        "addi a3, a3, 16\n\t"  // each output row advances by 4 columns
+
+        "addi t0, t0, -1\n\t"
+        "bnez t0, 1b\n\t"
+
+        "4:\n\t"              // m4n2
+        "andi t0, %5, 3\n\t"  // n & 3
+        "srai t0, t0, 1\n\t"  // (n & 3) >> 1
+        "beqz t0, 7f\n\t"     // jump to m4n1
+        // start kernel_m4n2
+        "vle.v v24, (%2)\n\t"
+        "vle.v v25, (%2)\n\t"  // init acc = bias (one vector per output column, lanes = rows)
+
+        // init addr for pa, pb and pc
+        "slli t0, %4, 2\n\t"  // t0_tmp = k * 4
+
+        "mv t6, %0\n\t"  // t6 = start of the packed kernel (pa) for these 4 rows
+
+        "mv a4, %1\n\t"
+        "add a5, a4, t0\n\t"  // a4-a5 hold pb(input) 2 cols addr
+
+        "addi a1, a0, 4\n\t"  // a0-a1 hold pc(output) addr of the 2 columns
+
+        "mv t5, %4\n\t"  // t5 = k
+
+        "5:\n\t"
+        // start subkernel_m4n2k1
+        "vle.v v1, (t6)\n\t"  // 4 kernel values (one per row) of this k step
+        "addi t6, t6, 16\n\t"
+        "flw fa0, 0(a4)\n\t"
+        "vfmacc.vf v24, fa0, v1\n\t"
+        "flw fa1, 0(a5)\n\t"
+        "vfmacc.vf v25, fa1, v1\n\t"
+
+        "addi a4, a4, 4\n\t"
+        "addi a5, a5, 4\n\t"
+
+        "addi t5, t5, -1\n\t"
+        "bnez t5, 5b\n\t"
+
+        "6:\n\t"              // end kernel_m4n2
+        "slli t0, %6, 2\n\t"  // t0_tmp = ldc * 4 (store_stride)
+
+        "vsse.v v24, (a0), t0\n\t"
+        "vsse.v v25, (a1), t0\n\t"  // strided store: lane i goes to output row i
+
+        "addi a0, a0, 8\n\t"  // update output start addr ( +2 cols)
+        "slli t0, %4, 3\n\t"  // t0_tmp = k * 2 * 4
+        "add %1, %1, t0\n\t"  // update pb start addr
+
+        "7:\n\t"              // m4n1
+        "andi t0, %5, 1\n\t"  // n & 1
+        "beqz t0, 10f\n\t"    // jump to ending
+        // start kernel_m4n1
+
+        "vle.v v24, (%2)\n\t"  // init out_tmp = bias
+
+        // init addr for pa, pb and pc
+        "mv t6, %0\n\t"  // t6 = start of the packed kernel (pa) for these 4 rows
+        "mv a4, %1\n\t"  // a4 hold pb(input) 1 col addr
+                         // a0 hold pc(output) addr
+
+        "mv t5, %4\n\t"  // t5 = k
+
+        "8:\n\t"
+        // start subkernel_m4n1k1
+        "vle.v v1, (t6)\n\t"
+        "addi t6, t6, 16\n\t"
+        "flw fa0, 0(a4)\n\t"
+        "vfmacc.vf v24, fa0, v1\n\t"  // 0
+
+        "addi a4, a4, 4\n\t"
+
+        "addi t5, t5, -1\n\t"
+        "bnez t5, 8b\n\t"
+
+        "9:\n\t"              // end kernel_m4n1
+        "slli t0, %6, 2\n\t"  // t0_tmp = ldc * 4 (store_stride)
+
+        "vsse.v v24, (a0), t0\n\t"
+
+        "10:\n\t"  // ending
+
+        : "=r"(sa),    // %0
+          "=r"(sb),    // %1
+          "=r"(bias),  // %2
+          "=r"(dst),   // %3
+          "=r"(k),     // %4
+          "=r"(n),     // %5
+          "=r"(ldc)    // %6
+        : "0"(sa), "1"(sb), "2"(bias), "3"(dst), "4"(k), "5"(n), "6"(ldc)
+        // "memory": the asm reads sa/sb/bias and writes the dst tile behind the compiler's back
+        : "v1", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "a0", "a1", "a2", "a3",
+          "a4", "a5", "a6", "a7", "t0", "t5", "t6", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6",
+          "fa7", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "memory");
+
+    if (fuse_relu) {
+        // The assembly never looks at fuse_relu, so clamp the freshly stored 4 x n tile here.
+        for (int row = 0; row < 4; row++) {
+            float *out = tile + row * stride;
+            for (int col = 0; col < cols; col++) {
+                out[col] = out[col] > 0 ? out[col] : 0;
+            }
+        }
+    }
+}
+
+/*
+ * fp32 GEMM driver: dst = sa * sb + bias, optionally followed by relu.
+ *
+ * The m output rows are processed in tiles of 4 by kernel_m4_f32_1; the remaining 0-3 rows are
+ * handed to kernel_m2_f32 and/or kernel_m1_f32.
+ *
+ *   dst:       output, row i starts at dst + i * ldc
+ *   sa:        packed kernel, the data of output row i starts at sa + i * k
+ *   sb:        packed input, shared by every row tile
+ *   m, k, n:   GEMM sizes (output rows, reduction length, output columns)
+ *   ldc:       row stride of dst, in floats
+ *   bias:      m floats, or NULL for "no bias" (a temporary all-zero bias is used instead)
+ *   fuse_relu: forwarded to the row kernels
+ */
+void shl_c906_sgemm_kernel_f32(float *dst, const float *sa, const float *sb, int m, int k, int n,
+                               int ldc, float *bias, bool fuse_relu)
+{
+    float *pa = (float *)sa;
+    float *pb = (float *)sb;
+    float *pc = dst;
+
+    bool flag_bias = 1;  // default: conv2d layer include bias
+    if (bias == NULL) {
+        flag_bias = 0;
+        bias = (float *)shl_mem_alloc(m * sizeof(float));
+        if (bias == NULL) {
+            // Presumably an allocation failure (or m == 0): the kernels below would
+            // dereference the pointer, so there is nothing safe left to do.
+            return;
+        }
+        // Do not rely on the allocator handing back zero-filled memory.
+        for (int i = 0; i < m; i++) {
+            bias[i] = 0.0f;
+        }
+    }
+    float *bias_tmp = bias;
+
+    const int mm = (m >> 2) << 2;  // m rounded down to a multiple of 4
+
+    for (int i = 0; i < mm; i += 4) {
+        kernel_m4_f32_1(pc + i * ldc, pa + i * k, pb, m, k, n, ldc, bias_tmp + i, fuse_relu);
+    }
+
+    // Skip what the 4-row tiles consumed; 0-3 rows are left.
+    pa += mm * k;
+    pc += mm * ldc;
+    bias_tmp += mm;
+
+    switch (m - mm) {
+        case 3:
+            kernel_m2_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu);
+            pc += 2 * ldc;
+            pa += 2 * k;
+            bias_tmp += 2;
+            kernel_m1_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu);
+            break;
+        case 2:
+            kernel_m2_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu);
+            break;
+        case 1:
+            kernel_m1_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu);
+            break;
+        case 0:
+            break;
+        default:
+            break;
+    }
+    if (!flag_bias) {
+        // Release the temporary zero bias allocated above.
+        shl_mem_free(bias);
+        bias = NULL;
+    }
+}
diff --git a/source/c906_opt/gemv_fp16.c b/source/c906_opt/gemv_fp16.c
index 3e841756..08db3f94 100644
--- a/source/c906_opt/gemv_fp16.c
+++ b/source/c906_opt/gemv_fp16.c
@@ -16,15 +16,15 @@
  * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* change memory layout for matrix [k * n] by Z shape Z length: 8 */ -void csi_c906_reorder_matrix_z8_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx) +void shl_c906_reorder_matrix_z8_fp16(__fp16* src, __fp16* dst, int k, int n, int ldx) { asm volatile( "vsetvli zero, zero, e16, m1\n\t" // set vl = 8 @@ -106,7 +106,7 @@ void csi_c906_reorder_matrix_z8_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ); } -void csi_c906_reorder_matrix_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx) +void shl_c906_reorder_matrix_z16_fp16(__fp16* src, __fp16* dst, int k, int n, int ldx) { asm volatile( "vsetvli zero, zero, e16, m2\n\t" // set vl = 8 @@ -191,24 +191,22 @@ void csi_c906_reorder_matrix_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, in vector: 1 x k matrix: n x k */ -void csi_c906_gemv_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, int ldc, __fp16* bias) +void shl_c906_gemv_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, + int ldc, __fp16* bias) { - - } -void csi_c906_gemv_pack16_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, int ldc, __fp16* bias) +void shl_c906_gemv_pack16_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, + int ldc, __fp16* bias) { - - } - /* vector: 1 x k matrix: k x n */ -void csi_c906_gemv_trans_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, int ldc, __fp16* bias) +void shl_c906_gemv_trans_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, + int ldc, __fp16* bias) { asm volatile( "vsetvli zero, zero, e16, m1\n\t" // set vl = 8 @@ -311,8 +309,8 @@ void csi_c906_gemv_trans_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* } - -void csi_c906_gemv_trans_pack16_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, int ldc, __fp16* bias) +void 
shl_c906_gemv_trans_pack16_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, + int ldc, __fp16* bias) { asm volatile( "vsetvli zero, zero, e16, m2\n\t" // set vl = 8 diff --git a/source/c906_opt/sgemv.c b/source/c906_opt/gemv_fp32.c similarity index 92% rename from source/c906_opt/sgemv.c rename to source/c906_opt/gemv_fp32.c index 58ce5258..c525a13a 100644 --- a/source/c906_opt/sgemv.c +++ b/source/c906_opt/gemv_fp32.c @@ -16,7 +16,6 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" diff --git a/source/c906_opt/global_avgpool.c b/source/c906_opt/global_avgpool.c index 3ebd93c9..5544b75c 100644 --- a/source/c906_opt/global_avgpool.c +++ b/source/c906_opt/global_avgpool.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_global_avgpool2d_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_global_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -86,10 +85,8 @@ int csi_c906_global_avgpool2d_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_global_avgpool2d_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; diff --git a/source/c906_opt/global_maxpool.c b/source/c906_opt/global_maxpool.c index cba7d763..9dac20b4 100644 --- a/source/c906_opt/global_maxpool.c +++ b/source/c906_opt/global_maxpool.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_global_maxpool2d_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -83,9 +82,8 @@ int csi_c906_global_maxpool2d_f32(struct csi_tensor *input, return CSINN_TRUE; } -int csi_c906_global_maxpool2d_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/hpm.c b/source/c906_opt/hpm.c index 5d4df358..aae330ad 100644 --- a/source/c906_opt/hpm.c +++ b/source/c906_opt/hpm.c @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* hpm: hardware performance monitor note: Refer to the hpm sample program in the c906 user manual, Enable related status first. 
*/ -struct csi_c906_hpm csi_c906_get_hw_perf() +struct shl_c906_hpm shl_c906_get_hw_perf() { - struct csi_c906_hpm tmp; + struct shl_c906_hpm tmp; asm volatile( "csrr %0, instret\n\t" "csrr %1, cycle\n\t" @@ -52,8 +52,7 @@ struct csi_c906_hpm csi_c906_get_hw_perf() return tmp; } - -uint64_t csi_c906_get_inst() +uint64_t shl_c906_get_inst() { uint64_t inst = 0; asm volatile("csrr %0, instret" @@ -67,7 +66,7 @@ uint64_t csi_c906_get_inst() return inst; } -uint64_t csi_c906_get_cycle() +uint64_t shl_c906_get_cycle() { uint64_t a = 0; asm volatile("csrr %0, cycle" @@ -96,7 +95,7 @@ uint64_t csi_c906_get_cycle() >=0x10 Reserve mhpmcounter18-31 */ -uint64_t csi_c906_get_l1_icache_access() +uint64_t shl_c906_get_l1_icache_access() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter3" @@ -106,7 +105,7 @@ uint64_t csi_c906_get_l1_icache_access() return a; } -uint64_t csi_c906_get_l1_icache_miss() +uint64_t shl_c906_get_l1_icache_miss() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter4" @@ -116,7 +115,7 @@ uint64_t csi_c906_get_l1_icache_miss() return a; } -uint64_t csi_c906_get_cb_miss() +uint64_t shl_c906_get_cb_miss() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter8" @@ -126,7 +125,7 @@ uint64_t csi_c906_get_cb_miss() return a; } -uint64_t csi_c906_get_cb_inst() +uint64_t shl_c906_get_cb_inst() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter9" @@ -136,7 +135,7 @@ uint64_t csi_c906_get_cb_inst() return a; } -uint64_t csi_c906_get_store_inst() +uint64_t shl_c906_get_store_inst() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter13" @@ -146,7 +145,7 @@ uint64_t csi_c906_get_store_inst() return a; } -uint64_t csi_c906_get_l1_dcache_raccess() +uint64_t shl_c906_get_l1_dcache_raccess() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter14" @@ -156,7 +155,7 @@ uint64_t csi_c906_get_l1_dcache_raccess() return a; } -uint64_t csi_c906_get_l1_dcache_rmiss() +uint64_t shl_c906_get_l1_dcache_rmiss() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter15" @@ -166,7 +165,7 @@ 
uint64_t csi_c906_get_l1_dcache_rmiss() return a; } -uint64_t csi_c906_get_l1_dcache_waccess() +uint64_t shl_c906_get_l1_dcache_waccess() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter16" @@ -176,7 +175,7 @@ uint64_t csi_c906_get_l1_dcache_waccess() return a; } -uint64_t csi_c906_get_l1_dcache_wmiss() +uint64_t shl_c906_get_l1_dcache_wmiss() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter17" diff --git a/source/c906_opt/layer_norm.c b/source/c906_opt/layer_norm.c index 81ef9b9d..2809442c 100644 --- a/source/c906_opt/layer_norm.c +++ b/source/c906_opt/layer_norm.c @@ -16,22 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_c906.h" -#include "csi_utils.h" +#include "shl_c906.h" -int csi_c906_layer_norm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params) +int shl_c906_layer_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { int flatten_size = 0; flatten_size *= input->dim[0] * input->dim[1] * input->dim[2]; - __fp16 *sum = (__fp16 *)csi_mem_alloc(input->dim[1] * sizeof(__fp16)); - __fp16 *sum2 = (__fp16 *)csi_mem_alloc(input->dim[1] * sizeof(__fp16)); + __fp16 *sum = (__fp16 *)shl_mem_alloc(input->dim[1] * sizeof(__fp16)); + __fp16 *sum2 = (__fp16 *)shl_mem_alloc(input->dim[1] * sizeof(__fp16)); __fp16 *input_data = input->data; __fp16 *output_data = output->data; __fp16 *gamma_data = gamma->data; @@ -98,8 +97,8 @@ int csi_c906_layer_norm_fp16(struct csi_tensor *input, struct csi_tensor *output } } - csi_mem_free(sum); - csi_mem_free(sum2); + shl_mem_free(sum); + shl_mem_free(sum2); return CSINN_TRUE; } diff --git a/source/c906_opt/leaky_relu.c b/source/c906_opt/leaky_relu.c index e70f5e2f..f765cb93 100644 --- a/source/c906_opt/leaky_relu.c +++ 
b/source/c906_opt/leaky_relu.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_leaky_relu_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -61,10 +60,8 @@ int csi_c906_leaky_relu_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_leaky_relu_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/lrn.c b/source/c906_opt/lrn.c index 92a062ce..2ee0dc53 100644 --- a/source/c906_opt/lrn.c +++ b/source/c906_opt/lrn.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_lrn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params) +int shl_c906_lrn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { __fp16 *input_data = input->data; __fp16 *output_data = output->data; @@ -34,8 +34,8 @@ int csi_c906_lrn_fp16(struct csi_tensor *input, struct csi_tensor *output, for (int j = 0; j < input->dim[0]; j++) { for (int c = 0; c < depth; ++c) { - const int begin_input_c = csi_ref_max_internal_s32(0, c - half_range); - const int end_input_c = csi_ref_min_internal_s32(depth, c + half_range + 1); + const int begin_input_c = shl_ref_max_internal_s32(0, c - half_range); + const int end_input_c = shl_ref_min_internal_s32(depth, c + half_range + 1); for (int i = 0; i < inner_size; ++i) { float accum = 0.f; for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { diff --git a/source/c906_opt/matmul.c b/source/c906_opt/matmul.c index 78b83ad4..9989207f 100644 --- a/source/c906_opt/matmul.c +++ b/source/c906_opt/matmul.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static void reorder_matrixa_n8_fp16(__fp16 *src, __fp16 *dst, int row, int col) { @@ -166,14 +166,14 @@ static void reorder_matrixb_z8_fp16(__fp16 *src, __fp16 *dst, int row, int col) } } -int csi_c906_matmul_fp32(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params) +int shl_c906_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { return CSINN_TRUE; } -int csi_c906_matmul_fp16(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params) +int shl_c906_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { __fp16 *mat0_data = (__fp16 *)mat0->data; __fp16 *mat1_data = (__fp16 *)mat1->data; @@ -192,23 +192,23 @@ int csi_c906_matmul_fp16(struct csi_tensor *mat0, struct csi_tensor *mat1, const int dim_n = mat1->dim[dims_count - (params->trans_b ? 
2 : 1)]; if (!params->trans_a && !params->trans_b) { - __fp16 *in0 = (__fp16 *)csi_mem_alloc(dim_m * dim_k * sizeof(__fp16)); - __fp16 *in1 = (__fp16 *)csi_mem_alloc(dim_k * dim_n * sizeof(__fp16)); + __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); + __fp16 *in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); for (int b = 0; b < batches; b++) { reorder_matrixa_n8_fp16(mat0_data, in0, dim_m, dim_k); reorder_matrixb_z8_fp16(mat1_data, in1, dim_k, dim_n); - csi_c906_sgemm_kernel_fp16(output_data, in0, in1, dim_m, dim_k, dim_n, dim_n, NULL); + shl_c906_sgemm_kernel_fp16(output_data, in0, in1, dim_m, dim_k, dim_n, dim_n, NULL); mat0_data += dim_m * dim_k; mat1_data += dim_n * dim_k; output_data += dim_m * dim_n; } - csi_mem_free(in0); - csi_mem_free(in1); + shl_mem_free(in0); + shl_mem_free(in1); } else { - csi_debug_error("Unsupport matrix transpose on C906\n"); + shl_debug_error("Unsupport matrix transpose on C906\n"); return CSINN_FALSE; } return CSINN_TRUE; diff --git a/source/c906_opt/maxpool.c b/source/c906_opt/maxpool.c index c0445a75..dce6eb28 100644 --- a/source/c906_opt/maxpool.c +++ b/source/c906_opt/maxpool.c @@ -16,18 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* pad_left = pad_top = 0 pad_right = 0 or 1 pad_down = 0 or 1 */ -static int maxpool2x2s2(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool2x2s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -196,10 +195,8 @@ static int maxpool2x2s2(struct csi_tensor *input, return CSINN_TRUE; } - -static int maxpool2x2s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -370,9 +367,8 @@ static int maxpool2x2s2_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int maxpool2x2s2_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool2x2s2_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -618,10 +614,8 @@ static int maxpool2x2s2_p1(struct csi_tensor *input, return CSINN_TRUE; } - -static int maxpool2x2s2_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; @@ -869,9 +863,8 @@ static int maxpool2x2s2_p1_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int maxpool3x3s2(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s2(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1107,10 +1100,8 @@ static int maxpool3x3s2(struct csi_tensor *input, return CSINN_TRUE; } - -static int maxpool3x3s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -1354,9 +1345,8 @@ static int maxpool3x3s2_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int maxpool3x3s2_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s2_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1705,11 +1695,8 @@ static int maxpool3x3s2_p1(struct csi_tensor *input, return CSINN_TRUE; } - - -static int maxpool3x3s2_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -2080,9 +2067,8 @@ static int maxpool3x3s2_p1_fp16(struct csi_tensor *input, pad_left = pad_right = pad_top = pad_down = 1 in_w = out_w in_h = out_h */ -static int maxpool3x3s1_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s1_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -2399,10 +2385,8 @@ static int 
maxpool3x3s1_p1(struct csi_tensor *input, return CSINN_TRUE; } - -static int maxpool3x3s1_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -2749,10 +2733,8 @@ static int maxpool3x3s1_p1_fp16(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_maxpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int32_t input_h = input->dim[2]; int32_t input_w = input->dim[3]; @@ -2767,14 +2749,15 @@ int csi_c906_maxpool2d_init(struct csi_tensor *input, int32_t pad_top = params->pad_top; int32_t pad_down = params->pad_down; - params->base.bc = NULL; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; // global maxpool2d if (input_h == kernel_h && input_w == kernel_w) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_global_maxpool2d_f32; + cb->exec = shl_c906_global_maxpool2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_c906_global_maxpool2d_fp16; + cb->exec = shl_c906_global_maxpool2d_fp16; } return CSINN_TRUE; } @@ -2792,15 +2775,15 @@ int csi_c906_maxpool2d_init(struct csi_tensor *input, // end consider ceil_mode 2x2s2p0 if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool2x2s2; + cb->exec = maxpool2x2s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = maxpool2x2s2_fp16; + cb->exec = maxpool2x2s2_fp16; } } else if (pad_left == 1 && pad_top == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool2x2s2_p1; + cb->exec = maxpool2x2s2_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = 
maxpool2x2s2_p1_fp16; + cb->exec = maxpool2x2s2_p1_fp16; } } } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 @@ -2815,15 +2798,15 @@ int csi_c906_maxpool2d_init(struct csi_tensor *input, // end consider ceil_mode 3x3s2p0 if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool3x3s2; + cb->exec = maxpool3x3s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = maxpool3x3s2_fp16; + cb->exec = maxpool3x3s2_fp16; } } else if (pad_left == 1 && pad_top == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool3x3s2_p1; + cb->exec = maxpool3x3s2_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = maxpool3x3s2_p1_fp16; + cb->exec = maxpool3x3s2_p1_fp16; } } } @@ -2831,20 +2814,22 @@ int csi_c906_maxpool2d_init(struct csi_tensor *input, if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool3x3s1_p1; + cb->exec = maxpool3x3s1_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = maxpool3x3s1_p1_fp16; + cb->exec = maxpool3x3s1_p1_fp16; } } } } - if (params->base.bc == NULL) { - csi_debug_warning("maxpool is not optimized to achieve under this condition on C906, call reference func replaced.\n"); + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on C906, call reference func " + "replaced.\n"); if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_maxpool2d_f32; + cb->exec = shl_ref_maxpool2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_maxpool2d_quant; + cb->exec = shl_ref_maxpool2d_quant; } } return CSINN_TRUE; diff --git a/source/c906_opt/minimum.c b/source/c906_opt/minimum.c index 0c44ea8d..23a4f19c 100644 --- a/source/c906_opt/minimum.c +++ b/source/c906_opt/minimum.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static void element_minimum_f32(float *input0, float *input1, float *output, int size) { @@ -44,17 +44,15 @@ static void element_minimum_f32(float *input0, float *input1, float *output, int ); } -int csi_c906_minimum_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_minimum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // example: [1, 3, 224, 224] + [1] = [1, 3, 224, 224] if (in_size1 == 1) { asm volatile( @@ -90,23 +88,23 @@ int csi_c906_minimum_f32(struct csi_tensor *input0, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); - 
csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_minimum_f32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] ; [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] else { @@ -147,17 +145,15 @@ static void element_minimum_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, ); } -int csi_c906_minimum_fp16(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_minimum_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); if (in_size1 == 1) { asm volatile( "flh ft0, 0(%3)\n\t" @@ -188,23 +184,23 @@ int csi_c906_minimum_fp16(struct csi_tensor *input0, } } if (!flag) { - __fp16 *in0_data_b = csi_mem_alloc(out_size * 2); - __fp16 *in1_data_b = csi_mem_alloc(out_size * 2); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + __fp16 *in0_data_b = shl_mem_alloc(out_size * 2); 
+ __fp16 *in1_data_b = shl_mem_alloc(out_size * 2); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_minimum_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } else { int inner_size = in_size1; int outer_size = out_size / in_size1; diff --git a/source/c906_opt/mul.c b/source/c906_opt/mul.c index 921fd545..7a0ec243 100644 --- a/source/c906_opt/mul.c +++ b/source/c906_opt/mul.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" static void element_mul_f32(float *input0, float *input1, float *output, int size) { @@ -49,18 +48,16 @@ static void element_mul_f32(float *input0, float *input1, float *output, int siz ); } -int csi_c906_mul_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_mul_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // HACK: special case: tensorflow densenet121 // example: [1, 64, 55, 55] + [1, 64, 1, 1] = [1, 64, 55, 55] @@ -135,28 +132,28 @@ int csi_c906_mul_f32(struct csi_tensor *input0, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); - 
csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_mul_f32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] else { @@ -196,18 +193,16 @@ static void element_mul_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int : "v8", "v9", "v12", "v13", "v16", "v17", "t0"); } -int csi_c906_mul_fp16(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); if ((input1->dim[2] == 1) && (input1->dim[3] == 1) && (input1->dim[1] == input0->dim[1])) { int inner_size = input0->dim[2] * input0->dim[3]; @@ -274,29 +269,28 @@ int csi_c906_mul_fp16(struct csi_tensor *input0, } } if (!flag) { + __fp16 *in0_data_b = shl_mem_alloc(out_size * 2); + __fp16 *in1_data_b = shl_mem_alloc(out_size * 2); - __fp16 *in0_data_b = csi_mem_alloc(out_size * 2); - __fp16 *in1_data_b = csi_mem_alloc(out_size * 2); - - struct 
csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_mul_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } else { int inner_size = in_size1; int outer_size = out_size / in_size1; diff --git a/source/c906_opt/pad.c b/source/c906_opt/pad.c index 11a42ca3..2ee67936 100644 --- a/source/c906_opt/pad.c +++ b/source/c906_opt/pad.c @@ -16,17 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" // constrain: only support pad on h and w dim // pad_mode: constant // layout: [n,c,h,w] -int csi_c906_pad_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int shl_c906_pad_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -146,10 +144,8 @@ int csi_c906_pad_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_pad_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int shl_c906_pad_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/prelu.c b/source/c906_opt/prelu.c index c36cefb0..b7ca4045 100644 --- a/source/c906_opt/prelu.c +++ b/source/c906_opt/prelu.c @@ -16,14 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -static int csi_c906_prelu_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *alpha, - struct csi_tensor *output, - struct prelu_params *params) +static int shl_c906_prelu_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -68,8 +66,8 @@ static int csi_c906_prelu_nhwc_f32(struct csi_tensor *input, // for (int y = 0; y < output->dim[1]; ++y) { // for (int x = 0; x < output->dim[2]; ++x) { // for (int c = 0; c < output->dim[3]; ++c) { - // int output_index = csi_ref_get_index(output->dim, b, y, x, c); - // int input_index = csi_ref_get_index(input->dim, b, y, x, c); + // int output_index = shl_ref_get_index(output->dim, b, y, x, c); + // int input_index = shl_ref_get_index(input->dim, b, y, x, c); // float input_value = input_data[input_index]; // if (input_value >= 0) { // output_data[output_index] = input_data[input_index]; @@ -83,10 +81,8 @@ static int csi_c906_prelu_nhwc_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_c906_prelu_nchw_f32(struct csi_tensor *input, - struct csi_tensor *alpha, - struct csi_tensor *output, - struct prelu_params *params) +static int shl_c906_prelu_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -140,15 +136,13 @@ static int csi_c906_prelu_nchw_f32(struct csi_tensor *input, return CSINN_TRUE; } -int csi_c906_prelu_f32(struct csi_tensor *input, - struct csi_tensor *alpha, - struct csi_tensor *output, - struct prelu_params *params) +int shl_c906_prelu_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params 
*params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_c906_prelu_nchw_f32(input, alpha, output, params); + shl_c906_prelu_nchw_f32(input, alpha, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_c906_prelu_nhwc_f32(input, alpha, output, params); + shl_c906_prelu_nhwc_f32(input, alpha, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } @@ -156,10 +150,8 @@ int csi_c906_prelu_f32(struct csi_tensor *input, // nchw layout -int csi_c906_prelu_fp16(struct csi_tensor *input, - struct csi_tensor *alpha, - struct csi_tensor *output, - struct prelu_params *params) +int shl_c906_prelu_fp16(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/relu.c b/source/c906_opt/relu.c index 34a26ca2..6b69b6d7 100644 --- a/source/c906_opt/relu.c +++ b/source/c906_opt/relu.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static float relu(float x){ return x > 0 ? 
x : 0; } -int csi_c906_relu_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -62,10 +61,8 @@ int csi_c906_relu_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_relu_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/relu1.c b/source/c906_opt/relu1.c index 7dcacd64..864ad1ca 100644 --- a/source/c906_opt/relu1.c +++ b/source/c906_opt/relu1.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static float relu1(float x){ return fmin(x > 0 ? 
x : 0, 1); } -int csi_c906_relu1_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,10 +64,8 @@ int csi_c906_relu1_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_relu1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/relu6.c b/source/c906_opt/relu6.c index 8c12e2e4..77b5418f 100644 --- a/source/c906_opt/relu6.c +++ b/source/c906_opt/relu6.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static float relu6(float x){ return fmin(x > 0 ? 
x : 0, 6); } -int csi_c906_relu6_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu6_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,10 +64,8 @@ int csi_c906_relu6_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_relu6_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/reshape.c b/source/c906_opt/reshape.c index 10d63d50..76c4e1c0 100644 --- a/source/c906_opt/reshape.c +++ b/source/c906_opt/reshape.c @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_reshape_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_c906_reshape_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_byte_size(input); + int size = csinn_tensor_byte_size(input); if (input_data != output_data) { - csi_c906_memcpy(output_data, input_data, size); + shl_c906_memcpy(output_data, input_data, size); } return CSINN_TRUE; } diff --git a/source/c906_opt/setup.c b/source/c906_opt/setup.c index ed964abf..7a2eb536 100644 --- a/source/c906_opt/setup.c +++ b/source/c906_opt/setup.c @@ -16,438 +16,174 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -static struct csi_bc_op_list csi_nn_c906_init_bc_op_list; -static struct csi_bc_op_list csi_nn_c906_func_bc_op_list; +static struct shl_cb_op_list shl_c906_cb_op_list; -int csi_nn_c906_register_op_init(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *bc) +int shl_c906_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, void *exec) { - struct csi_bc_op_list *list_end = csi_bc_list_end(&csi_nn_c906_init_bc_op_list); - struct csi_bc_op_list *next = csi_mem_alloc(sizeof(struct csi_bc_op_list)); - next->bc = bc; + struct shl_cb_op_list *list_end = shl_cb_list_end(&shl_c906_cb_op_list); + struct shl_cb_op_list *next = shl_mem_alloc(sizeof(struct shl_cb_op_list)); + next->cb = shl_mem_alloc(sizeof(struct csinn_callback)); + next->cb->init = init; + next->cb->exec = exec; next->dtype = dtype; next->op_name = op_name; list_end->next = next; return CSINN_TRUE; } -int csi_nn_c906_register_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *bc) +int shl_c906_reg_op_est(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *est) { - struct csi_bc_op_list *list_end = csi_bc_list_end(&csi_nn_c906_func_bc_op_list); - struct csi_bc_op_list *next = csi_mem_alloc(sizeof(struct csi_bc_op_list)); - next->bc = bc; - next->dtype = dtype; - next->op_name = op_name; - list_end->next = next; - return CSINN_TRUE; -} - -static inline void register_op_init_all(enum csinn_op_enum op_name, void *bc) -{ - csi_nn_c906_register_op_init(CSINN_DTYPE_FLOAT16, op_name, bc); - csi_nn_c906_register_op_init(CSINN_DTYPE_FLOAT32, op_name, bc); -} + struct csinn_callback *cb = shl_cb_list_match(&shl_c906_cb_op_list, dtype, op_name); + if (cb == NULL) { + shl_debug_info("%s: cannot find c906 est\n", __func__); + } else { + cb->est = est; + } -void __attribute__((weak)) csi_nn_c906_bc_init_reg() -{ - register_op_init_all(CSINN_OP_CONV2D, 
csi_c906_conv2d_init); - register_op_init_all(CSINN_OP_GROUP_CONV2D, csi_c906_conv2d_init); - register_op_init_all(CSINN_OP_CONV1D, csi_c906_conv1d_init); - register_op_init_all(CSINN_OP_MAXPOOL2D, csi_c906_maxpool2d_init); - register_op_init_all(CSINN_OP_AVGPOOL2D, csi_c906_avgpool2d_init); - register_op_init_all(CSINN_OP_DEPTHWISE_CONV2D, csi_c906_depthwise_conv2d_init); - register_op_init_all(CSINN_OP_FULLYCONNECTED, csi_c906_fullyconnected_init); - register_op_init_all(CSINN_OP_CACHE_MATMUL, csi_c906_cache_matmul_init); - register_op_init_all(CSINN_OP_DIV, csi_c906_div_init); - register_op_init_all(CSINN_OP_CACHE_CONV1D, csi_c906_cache_conv1d_init); + return CSINN_TRUE; } -void *csi_init_map_c906(int op, int dtype) +struct csinn_callback *shl_cb_map_rvv(int op, int dtype); +struct csinn_callback *shl_cb_map_c906(int op, int dtype) { - static int has_reg; - if (has_reg == 0) { - csi_nn_c906_bc_init_reg(); - has_reg = 1; - } - void *ret = csi_bc_list_match(&csi_nn_c906_init_bc_op_list, dtype, op); - if (ret == NULL) { - csi_debug_info("no c906 init\n"); + struct csinn_callback *cb = shl_cb_list_match(&shl_c906_cb_op_list, dtype, op); + if (cb == NULL) { + cb = shl_cb_map_rvv(op, dtype); } - return ret; + return cb; } -void __attribute__((weak)) csi_nn_c906_bc_reg() +void __attribute__((weak)) shl_target_init_c906() { - /* float16 */ - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ABS, csi_c906_abs_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ACOS, csi_ref_acos_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ACOSH, csi_ref_acosh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, csi_c906_add_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AND, csi_ref_and_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ARANGE, csi_ref_arange_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ARGMAX, csi_ref_argmax_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, 
CSINN_OP_ARGMIN, csi_ref_argmin_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ASIN, csi_ref_asin_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ASINH, csi_ref_asinh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ATAN, csi_ref_atan_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ATANH, csi_ref_atanh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, csi_ref_avgpool2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL3D, csi_ref_avgpool3d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_BN, csi_ref_batch_normalization_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_BATCH_TO_SPACE, - csi_ref_batch_to_space_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_BROADCOST, csi_ref_broadcast_to_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_MATMUL, csi_c906_cache_matmul_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_CONV1D, csi_c906_cache_conv1d_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CEIL, csi_ref_ceil_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CLIP, csi_c906_clip_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, csi_c906_concat_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, csi_ref_conv1d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, csi_ref_conv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D_RELU, csi_ref_conv2d_relu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D_RELU6, csi_ref_conv2d_relu6_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, - csi_ref_depthwise_conv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D_RELU, - csi_ref_depthwise_conv2d_relu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D_RELU6, - 
csi_ref_depthwise_conv2d_relu6_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, csi_ref_group_conv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV3D, csi_ref_conv3d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DECONV2D, csi_ref_deconv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_DECONV2D, - csi_ref_depthwise_deconv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DECONV3D, csi_ref_deconv3d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_COS, csi_ref_cos_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_COSH, csi_ref_cosh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CUMPROD, csi_ref_cumprod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CUMSUM, csi_ref_cumsum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTH_TO_SPACE, - csi_ref_depth_to_space_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DIV, csi_ref_div_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ELU, csi_ref_elu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_EQUANL, csi_ref_equal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ERF, csi_ref_erf_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_EXP, csi_ref_exp_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_EXPAND_DIMS, csi_ref_expand_dims_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_EXPM1, csi_ref_expm1_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FLATTEN, csi_ref_flatten); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FLOOR_DIVIDE, csi_ref_floor_divide_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FLOOR_MOD, csi_ref_floor_mod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FLOOR, csi_ref_floor_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FSMN, csi_ref_fsmn_quant); 
- csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, - csi_c906_fullyconnected_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER_ND, csi_ref_gather_nd_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER, csi_c906_gather_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, - csi_ref_global_avgpool2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_MAXPOOL2D, - csi_ref_global_maxpool2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GREATHER_EQUAL, - csi_ref_greater_equal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GREATHER, csi_ref_greater_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_HARD_SIGMOID, csi_ref_hard_sigmoid_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_IM2COL, csi_ref_im2col_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_L2N, csi_ref_l2_normalization_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LAYER_NORM, csi_c906_layer_norm_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, csi_c906_leaky_relu_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LESS_EQUAL, csi_ref_less_equal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LESS, csi_ref_less_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOG_SOFTMAX, csi_ref_log_softmax_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOG, csi_ref_log_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOG1P, csi_ref_log1p_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOGICAL_AND, csi_ref_logical_and_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOGICAL_NOT, csi_ref_logical_not_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOGICAL_OR, csi_ref_logical_or_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOGICAL_XOR, csi_ref_logical_xor_quant); - 
csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LRN, csi_c906_lrn_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, csi_c906_matmul_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAX, csi_ref_max_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXIMUM, csi_ref_maximum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, csi_ref_maxpool2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D_LOCAT, - csi_ref_maxpool2d_locat_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL3D, csi_ref_maxpool3d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MEAN, csi_ref_mean_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MEAN_STRIDE, csi_ref_mean_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MIN, csi_ref_min_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MINIMUM, csi_c906_minimum_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MOD, csi_ref_mod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, csi_c906_mul_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_NDARRAY_SIZE, csi_ref_ndarray_size_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_NEGATIIVE, csi_ref_negative_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_NOT_EQUAL, csi_ref_not_equal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_NOT, csi_ref_not_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_OR, csi_ref_or_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PAD, csi_ref_pad_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_POWER, csi_ref_power_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PRELU, csi_c906_prelu_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PROD, csi_ref_prod_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, 
CSINN_OP_PROPOSAL, csi_ref_proposal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PSROIPOOLING, csi_ref_psroipooling_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_LOGSUMEXP, - csi_ref_reduce_logsumexp_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_MAX, csi_ref_reduce_max_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_MEAN, csi_ref_reduce_mean_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_MIN, csi_ref_reduce_min_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_PROD, csi_ref_reduce_prod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_SUM, csi_ref_reduce_sum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, csi_c906_relu_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU1, csi_c906_relu1_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, csi_c906_relu6_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELUN, csi_ref_relun_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RESHAPE, csi_c906_reshape_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RESIZE, csi_ref_resize_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REVERSE, csi_ref_reverse_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ROIPOOL, csi_ref_roipool_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ROUND, csi_ref_round_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RSQRT, csi_ref_rsqrt_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SCATTER_ND, csi_ref_scatter_nd_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_MAX, csi_ref_segment_max_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_MAX, - csi_ref_unsorted_segment_max_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_MEAN, 
csi_ref_segment_mean_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_MEAN, - csi_ref_unsorted_segment_mean_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_MIN, csi_ref_segment_min_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_MIN, - csi_ref_unsorted_segment_min_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_PROD, csi_ref_segment_prod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_PROD, - csi_ref_unsorted_segment_prod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_SUM, csi_ref_segment_sum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_SUM, - csi_ref_unsorted_segment_sum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SELECT, csi_ref_select_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SHAPE, csi_ref_shape_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SHUFFLE_CHANNEL, - csi_ref_shuffle_channel_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIGMOID, csi_nn_rvv_sigmoid_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIGN, csi_ref_sign_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIN, csi_ref_sin_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SINH, csi_ref_sinh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SLICE, csi_ref_slice_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTMAX, csi_nn_rvv_softmax_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTPLUS, csi_ref_softplus_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTRELU, csi_ref_softrelu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTSIGN, csi_ref_softsign_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SPACE_TO_BATCH, - csi_ref_space_to_batch_quant); - 
csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SPACE_TO_DEPTH, - csi_ref_space_to_depth_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SPLIT, csi_c906_split_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SQRT, csi_ref_sqrt_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SQUEEZE, csi_ref_squeeze); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_STACK, csi_ref_stack_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_STRIDED_SLICE, - csi_ref_strided_slice_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, csi_c906_sub_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUM, csi_c906_sum_stride_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TAN, csi_ref_tan_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TANH, csi_ref_tanh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_THRESHOLD_RELU, - csi_ref_threshold_relu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TILE, csi_ref_tile_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TOPK, csi_ref_topk_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TRUNC, csi_ref_trunc_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TRANSPOSE, csi_c906_transpose_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNPOOLING, csi_ref_unpooling_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSTACK, csi_ref_unstack_qunat); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_XOR, csi_ref_xor_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_YUV_RGB_SCALE, - csi_ref_yuv_rgb_scale_quant); + shl_register_runtime_callback(CSINN_C906, NULL); + shl_register_op_callback(CSINN_C906, shl_cb_map_c906); - /* float32 */ - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ABS, csi_c906_abs_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ACOS, csi_ref_acos_f32); - 
csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ACOSH, csi_ref_acosh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, csi_c906_add_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ARANGE, csi_ref_arange_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ARGMAX, csi_ref_argmax_stride_i32_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ARGMIN, csi_ref_argmin_stride_i32_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ASIN, csi_ref_asin_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ASINH, csi_ref_asinh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ATAN, csi_ref_atan_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ATANH, csi_ref_atanh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, csi_ref_avgpool2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL3D, csi_ref_avgpool3d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_BN, csi_ref_batch_normalization_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_BATCH_TO_SPACE, - csi_ref_batch_to_space_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_BROADCOST, csi_ref_broadcast_to_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CACHE_MATMUL, csi_ref_cache_matmul_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CACHE_CONV1D, csi_ref_cache_conv1d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CEIL, csi_ref_ceil_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CLIP, csi_c906_clip_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, csi_c906_concat_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, csi_ref_conv1d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, csi_ref_conv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D_RELU, csi_ref_conv2d_relu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, 
CSINN_OP_DEPTHWISE_CONV2D, - csi_ref_depthwise_conv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, csi_ref_group_conv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV3D, csi_ref_conv3d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DECONV2D, csi_ref_deconv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_DECONV2D, - csi_ref_depthwise_deconv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DECONV3D, csi_ref_deconv3d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_COS, csi_ref_cos_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_COSH, csi_ref_cosh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CUMPROD, csi_ref_cumprod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CUMSUM, csi_ref_cumsum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTH_TO_SPACE, - csi_ref_depth_to_space_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DIV, csi_ref_div_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ELU, csi_ref_elu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_EQUANL, csi_ref_equal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ERF, csi_ref_erf_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_EXP, csi_ref_exp_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_EXPAND_DIMS, csi_ref_expand_dims_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_EXPM1, csi_ref_expm1_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FLATTEN, csi_ref_flatten); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FLOOR_DIVIDE, csi_ref_floor_divide_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FLOOR_MOD, csi_ref_floor_mod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FLOOR, csi_ref_floor_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FSMN, csi_ref_fsmn_f32); - 
csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, - csi_c906_fullyconnected_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GATHER_ND, csi_ref_gather_nd_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GATHER, csi_ref_gather_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, - csi_c906_global_avgpool2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_MAXPOOL2D, - csi_c906_global_maxpool2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GREATHER_EQUAL, - csi_ref_greater_equal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GREATHER, csi_ref_greater_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_HARD_SIGMOID, csi_ref_hard_sigmoid_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_IM2COL, csi_ref_im2col_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_L2N, csi_ref_l2_normalization_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_L2POOL2D, csi_ref_l2pool_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LAYER_NORM, csi_ref_layer_norm_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, csi_c906_leaky_relu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LESS_EQUAL, csi_ref_less_equal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LESS, csi_ref_less_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOG_SOFTMAX, csi_ref_log_softmax_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOG, csi_ref_log_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOG1P, csi_ref_log1p_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOGICAL_AND, csi_ref_logical_and_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOGICAL_NOT, csi_ref_logical_not_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOGICAL_OR, csi_ref_logical_or_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, 
CSINN_OP_LOGICAL_XOR, csi_ref_logical_xor_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LRN, csi_ref_lrn_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MATMUL, csi_ref_matmul_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAX, csi_ref_max_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXIMUM, csi_ref_maximum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, csi_ref_maxpool2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D_LOCAT, - csi_ref_maxpool2d_locat_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL3D, csi_ref_maxpool3d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MEAN, csi_ref_mean_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MEAN_STRIDE, csi_ref_mean_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MINIMUM, csi_c906_minimum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MOD, csi_ref_mod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, csi_c906_mul_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_NDARRAY_SIZE, csi_ref_ndarray_size_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_NEGATIIVE, csi_ref_negative_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_NOT_EQUAL, csi_ref_not_equal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PAD, csi_ref_pad_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_POWER, csi_ref_power_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PRELU, csi_c906_prelu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PROD, csi_ref_prod_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PROPOSAL, csi_ref_proposal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PSROIPOOLING, csi_ref_psroipooling_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_LOGSUMEXP, - 
csi_ref_reduce_logsumexp_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_MAX, csi_ref_reduce_max_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_MEAN, csi_ref_reduce_mean_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_MIN, csi_ref_reduce_min_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_PROD, csi_ref_reduce_prod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_SUM, csi_ref_reduce_sum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, csi_c906_relu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU1, csi_c906_relu1_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, csi_c906_relu6_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELUN, csi_ref_relun_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RESHAPE, csi_ref_reshape); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RESIZE, csi_ref_resize_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REVERSE, csi_ref_reverse_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ROIALIGN, csi_ref_roi_align_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ROIPOOL, csi_ref_roipool_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ROUND, csi_ref_round_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RSQRT, csi_ref_rsqrt_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SCATTER_ND, csi_ref_scatter_nd_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SEGMENT_MAX, csi_ref_segment_max_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_MAX, - csi_ref_unsorted_segment_max_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SEGMENT_MEAN, csi_ref_segment_mean_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_MEAN, - csi_ref_unsorted_segment_mean_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, 
CSINN_OP_SEGMENT_MIN, csi_ref_segment_min_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_MIN, - csi_ref_unsorted_segment_min_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SEGMENT_PROD, csi_ref_segment_prod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_PROD, - csi_ref_unsorted_segment_prod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SEGMENT_SUM, csi_ref_segment_sum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_SUM, - csi_ref_unsorted_segment_sum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SELECT, csi_ref_select_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SHUFFLE_CHANNEL, - csi_ref_shuffle_channel_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SIGMOID, csi_ref_sigmoid_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SIGN, csi_ref_sign_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SIN, csi_ref_sin_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SINH, csi_ref_sinh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SLICE, csi_ref_slice_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTMAX, csi_ref_softmax_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTPLUS, csi_ref_softplus_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTRELU, csi_ref_softrelu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTSIGN, csi_ref_softsign_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SPACE_TO_BATCH, - csi_ref_space_to_batch_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SPACE_TO_DEPTH, - csi_ref_space_to_depth_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SPLIT, csi_c906_split_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SQRT, csi_ref_sqrt_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SQUEEZE, csi_ref_square_f32); 
- csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_STACK, csi_ref_stack_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_STRIDED_SLICE, csi_ref_strided_slice_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SUB, csi_c906_sub_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SUM, csi_ref_sum_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TAN, csi_ref_tan_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TANH, csi_ref_tanh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_THRESHOLD_RELU, - csi_ref_threshold_relu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TILE, csi_ref_tile_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TOPK, csi_ref_topk_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TRUNC, csi_ref_trunc_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TRANSPOSE, csi_ref_transpose); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNPOOLING, csi_ref_unpooling_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSTACK, csi_ref_unstack_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_YUV_RGB_SCALE, csi_ref_yuv_rgb_scale_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_c906_conv2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_c906_conv2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_c906_conv2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c906_conv2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, shl_c906_conv1d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, shl_c906_conv1d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_c906_maxpool2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_c906_maxpool2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, 
shl_c906_avgpool2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_c906_avgpool2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, shl_c906_depthwise_conv2d_init, + NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, shl_c906_depthwise_conv2d_init, + NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_c906_fullyconnected_init, + NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_c906_fullyconnected_init, + NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DIV, shl_c906_div_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DIV, shl_c906_div_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ABS, NULL, shl_c906_abs_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, NULL, shl_c906_add_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_MATMUL, shl_c906_cache_matmul_init, + shl_c906_cache_matmul_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_CONV1D, shl_c906_cache_conv1d_init, + shl_c906_cache_conv1d_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CLIP, NULL, shl_c906_clip_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, NULL, shl_c906_concat_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, NULL, + shl_c906_global_avgpool2d_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_MAXPOOL2D, NULL, + shl_c906_global_maxpool2d_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER, NULL, shl_c906_gather_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LAYER_NORM, NULL, shl_c906_layer_norm_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, NULL, shl_c906_leaky_relu_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LRN, NULL, shl_c906_lrn_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, NULL, shl_c906_matmul_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MINIMUM, NULL, 
shl_c906_minimum_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, NULL, shl_c906_mul_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PRELU, NULL, shl_c906_prelu_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, NULL, shl_c906_relu_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU1, NULL, shl_c906_relu1_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, NULL, shl_c906_relu6_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RESHAPE, NULL, shl_c906_reshape_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SPLIT, NULL, shl_c906_split_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, NULL, shl_c906_sub_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUM, NULL, shl_c906_sum_stride_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TRANSPOSE, NULL, shl_c906_transpose_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ABS, NULL, shl_c906_abs_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, NULL, shl_c906_add_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CLIP, NULL, shl_c906_clip_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, NULL, shl_c906_concat_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, NULL, + shl_c906_global_avgpool2d_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_MAXPOOL2D, NULL, + shl_c906_global_maxpool2d_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, NULL, shl_c906_leaky_relu_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MINIMUM, NULL, shl_c906_minimum_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, NULL, shl_c906_mul_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PRELU, NULL, shl_c906_prelu_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, NULL, shl_c906_relu_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU1, NULL, shl_c906_relu1_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, NULL, shl_c906_relu6_f32); + 
shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SPLIT, NULL, shl_c906_split_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SUB, NULL, shl_c906_sub_f32); - /* int8 */ - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_CONCAT, csi_nn_rvv_concat_int8); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_MUL, csi_nn_rvv_mul_int8); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_RELU, csi_nn_rvv_relu_int8); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_RESHAPE, csi_ref_reshape); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_SUM, csi_nn_rvv_sum_stride_int8); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_SOFTMAX, csi_ref_softmax_quant); -} - -void *csi_bc_map_c906(int op, int dtype) { - static int has_reg; - if (has_reg == 0) { - csi_nn_c906_bc_reg(); - has_reg = 1; - } - void *ret = csi_bc_list_match(&csi_nn_c906_func_bc_op_list, dtype, op); - if (ret == NULL) { - csi_debug_info("cannot find c906 func\n"); - } - return ret; +#ifdef SHL_BUILD_GREF + shl_register_runtime_callback(CSINN_C906, shl_gref_runtime_callback); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_gref_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_gref_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_gref_group_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_gref_group_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, shl_gref_conv1d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, shl_gref_conv1d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_gref_maxpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_gref_maxpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, shl_gref_avgpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_gref_avgpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, shl_gref_depthwise_conv2d); + 
shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, shl_gref_depthwise_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_gref_fullyconnected); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_gref_fullyconnected); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_DIV, shl_gref_div); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_DIV, shl_gref_div); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_ABS, shl_gref_abs); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, shl_gref_add); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_MATMUL, shl_gref_cache_matmul); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_CONV1D, shl_gref_cache_conv1d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CLIP, shl_gref_clip); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, shl_gref_concat); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, shl_gref_global_avgpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_MAXPOOL2D, shl_gref_global_maxpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER, shl_gref_gather); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_LAYER_NORM, shl_gref_layer_norm); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, shl_gref_leaky_relu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_LRN, shl_gref_lrn); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, shl_gref_matmul); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_MINIMUM, shl_gref_minimum); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, shl_gref_mul); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_PRELU, shl_gref_prelu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, shl_gref_relu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU1, shl_gref_relu1); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, shl_gref_relu6); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, 
CSINN_OP_RESHAPE, shl_gref_reshape); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_SPLIT, shl_gref_split); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, shl_gref_sub); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_SUM, shl_gref_sum); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_TRANSPOSE, shl_gref_transpose); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_ABS, shl_gref_abs); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, shl_gref_add); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_CLIP, shl_gref_clip); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, shl_gref_concat); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, shl_gref_global_avgpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_MAXPOOL2D, shl_gref_global_maxpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, shl_gref_leaky_relu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_MINIMUM, shl_gref_minimum); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, shl_gref_mul); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_PRELU, shl_gref_prelu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, shl_gref_relu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU1, shl_gref_relu1); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, shl_gref_relu6); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_SPLIT, shl_gref_split); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_SUB, shl_gref_sub); +#endif } diff --git a/source/c906_opt/sgemm.c b/source/c906_opt/sgemm.c deleted file mode 100644 index 492ab65f..00000000 --- a/source/c906_opt/sgemm.c +++ /dev/null @@ -1,3165 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" - -/* The matrices are stored in row-major order */ -#define A(i,j) a[ (i)*lda + (j) ] -#define B(i,j) b[ (i)*ldb + (j) ] -#define C(i,j) c[ (i)*ldc + (j) ] - -#define DECOMPOSE_K \ - int ktmp = k; \ - int k8 = k >> 3; \ - k -= (k8 << 3); \ - int k4 = k >> 2; \ - k -= (k4 << 2); \ - int k2 = k >> 1; \ - k -= (k2 << 1); \ - int k1 = k; \ - k = ktmp; - -#define DECOMPOSE_N \ - int ntmp = n; \ - int n4 = n >> 2; \ - n -= (n4 << 2); \ - int n2 = n >> 1; \ - n -= (n2 << 1); \ - int n1 = n; \ - n = ntmp; - -#define DECOMPOSE_M \ - int mtmp = m; \ - int m4 = m >> 2; \ - m -= (m4 << 2); \ - int m2 = m >> 1; \ - m -= (m2 << 1); \ - int m1 = m; \ - m = mtmp; - -/* - change memory layout for matrix A (kernel matrix) - memory index from ------> to - 0 1 2 3 0 4 8 12 - 4 5 6 7 1 5 9 13 - 8 9 10 11 2 6 10 14 - 12 13 14 15 3 7 11 15 - 16 17 18 19 16 18 20 22 - 20 21 22 23 17 19 21 23 - 24 25 26 27 24 25 26 27 - - notice: called in the initialization function (csi_c906_conv2d_init) -*/ -void csi_c906_reorder_kernel(float *a, float *sa, int m, int k, int ldx) -{ -#if __riscv_vector == 128 - DECOMPOSE_M - DECOMPOSE_K - /* - Execution delay cycles: vlsw + vsw = 6 + 1 - vlw + vssw = 4 + 2 ✔ - */ - if(m4 > 0) { - float *a0 = a; - float *a1 = a0 + ldx; - float *a2 = a1 + ldx; - float *a3 = a2 + ldx; - int k_tail = k & 7; - int store_stride = 16; - asm volatile( - "slli t3, %10, 2\n\t" // t3 = ldx * 4 - "slli t4, t3, 2\n\t" // t4 = 4 * ldx * 4 - "mv t2, %5\n\t" // t2 = m4 - "slli t0, %7, 2\n\t" // t0 = k_tail * 4 - 
"slli t1, t0, 2\n\t" // t1 = t0 * 4 - - "1:\n\t" - // start packm4 - "mv %0, %9\n\t" // a0 = a - "add %1, %0, t3\n\t" // a1 = a0 + 4 * ldx - "add %2, %1, t3\n\t" // a2 = a1 + 4 * ldx - "add %3, %2, t3\n\t" // a3 = a2 + 4 * ldx - "mv t6, %6\n\t" // t6 = k8 - "beqz t6, 3f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "2:\n\t" - // start subpack_m4k8 - "vlw.v v0, (%0)\n\t" - "addi %0, %0, 32\n\t" - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 32\n\t" - "vlw.v v4, (%2)\n\t" - "addi %2, %2, 32\n\t" - "vlw.v v6, (%3)\n\t" - "addi %3, %3, 32\n\t" - - "vssw.v v0, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v2, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v4, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v6, (%4), %8\n\t" - "addi %4, %4, 116\n\t" // sa += 32 ele * 4 - - "addi t6, t6, -1\n\t" // k8-- - "bnez t6, 2b\n\t" - - "3:\n\t" - "beqz %7, 4f\n\t" // k_tail == 0 ? - // Processing k_tail - "vsetvli zero, %7, e32, m2\n\t" - "vlw.v v0, (%0)\n\t" - "add %0, %0, t0\n\t" - "vlw.v v2, (%1)\n\t" - "add %1, %1, t0\n\t" - "vlw.v v4, (%2)\n\t" - "add %2, %2, t0\n\t" - "vlw.v v6, (%3)\n\t" - "add %3, %3, t0\n\t" - - "vssw.v v0, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v2, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v4, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v6, (%4), %8\n\t" - "addi %4, %4, -12\n\t" - "add %4, %4, t1\n\t" // sa += 4 * k_tail * 4 - - "4:\n\t" - // end packm4 - "add %9, %9, t4\n\t" // a += 4 * ldx * 4 - "addi t2, t2, -1\n\t" // m4-- - "bnez t2, 1b\n\t" - - :"=r"(a0), // %0 - "=r"(a1), // %1 - "=r"(a2), // %2 - "=r"(a3), // %3 - "=r"(sa), // %4 - "=r"(m4), // %5 - "=r"(k8), // %6 - "=r"(k_tail), // %7 - "=r"(store_stride), // %8 - "=r"(a), // %9 - "=r"(ldx) // %10 - :"0"(a0), - "1"(a1), - "2"(a2), - "3"(a3), - "4"(sa), - "5"(m4), - "6"(k8), - "7"(k_tail), - "8"(store_stride), - "9"(a), - "10"(ldx) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "t0", "t1", "t2", "t3", "t4", "t6" - ); - } - if(m2 > 0) { - float *a0 = a; - float *a1 = a0 + 
ldx; - int k8 = k >> 3; - int k_tail = k & 7; - int store_stride = 8; - - asm volatile( - "slli t2, %7, 3\n\t" // t2 = ldx * 2 * 4 - "slli t0, %4, 2\n\t" // t0 = k_tail * 4 - "slli t1, t0, 1\n\t" // t1 = t0 * 2 - "beqz %3, 2f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subpack_m2k8 - "vlw.v v0, (%0)\n\t" - "addi %0, %0, 32\n\t" - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 32\n\t" - - "vssw.v v0, (%2), %5\n\t" - "addi %2, %2, 4\n\t" - "vssw.v v2, (%2), %5\n\t" - "addi %2, %2, -4\n\t" - "addi %2, %2, 64\n\t" // sa += 16 ele * 4 - - "addi %3, %3, -1\n\t" - "bnez %3, 1b\n\t" - - "2:\n\t" - "beqz %4, 3f\n\t" // k_tail == 0 ? - // Processing k_tail - "vsetvli zero, %4, e32, m2\n\t" - "vlw.v v0, (%0)\n\t" - "add %0, %0, t0\n\t" - "vlw.v v2, (%1)\n\t" - "add %1, %1, t0\n\t" - - "vssw.v v0, (%2), %5\n\t" - "addi %2, %2, 4\n\t" - "vssw.v v2, (%2), %5\n\t" - "addi %2, %2, -4\n\t" - "add %2, %2, t1\n\t" // sa += k_tail * 2 * 4 - - "3:\n\t" - // end packm2 - "add %6, %6, t2\n\t" - - :"=r"(a0), // %0 - "=r"(a1), // %1 - "=r"(sa), // %2 - "=r"(k8), // %3 - "=r"(k_tail), // %4 - "=r"(store_stride), // %5 - "=r"(a), // %6 - "=r"(ldx) // %7 - :"0"(a0), - "1"(a1), - "2"(sa), - "3"(k8), - "4"(k_tail), - "5"(store_stride), - "6"(a), - "7"(ldx) - :"v0", "v1", "v2", "v3", "t0", "t1", "t2" - ); - } - if(m1 > 0) { - memcpy(sa, a, sizeof(float) * ldx); - } -#else - int i = 0; - for(; i + 3 < m; i += 4) { - float *p0 = a; - float *p1 = a + ldx; - float *p2 = a + 2 * ldx; - float *p3 = a + 3 * ldx; - int j = 0; - for(; j + 7 < k; j += 8) { - sa[0] = p0[0]; sa[16] = p0[4]; - sa[1] = p1[0]; sa[17] = p1[4]; - sa[2] = p2[0]; sa[18] = p2[4]; - sa[3] = p3[0]; sa[19] = p3[4]; - - sa[4] = p0[1]; sa[20] = p0[5]; - sa[5] = p1[1]; sa[21] = p1[5]; - sa[6] = p2[1]; sa[22] = p2[5]; - sa[7] = p3[1]; sa[23] = p3[5]; - - sa[8] = p0[2]; sa[24] = p0[6]; - sa[9] = p1[2]; sa[25] = p1[6]; - sa[10] = p2[2]; sa[26] = p2[6]; - sa[11] = p3[2]; sa[27] = p3[6]; - - sa[12] = p0[3]; sa[28] = 
p0[7]; - sa[13] = p1[3]; sa[29] = p1[7]; - sa[14] = p2[3]; sa[30] = p2[7]; - sa[15] = p3[3]; sa[31] = p3[7]; - - sa += 32; - p0 += 8; - p1 += 8; - p2 += 8; - p3 += 8; - - } - if(j + 3 < k) { - j += 4; - sa[0] = p0[0]; sa[8] = p0[2]; - sa[1] = p1[0]; sa[9] = p1[2]; - sa[2] = p2[0]; sa[10] = p2[2]; - sa[3] = p3[0]; sa[11] = p3[2]; - - sa[4] = p0[1]; sa[12] = p0[3]; - sa[5] = p1[1]; sa[13] = p1[3]; - sa[6] = p2[1]; sa[14] = p2[3]; - sa[7] = p3[1]; sa[15] = p3[3]; - - sa += 16; - p0 += 4; - p1 += 4; - p2 += 4; - p3 += 4; - } - if(j + 1 < k) { - j += 2; - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p2[0]; - sa[3] = p3[0]; - - sa[4] = p0[1]; - sa[5] = p1[1]; - sa[6] = p2[1]; - sa[7] = p3[1]; - - sa += 8; - p0 += 2; - p1 += 2; - p2 += 2; - p3 += 2; - } - if(j < k) { - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p2[0]; - sa[3] = p3[0]; - - sa += 4; - } - a += 4 * ldx; - } - if(i + 1 < m) { - i += 2; - float *p0 = a; - float *p1 = a + ldx; - - int j = 0; - for(; j + 7 < k; j += 8) { - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p0[1]; - sa[3] = p1[1]; - sa[4] = p0[2]; - sa[5] = p1[2]; - sa[6] = p0[3]; - sa[7] = p1[3]; - sa[8] = p0[4]; - sa[9] = p1[4]; - sa[10] = p0[5]; - sa[11] = p1[5]; - sa[12] = p0[6]; - sa[13] = p1[6]; - sa[14] = p0[7]; - sa[15] = p1[7]; - - sa += 16; - p0 += 8; - p1 += 8; - } - if(j + 3 < k) { - j += 4; - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p0[1]; - sa[3] = p1[1]; - sa[4] = p0[2]; - sa[5] = p1[2]; - sa[6] = p0[3]; - sa[7] = p1[3]; - - sa += 8; - p0 += 4; - p1 += 4; - } - if(j + 1 < k) { - j += 2; - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p0[1]; - sa[3] = p1[1]; - - sa += 4; - p0 += 2; - p1 += 2; - } - if(j < k) { - sa[0] = p0[0]; - sa[1] = p1[0]; - - sa += 2; - } - a += 2 * ldx; - } - if(i < m) { - memcpy(sa, a, sizeof(float) * ldx); - } -#endif // __riscv_vector -} - -void csi_c906_reorder_input(float *b, float *sb, int k, int n, int ldx) -{ - -#if __riscv_vector == 128 - DECOMPOSE_N - DECOMPOSE_K - if(n4 > 0) { - float *b0 = b; - float *b1 = b0 + 1; 
- float *b2 = b1 + 1; - float *b3 = b2 + 1; - int k_tail = k & 7; - int load_stride = 4 * ldx; - int store_stride = 16; - asm volatile( - "slli t0, %11, 5\n\t" // t0 = 8 * ldx * 4 - "slli t1, %7, 4\n\t" // t1 = 4 * k_tail * 4 - - "1:\n\t" - // start packn4 - "mv %0, %10\n\t" // b0 = b - "addi %1, %0, 4\n\t" // b1 = b0 + 1 - "addi %2, %1, 4\n\t" // b2 = b1 + 1 - "addi %3, %2, 4\n\t" // b3 = b2 + 1 - "mv t6, %6\n\t" // t6 = k8 - "beqz t6, 3f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "2:\n\t" - // start subpack_n4k8 - "vlsw.v v0, (%0), %8\n\t" - "vlsw.v v2, (%1), %8\n\t" - "vlsw.v v4, (%2), %8\n\t" - "vlsw.v v6, (%3), %8\n\t" - "add %0, %0, t0\n\t" - "add %1, %1, t0\n\t" - "add %2, %2, t0\n\t" - "add %3, %3, t0\n\t" - - "vssw.v v0, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v2, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v4, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v6, (%4), %9\n\t" - "addi %4, %4, -12\n\t" - "addi %4, %4, 128\n\t" // sb += 32 * 4 - - "addi t6, t6, -1\n\t" // k8-- - "bnez t6, 2b\n\t" - - "3:\n\t" - "beqz %7, 4f\n\t" // k_tail == 0 ? 
- // Processing k_tail - "vsetvli zero, %7, e32, m2\n\t" - "vlsw.v v0, (%0), %8\n\t" - "vlsw.v v2, (%1), %8\n\t" - "vlsw.v v4, (%2), %8\n\t" - "vlsw.v v6, (%3), %8\n\t" - - "vssw.v v0, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v2, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v4, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v6, (%4), %9\n\t" - "addi %4, %4, -12\n\t" - "add %4, %4, t1\n\t" // sb += k_tail * 4 * 4 - - "4:\n\t" - // end packn4 - "addi %10, %10, 16\n\t" // b += 4 * 4 - "addi %5, %5, -1\n\t" // n4-- - "bnez %5, 1b\n\t" - - :"=r"(b0), // %0 - "=r"(b1), // %1 - "=r"(b2), // %2 - "=r"(b3), // %3 - "=r"(sb), // %4 - "=r"(n4), // %5 - "=r"(k8), // %6 - "=r"(k_tail), // %7 - "=r"(load_stride), // %8 - "=r"(store_stride), // %9 - "=r"(b), // %10 - "=r"(ldx) // %11 - :"0"(b0), - "1"(b1), - "2"(b2), - "3"(b3), - "4"(sb), - "5"(n4), - "6"(k8), - "7"(k_tail), - "8"(load_stride), - "9"(store_stride), - "10"(b), - "11"(ldx) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "t0", "t1", "t6" - ); - } - int n_tail = n & 3; - if(n_tail > 0) { - float *b0 = b; - int k_tail = k & 7; - int load_stride = 4 * ldx; - asm volatile( - "slli t0, %7, 5\n\t" // t0 = 8 * ldx * 4 - "slli t1, %4, 2\n\t" // t1 = k_tail * 4 - - "1:\n\t" - // pack remain n_tail cols one by one - "mv %0, %6\n\t" // b0 = b - "mv t3, %3\n\t" // t3 = k8 - "beqz t3, 3f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "2:\n\t" - // start subpack_n1k8 - "vlsw.v v0, (%0), %5\n\t" - "add %0, %0, t0\n\t" - "vsw.v v0, (%1)\n\t" - "addi %1, %1, 32\n\t" // sb += 8 * 4 - - "addi t3, t3, -1\n\t" // k8-- - "bnez t3, 2b\n\t" - - "3:\n\t" - "beqz %4, 4f\n\t" // k_tail == 0 ? 
- // Processing k_tail - "vsetvli zero, %4, e32, m2\n\t" - "vlsw.v v0, (%0), %5\n\t" - "vsw.v v0, (%1)\n\t" - "add %1, %1, t1\n\t" - - "4:\n\t" - // end packn1 - "addi %6, %6, 4\n\t" // b += 1 * 4 - "addi %2, %2, -1\n\t" - "bnez %2, 1b\n\t" - - :"=r"(b0), // %0 - "=r"(sb), // %1 - "=r"(n_tail), // %2 - "=r"(k8), // %3 - "=r"(k_tail), // %4 - "=r"(load_stride), // %5 - "=r"(b), // %6 - "=r"(ldx) // %7 - :"0"(b0), - "1"(sb), - "2"(n_tail), - "3"(k8), - "4"(k_tail), - "5"(load_stride), - "6"(b), - "7"(ldx) - :"v0", "v1", "t0", "t1", "t3" - ); - } -#else - int i = 0; - for(; i + 3 < n; i += 4) { - const float* p0 = b + i; - const float* p1 = b + 1 * ldx + i; - const float* p2 = b + 2 * ldx + i; - const float* p3 = b + 3 * ldx + i; - - const float* p4 = b + 4 * ldx + i; - const float* p5 = b + 5 * ldx + i; - const float* p6 = b + 6 * ldx + i; - const float* p7 = b + 7 * ldx + i; - - int j = 0; - for(; j + 7 < k; j += 8) { - sb[0] = p0[0]; sb[4] = p1[0]; - sb[1] = p0[1]; sb[5] = p1[1]; - sb[2] = p0[2]; sb[6] = p1[2]; - sb[3] = p0[3]; sb[7] = p1[3]; - - sb[8] = p2[0]; sb[12] = p3[0]; - sb[9] = p2[1]; sb[13] = p3[1]; - sb[10] = p2[2]; sb[14] = p3[2]; - sb[11] = p2[3]; sb[15] = p3[3]; - - sb[16] = p4[0]; sb[20] = p5[0]; - sb[17] = p4[1]; sb[21] = p5[1]; - sb[18] = p4[2]; sb[22] = p5[2]; - sb[19] = p4[3]; sb[23] = p5[3]; - - sb[24] = p6[0]; sb[28] = p7[0]; - sb[25] = p6[1]; sb[29] = p7[1]; - sb[26] = p6[2]; sb[30] = p7[2]; - sb[27] = p6[3]; sb[31] = p7[3]; - - sb += 32; - p0 += 8 * ldx; - p1 += 8 * ldx; - p2 += 8 * ldx; - p3 += 8 * ldx; - p4 += 8 * ldx; - p5 += 8 * ldx; - p6 += 8 * ldx; - p7 += 8 * ldx; - } - if(j + 3 < k) { - j += 4; - sb[0] = p0[0]; - sb[1] = p0[1]; - sb[2] = p0[2]; - sb[3] = p0[3]; - - sb[4] = p1[0]; - sb[5] = p1[1]; - sb[6] = p1[2]; - sb[7] = p1[3]; - - sb[8] = p2[0]; - sb[9] = p2[1]; - sb[10] = p2[2]; - sb[11] = p2[3]; - - sb[12] = p3[0]; - sb[13] = p3[1]; - sb[14] = p3[2]; - sb[15] = p3[3]; - - sb += 16; - p0 += 4 * ldx; - p1 += 4 * ldx; - p2 += 4 * 
ldx; - p3 += 4 * ldx; - } - if(j + 1 < k) { - j += 2; - sb[0] = p0[0]; - sb[1] = p0[1]; - sb[2] = p0[2]; - sb[3] = p0[3]; - - sb[4] = p1[0]; - sb[5] = p1[1]; - sb[6] = p1[2]; - sb[7] = p1[3]; - - sb += 8; - p0 += 2 * ldx; - p1 += 2 * ldx; - } - if(j < k) { - sb[0] = p0[0]; - sb[1] = p0[1]; - sb[2] = p0[2]; - sb[3] = p0[3]; - - sb += 4; - p0 += ldx; - } - } - while(i < n) - { - const float *p = b + i; - for(int j = 0; j < k; j++) { - *sb = *p; - sb ++; - p += ldx; - } - i++; - } - -#endif // __riscv_vector -} - - -void csi_c906_reorder_input_1(float *b, float *sb, int k, int n, int ldx) -{ - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" // set vl = 8 - - "slli t2, %4, 2\n\t" // t2 = ldx * 4 (line stride) - - "srai t0, %3, 2\n\t" // t0 = n4 - "beqz t0, 3f\n\t" // jump to packn_tail - - "1:\n\t" // n4 - "mv a0, %0\n\t" - "addi %0, %0, 16\n\t" - "mv t1, %2\n\t" // k - - "2:\n\t" - // start packn8k1 - "vle.v v2, (a0)\n\t" - "add a0, a0, t2\n\t" - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" - - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "3:\n\t" // n_tail - "andi t0, %3, 3\n\t" // n & 3u - "beqz t0, 8f\n\t" - - "srai t3, %2, 2\n\t" // k4 - "slli t5, %4, 4\n\t" // t5 = ldx * 4 * 4 (4 lines) - "andi t6, %2, 3\n\t" // k_tail - "slli t4, t6, 2\n\t" // k_tail * 4 - - "4:\n\t" - "mv a0, %0\n\t" - "addi %0, %0, 4\n\t" - "mv t1, t3\n\t" // t1 = k4 - "beqz t3, 6f\n\t" - - "5:\n\t" - "vsetvli zero, zero, e32, m1\n\t" - "vlse.v v2, (a0), t2\n\t" - "add a0, a0, t5\n\t" - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" - - "addi t1, t1, -1\n\t" - "bnez t1, 5b\n\t" - - "6:\n\t" - "vsetvli zero, t6, e32, m1\n\t" - "vlse.v v2, (a0), t2\n\t" - "vse.v v2, (%1)\n\t" - "add %1, %1, t4\n\t" - - "7:\n\t" - "addi t0, t0, -1\n\t" - "bnez t0, 4b\n\t" - - - "8:\n\t" // ending - - - :"=r"(b), // %0 - "=r"(sb), // %1 - "=r"(k), // %2 - "=r"(n), // %3 - "=r"(ldx) // %4 - :"0"(b), - "1"(sb), - "2"(k), - "3"(n), - "4"(ldx) - :"v0", "v2", "a0", - "t0", 
"t1", "t2", "t3", "t4", "t5", "t6" - ); -} - -static inline void kernel_m1_f32(float* dst, float* sa, float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - float *pa = sa; - float *pb = sb; - float *pc = dst; - DECOMPOSE_K - DECOMPOSE_N - -#if __riscv_vector == 128 - if(n4 > 0) { - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - "flw ft0, (%8)\n\t" // bias - - "beqz %9, 1f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "1:\n\t" - // start kernel_m1n4 - "vfmv.v.f v24, ft0\n\t" // v24[0..3] = *bias - // "vlw.v v24, (%8)\n\t" // v24[0..3] = bias[0..3] - // "addi %8, %8, 16\n\t" - - "mv a1, %0\n\t" // a1 = pa - "mv t0, %3\n\t" // t0 = k8 - "beqz t0, 3f\n\t" // k8 == 0 ? - - "2:\n\t" - // start subkernel_m1n4k8 - "vlw.v v1, (%1)\n\t" // load pb - "flw ft1, 0(a1)\n\t" // load pa - "vfmv.v.f v2, ft1\n\t" - "addi %1, %1, 16\n\t" // pb += 4 * 4 - "vfmacc.vv v24, v1, v2\n\t" // 0 - - "vlw.v v3, (%1)\n\t" - "flw ft2, 4(a1)\n\t" - "vfmv.v.f v4, ft2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v3, v4\n\t" // 1 - - "vlw.v v5, (%1)\n\t" - "flw ft3, 8(a1)\n\t" - "vfmv.v.f v6, ft3\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v5, v6\n\t" // 2 - - "vlw.v v7, (%1)\n\t" - "flw ft4, 12(a1)\n\t" - "vfmv.v.f v8, ft4\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v7, v8\n\t" // 3 - - "vlw.v v9, (%1)\n\t" - "flw ft5, 16(a1)\n\t" - "vfmv.v.f v10, ft5\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v9, v10\n\t" // 4 - - "vlw.v v11, (%1)\n\t" - "flw ft6, 20(a1)\n\t" - "vfmv.v.f v12, ft6\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v11, v12\n\t" // 5 - - "vlw.v v13, (%1)\n\t" - "flw ft7, 24(a1)\n\t" - "vfmv.v.f v14, ft7\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v13, v14\n\t" // 6 - - "vlw.v v15, (%1)\n\t" - "flw ft8, 28(a1)\n\t" - "vfmv.v.f v16, ft8\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v15, v16\n\t" // 7 - "addi a1, a1, 32\n\t" - - "addi t0, t0, -1\n\t" - "bnez t0, 2b\n\t" - - "3:\n\t" - "beqz %4, 4f\n\t" 
// k4 == 0 ? - // start subkernel_m1n4k4 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - - "vlw.v v3, (%1)\n\t" - "flw ft2, 4(a1)\n\t" - "vfmv.v.f v4, ft2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v3, v4\n\t" // 1 - - "vlw.v v5, (%1)\n\t" - "flw ft3, 8(a1)\n\t" - "vfmv.v.f v6, ft3\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v5, v6\n\t" // 2 - - "vlw.v v7, (%1)\n\t" - "flw ft4, 12(a1)\n\t" - "vfmv.v.f v8, ft4\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v7, v8\n\t" // 3 - "addi a1, a1, 16\n\t" - - "4:\n\t" - "beqz %5, 5f\n\t" // k2 == 0 ? - // start subkernel_m1n4k2 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - - "vlw.v v3, (%1)\n\t" - "flw ft2, 4(a1)\n\t" - "vfmv.v.f v4, ft2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v3, v4\n\t" // 1 - "addi a1, a1, 8\n\t" - - "5:\n\t" - "beqz %6, 6f\n\t" // k1 == 0 ? - // start subkernel_m1n4k1 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "addi a1, a1, 4\n\t" - - "6:\n\t" - "beqz %9, 7f\n\t" - // fused relu - "vfmax.vv v24, v24, v0\n\t" // **** relu **** - - "7:\n\t" - // end kernel_m1n4 - "vsw.v v24, (%2)\n\t" - "addi %2, %2, 16\n\t" // pc += 4 * 4 - - "addi %7, %7, -1\n\t" - "bnez %7, 1b\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc), // %2 - "=r"(k8), // %3 - "=r"(k4), // %4 - "=r"(k2), // %5 - "=r"(k1), // %6 - "=r"(n4), // %7 - "=r"(bias), // %8 - "=r"(fuse_relu) // %9 - :"0"(pa), - "1"(pb), - "2"(pc), - "3"(k8), - "4"(k4), - "5"(k2), - "6"(k1), - "7"(n4), - "8"(bias), - "9"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", - "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8" - ); - } - if(n2 > 0) { - int k_tail = k & 7; - float *pb0 = pb; - float 
*pb1 = pb0 + k; - - asm volatile( - "fmv.w.x ft4, zero\n\t" // for fuse relu - "mv t4, %4\n\t" // t4 = k8 - "vsetvli zero, zero, e32, m2\n\t" - "vxor.vv v6, v6, v6\n\t" // clear - "vxor.vv v8, v8, v8\n\t" // clear - "flw ft0, 0(%6)\n\t" // ft0 = *bias - // "flw ft3, 4(%6)\n\t" // ft3 = *(bias + 1) - // "addi %6, %6, 8\n\t" - "vfmv.s.f v10, ft0\n\t" // v10[0] = ft0 - "vfmv.s.f v12, ft0\n\t" // v10[0] = ft0 - // "vfmv.s.f v12, ft3\n\t" // v12[0] = ft3 - - "beqz %5, 1f\n\t" // k_tail == 0 ? - // Processing k_tail - "slli t0, %5, 2\n\t" // t0 = k_tail * 4 - "vsetvli zero, %5, e32, m2\n\t" - "vlw.v v0, (%0)\n\t" - "add %0, %0, t0\n\t" - "vlw.v v2, (%1)\n\t" - "add %1, %1, t0\n\t" - "vlw.v v4, (%2)\n\t" - "add %2, %2, t0\n\t" - "vfmacc.vv v6, v0, v2\n\t" - "vfmacc.vv v8, v0, v4\n\t" - "beqz t4, 2f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subkernel_m1n2k8 - "vlw.v v0, (%0)\n\t" - "addi %0, %0, 32\n\t" - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 32\n\t" - "vlw.v v4, (%2)\n\t" - "addi %2, %2, 32\n\t" - "vfmacc.vv v6, v0, v2\n\t" - "vfmacc.vv v8, v0, v4\n\t" - "addi t4, t4, -1\n\t" - "bnez t4, 1b\n\t" - - "2:\n\t" - // end kernel_m1n2 - "vfredsum.vs v10, v6, v10\n\t" // v10[0] = v10[0] + sum(v6[0..i]) - "vfredsum.vs v12, v8, v12\n\t" // v12[0] = v12[0] + sum(v8[0..i]) - "vfmv.f.s ft1, v10\n\t" - "vfmv.f.s ft2, v12\n\t" - - "beqz %7, 3f\n\t" - // fuse relu - "fmax.s ft1, ft1, ft4\n\t" // **** relu **** - "fmax.s ft2, ft2, ft4\n\t" // **** relu **** - - "3:\n\t" - - "fsw ft1, 0(%3)\n\t" - "fsw ft2, 4(%3)\n\t" - - :"=r"(pa), // %0 - "=r"(pb0), // %1 - "=r"(pb1), // %2 - "=r"(pc), // %3 - "=r"(k8), // %4 - "=r"(k_tail), // %5 - "=r"(bias), // %6 - "=r"(fuse_relu) // %7 - :"0"(pa), - "1"(pb0), - "2"(pb1), - "3"(pc), - "4"(k8), - "5"(k_tail), - "6"(bias), - "7"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", - "ft0", "ft1", "ft2", "ft3", "ft4", "t0", "t4" - ); - pb += 2 * k; - pc += 2; - } - 
if(n1 > 0) { - pa = sa; - int k_tail = k & 7; - asm volatile( - "fmv.w.x ft2, zero\n\t" // for fuse relu - "vsetvli zero, zero, e32, m2\n\t" - "vxor.vv v4, v4, v4\n\t" // clear - - "flw ft0, 0(%5)\n\t" // ft0 = *bias - "vfmv.s.f v6, ft0\n\t" // v6[0] = ft0 - - "beqz %4, 1f\n\t" // k_tail == 0 ? - // Processing k_tail - "slli t0, %4, 2\n\t" // t0 = k_tail * 4 - "vsetvli zero, %4, e32, m2\n\t" - "vlw.v v0, (%0)\n\t" - "add %0, %0, t0\n\t" - "vlw.v v2, (%1)\n\t" - "add %1, %1, t0\n\t" - "vfmacc.vv v4, v0, v2\n\t" - "beqz %3, 2f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subkernel_m1n1k8 - "vlw.v v0, (%0)\n\t" - "addi %0, %0, 32\n\t" - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 32\n\t" - "vfmacc.vv v4, v0, v2\n\t" - "addi %3, %3, -1\n\t" - "bnez %3, 1b\n\t" - - "2:\n\t" - // end kernel_m1n1 - "vfredsum.vs v6, v4, v6\n\t" // v6[0] = v6[0] + sum(v4[0..i]) - "vfmv.f.s ft1, v6\n\t" - - "beqz %6, 3f\n\t" - // fused relu - "fmax.s ft1, ft1, ft2\n\t" // **** relu **** - - "3:\n\t" - "fsw ft1, 0(%2)\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc), // %2 - "=r"(k8), // %3 - "=r"(k_tail), // %4 - "=r"(bias), // %5 - "=r"(fuse_relu) // %6 - :"0"(pa), - "1"(pb), - "2"(pc), - "3"(k8), - "4"(k_tail), - "5"(bias), - "6"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ft0", "ft1", "ft2", "t0" - ); - } -#else - for(int i = 0; i < n4; i++) { - int j = 0; - pa = sa; - pc[0] = pc[1] = pc[2] = pc[3] = *bias; - for(; j + 7 < k; j += 8) { - pc[0] += pa[0] * pb[0]; - pc[1] += pa[0] * pb[1]; - pc[2] += pa[0] * pb[2]; - pc[3] += pa[0] * pb[3]; - - pc[0] += pa[1] * pb[4]; - pc[1] += pa[1] * pb[5]; - pc[2] += pa[1] * pb[6]; - pc[3] += pa[1] * pb[7]; - - pc[0] += pa[2] * pb[8]; - pc[1] += pa[2] * pb[9]; - pc[2] += pa[2] * pb[10]; - pc[3] += pa[2] * pb[11]; - - pc[0] += pa[3] * pb[12]; - pc[1] += pa[3] * pb[13]; - pc[2] += pa[3] * pb[14]; - pc[3] += pa[3] * pb[15]; - - pc[0] += pa[4] * pb[16]; - pc[1] += pa[4] * pb[17]; - pc[2] += pa[4] * pb[18]; - 
pc[3] += pa[4] * pb[19]; - - pc[0] += pa[5] * pb[20]; - pc[1] += pa[5] * pb[21]; - pc[2] += pa[5] * pb[22]; - pc[3] += pa[5] * pb[23]; - - pc[0] += pa[6] * pb[24]; - pc[1] += pa[6] * pb[25]; - pc[2] += pa[6] * pb[26]; - pc[3] += pa[6] * pb[27]; - - pc[0] += pa[7] * pb[28]; - pc[1] += pa[7] * pb[29]; - pc[2] += pa[7] * pb[30]; - pc[3] += pa[7] * pb[31]; - - pa += 8; - pb += 32; - } - if(j + 3 < k) { - j += 4; - pc[0] += pa[0] * pb[0]; - pc[1] += pa[0] * pb[1]; - pc[2] += pa[0] * pb[2]; - pc[3] += pa[0] * pb[3]; - - pc[0] += pa[1] * pb[4]; - pc[1] += pa[1] * pb[5]; - pc[2] += pa[1] * pb[6]; - pc[3] += pa[1] * pb[7]; - - pc[0] += pa[2] * pb[8]; - pc[1] += pa[2] * pb[9]; - pc[2] += pa[2] * pb[10]; - pc[3] += pa[2] * pb[11]; - - pc[0] += pa[3] * pb[12]; - pc[1] += pa[3] * pb[13]; - pc[2] += pa[3] * pb[14]; - pc[3] += pa[3] * pb[15]; - - pa += 4; - pb += 16; - } - if(j + 1 < k) { - j += 2; - pc[0] += pa[0] * pb[0]; - pc[1] += pa[0] * pb[1]; - pc[2] += pa[0] * pb[2]; - pc[3] += pa[0] * pb[3]; - - pc[0] += pa[1] * pb[4]; - pc[1] += pa[1] * pb[5]; - pc[2] += pa[1] * pb[6]; - pc[3] += pa[1] * pb[7]; - - pa += 2; - pb += 8; - } - if(j < k) { - pc[0] += pa[0] * pb[0]; - pc[1] += pa[0] * pb[1]; - pc[2] += pa[0] * pb[2]; - pc[3] += pa[0] * pb[3]; - - pa += 1; - pb += 4; - } - if (fuse_relu) { - pc[0] = pc[0] > 0 ? pc[0] : 0; - pc[1] = pc[1] > 0 ? pc[1] : 0; - pc[2] = pc[2] > 0 ? pc[2] : 0; - pc[3] = pc[3] > 0 ? 
pc[3] : 0; - } - pc += 4; - } - if(n2 > 0) { - pa = sa; - pc[0] = pc[1] = *bias; - float *pb0 = pb; - float *pb1 = pb0 + k; - int j = 0; - for(; j + 7 < k; j += 8) { - pc[0] += pa[0] * pb0[0]; - pc[1] += pa[0] * pb1[0]; - - pc[0] += pa[1] * pb0[1]; - pc[1] += pa[1] * pb1[1]; - - pc[0] += pa[2] * pb0[2]; - pc[1] += pa[2] * pb1[2]; - - pc[0] += pa[3] * pb0[3]; - pc[1] += pa[3] * pb1[3]; - - pc[0] += pa[4] * pb0[4]; - pc[1] += pa[4] * pb1[4]; - - pc[0] += pa[5] * pb0[5]; - pc[1] += pa[5] * pb1[5]; - - pc[0] += pa[6] * pb0[6]; - pc[1] += pa[6] * pb1[6]; - - pc[0] += pa[7] * pb0[7]; - pc[1] += pa[7] * pb1[7]; - - pa += 8; - pb0 += 8; - pb1 += 8; - } - if(j + 3 < k) { - j += 4; - pc[0] += pa[0] * pb0[0]; - pc[1] += pa[0] * pb1[0]; - - pc[0] += pa[1] * pb0[1]; - pc[1] += pa[1] * pb1[1]; - - pc[0] += pa[2] * pb0[2]; - pc[1] += pa[2] * pb1[2]; - - pc[0] += pa[3] * pb0[3]; - pc[1] += pa[3] * pb1[3]; - - pa += 4; - pb0 += 4; - pb1 += 4; - } - if(j + 1 < k) { - j += 2; - pc[0] += pa[0] * pb0[0]; - pc[1] += pa[0] * pb1[0]; - - pc[0] += pa[1] * pb0[1]; - pc[1] += pa[1] * pb1[1]; - - pa += 2; - pb0 += 2; - pb1 += 2; - } - if(j < k) { - pc[0] += pa[0] * pb0[0]; - pc[1] += pa[0] * pb1[0]; - - pa += 1; - pb0 += 1; - pb1 += 1; - } - if (fuse_relu) { - pc[0] = pc[0] > 0 ? pc[0] : 0; - pc[1] = pc[1] > 0 ? 
pc[1] : 0; - } - pc += 2; - pb += 2 * k; - } - if(n1 > 0) { - pa = sa; - pc[0] = *bias; - int j = 0; - for(; j + 7 < k; j += 8) { - pc[0] += pa[0] * pb[0]; - pc[0] += pa[1] * pb[1]; - pc[0] += pa[2] * pb[2]; - pc[0] += pa[3] * pb[3]; - pc[0] += pa[4] * pb[4]; - pc[0] += pa[5] * pb[5]; - pc[0] += pa[6] * pb[6]; - pc[0] += pa[7] * pb[7]; - - pa += 8; - pb += 8; - } - if(j + 3 < k) { - j += 4; - pc[0] += pa[0] * pb[0]; - pc[0] += pa[1] * pb[1]; - pc[0] += pa[2] * pb[2]; - pc[0] += pa[3] * pb[3]; - - pa += 4; - pb += 4; - } - if(j + 1 < k) { - j += 2; - pc[0] += pa[0] * pb[0]; - pc[0] += pa[1] * pb[1]; - - pa += 2; - pb += 2; - } - if(j < k) { - pc[0] += pa[0] * pb[0]; - - pa += 1; - pb += 1; - } - if (fuse_relu) { - pc[0] = pc[0] > 0 ? pc[0] : 0; - } - pc += 1; - } -#endif // __riscv_vector -} - -static inline void kernel_m2_f32(float* dst, float* sa, float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - float *pa = sa; - float *pb = sb; - float *pc0 = dst; - float *pc1 = pc0 + ldc; - DECOMPOSE_K - DECOMPOSE_N -#if __riscv_vector == 128 - if(n4 > 0) { - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - "flw ft0, (%9)\n\t" // ft0 = *bias - "flw ft10, 4(%9)\n\t" // ft1 = *(bias + 1) - - "beqz %10, 1f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "1:\n\t" // n4 - // start kernel_m2n4 - "vfmv.v.f v24, ft0\n\t" // v24[0..3] = ft0 = *bias - "vfmv.v.f v25, ft10\n\t" // v25[0..3] = ft10 = *(bias + 1) - // "vlw.v v24, (%9)\n\t" // v24[0..3] = bias[0..3] - // "vlw.v v25, (%9)\n\t" // v24[0..3] = bias[0..3] - // "addi %9, %9, 16\n\t" - - "mv a1, %0\n\t" // a1 = pa - "mv t0, %4\n\t" // t0 = k8 - "beqz t0, 3f\n\t" // k8 == 0 ? 
- - "2:\n\t" - // start subkernel_m2n4k8 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "flw fa1, 4(a1)\n\t" - "vfmv.v.f v3, fa1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "vfmacc.vv v25, v1, v3\n\t" - - "vlw.v v4, (%1)\n\t" - "flw ft2, 8(a1)\n\t" - "vfmv.v.f v5, ft2\n\t" - "flw fa2, 12(a1)\n\t" - "vfmv.v.f v6, fa2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v4, v5\n\t" // 1 - "vfmacc.vv v25, v4, v6\n\t" - - "vlw.v v7, (%1)\n\t" - "flw ft3, 16(a1)\n\t" - "vfmv.v.f v8, ft3\n\t" - "flw fa3, 20(a1)\n\t" - "vfmv.v.f v9, fa3\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v7, v8\n\t" // 2 - "vfmacc.vv v25, v7, v9\n\t" - - "vlw.v v10, (%1)\n\t" - "flw ft4, 24(a1)\n\t" - "vfmv.v.f v11, ft4\n\t" - "flw fa4, 28(a1)\n\t" - "vfmv.v.f v12, fa4\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v10, v11\n\t" // 3 - "vfmacc.vv v25, v10, v12\n\t" - - "vlw.v v13, (%1)\n\t" - "flw ft5, 32(a1)\n\t" - "vfmv.v.f v14, ft5\n\t" - "flw fa5, 36(a1)\n\t" - "vfmv.v.f v15, fa5\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v13, v14\n\t" // 4 - "vfmacc.vv v25, v13, v15\n\t" - - "vlw.v v16, (%1)\n\t" - "flw ft6, 40(a1)\n\t" - "vfmv.v.f v17, ft6\n\t" - "flw fa6, 44(a1)\n\t" - "vfmv.v.f v18, fa6\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v16, v17\n\t" // 5 - "vfmacc.vv v25, v16, v18\n\t" - - "vlw.v v19, (%1)\n\t" - "flw ft7, 48(a1)\n\t" - "vfmv.v.f v20, ft7\n\t" - "flw fa7, 52(a1)\n\t" - "vfmv.v.f v21, fa7\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v19, v20\n\t" // 6 - "vfmacc.vv v25, v19, v21\n\t" - - "vlw.v v28, (%1)\n\t" - "flw ft8, 56(a1)\n\t" - "vfmv.v.f v29, ft8\n\t" - "flw fa0, 60(a1)\n\t" - "vfmv.v.f v30, fa0\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v28, v29\n\t" // 7 - "vfmacc.vv v25, v28, v30\n\t" - "addi a1, a1, 64\n\t" - - "addi t0, t0, -1\n\t" - "bnez t0, 2b\n\t" - - "3:\n\t" - "beqz %5, 4f\n\t" // k4 == 0 ? 
- // start subkernel_m2n4k4 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "flw fa1, 4(a1)\n\t" - "vfmv.v.f v3, fa1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "vfmacc.vv v25, v1, v3\n\t" - - "vlw.v v4, (%1)\n\t" - "flw ft2, 8(a1)\n\t" - "vfmv.v.f v5, ft2\n\t" - "flw fa2, 12(a1)\n\t" - "vfmv.v.f v6, fa2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v4, v5\n\t" // 1 - "vfmacc.vv v25, v4, v6\n\t" - - "vlw.v v7, (%1)\n\t" - "flw ft3, 16(a1)\n\t" - "vfmv.v.f v8, ft3\n\t" - "flw fa3, 20(a1)\n\t" - "vfmv.v.f v9, fa3\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v7, v8\n\t" // 2 - "vfmacc.vv v25, v7, v9\n\t" - - "vlw.v v10, (%1)\n\t" - "flw ft4, 24(a1)\n\t" - "vfmv.v.f v11, ft4\n\t" - "flw fa4, 28(a1)\n\t" - "vfmv.v.f v12, fa4\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v10, v11\n\t" // 3 - "vfmacc.vv v25, v10, v12\n\t" - "addi a1, a1, 32\n\t" - - "4:\n\t" - "beqz %6, 5f\n\t" // k2 == 0 ? - // start subkernel_m2n4k2 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "flw fa1, 4(a1)\n\t" - "vfmv.v.f v3, fa1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "vfmacc.vv v25, v1, v3\n\t" - - "vlw.v v4, (%1)\n\t" - "flw ft2, 8(a1)\n\t" - "vfmv.v.f v5, ft2\n\t" - "flw fa2, 12(a1)\n\t" - "vfmv.v.f v6, fa2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v4, v5\n\t" // 1 - "vfmacc.vv v25, v4, v6\n\t" - "addi a1, a1, 16\n\t" - - - "5:\n\t" - "beqz %7, 6f\n\t" // k1 == 0 ? 
- // start subkernel_m2n4k1 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "flw fa1, 4(a1)\n\t" - "vfmv.v.f v3, fa1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "vfmacc.vv v25, v1, v3\n\t" - "addi a1, a1, 8\n\t" - - "6:\n\t" - "beqz %10, 7f\n\t" - // fused relu - "vfmax.vv v25, v25, v0\n\t" // **** relu **** - "vfmax.vv v25, v25, v0\n\t" // **** relu **** - - "7:\n\t" - // end kernel_m2n4 - "vsw.v v24, (%2)\n\t" // pc0[0..3] = v24 - "addi %2, %2, 16\n\t" - "vsw.v v25, (%3)\n\t" // pc1[0..3] = v25 - "addi %3, %3, 16\n\t" - - "addi %8, %8, -1\n\t" - "bnez %8, 1b\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc0), // %2 - "=r"(pc1), // %3 - "=r"(k8), // %4 - "=r"(k4), // %5 - "=r"(k2), // %6 - "=r"(k1), // %7 - "=r"(n4), // %8 - "=r"(bias), // %9 - "=r"(fuse_relu) // %10 - :"0"(pa), - "1"(pb), - "2"(pc0), - "3"(pc1), - "4"(k8), - "5"(k4), - "6"(k2), - "7"(k1), - "8"(n4), - "9"(bias), - "10"(fuse_relu) - : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v24", "v25", "v28", "v29", "v30", - "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7" - ); - } - if(n2 > 0) { - int k_tail = k & 7; - float *pa0 = sa; - float *pa1 = pa0 + 1; - float *pb0 = pb; - float *pb1 = pb0 + k; - int load_stride = 8; - - asm volatile( - "fmv.w.x ft6, zero\n\t" // for fuse relu - "mv t6, %6\n\t" // t6 = k8 - "vsetvli zero, zero, e32, m2\n\t" - "vxor.vv v8, v8, v8\n\t" // clear - "vxor.vv v10, v10, v10\n\t" // clear - "vxor.vv v12, v12, v12\n\t" // clear - "vxor.vv v14, v14, v14\n\t" // clear - "flw ft0, 0(%8)\n\t" // ft0 = *bias - "flw ft1, 4(%8)\n\t" // ft1 = *(bias + 1) - // "addi %8, %8, 8\n\t" - "vfmv.s.f v16, ft0\n\t" // v16[0] = ft0 - "vfmv.s.f v18, ft0\n\t" // v18[0] = ft0 - "vfmv.s.f v20, ft1\n\t" // v20[0] = ft1 - "vfmv.s.f v22, ft1\n\t" // v22[1] = ft1 - 
- "beqz %7, 1f\n\t" // k_tail == 0 ? - // Processing k_tail - "slli t0, %7, 2\n\t" // t0 = k_tail * 4 - "slli t1, t0, 1\n\t" // t1 = t0 * 2 - "vsetvli zero, %7, e32, m2\n\t" - "vlsw.v v0, (%0), %9\n\t" - "add %0, %0, t1\n\t" - "vlsw.v v2, (%1), %9\n\t" - "addi %1, %0, 4\n\t" - - "vlw.v v4, (%2)\n\t" - "add %2, %2, t0\n\t" - "vlw.v v6, (%3)\n\t" - "add %3, %3, t0\n\t" - - "vfmacc.vv v8, v0, v4\n\t" - "vfmacc.vv v10, v0, v6\n\t" - "vfmacc.vv v12, v2, v4\n\t" - "vfmacc.vv v14, v2, v6\n\t" - "beqz t6, 2f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subkernel_m2n2k8 - "vlsw.v v0, (%0), %9\n\t" - "addi %0, %0, 64\n\t" - "vlsw.v v2, (%1), %9\n\t" - "addi %1, %0, 4\n\t" - - "vlw.v v4, (%2)\n\t" - "addi %2, %2, 32\n\t" - "vlw.v v6, (%3)\n\t" - "addi %3, %3, 32\n\t" - - "vfmacc.vv v8, v0, v4\n\t" - "vfmacc.vv v10, v0, v6\n\t" - "vfmacc.vv v12, v2, v4\n\t" - "vfmacc.vv v14, v2, v6\n\t" - "addi t6, t6, -1\n\t" - "bnez t6, 1b\n\t" - - "2:\n\t" - // end kernel_m2n2 - "vfredsum.vs v16, v8, v16\n\t" // v16[0] = v16[0] + sum(v8[0..i]) - "vfredsum.vs v18, v10, v18\n\t" // v18[0] = v18[0] + sum(v10[0..i]) - "vfredsum.vs v20, v12, v20\n\t" // v20[0] = v20[0] + sum(v12[0..i]) - "vfredsum.vs v22, v14, v22\n\t" // v22[0] = v22[0] + sum(v14[0..i]) - "vfmv.f.s ft2, v16\n\t" - "vfmv.f.s ft3, v18\n\t" - "vfmv.f.s ft4, v20\n\t" - "vfmv.f.s ft5, v22\n\t" - - "beqz %10, 3f\n\t" - // fuse relu - "fmax.s ft2, ft2, ft6\n\t" // **** relu **** - "fmax.s ft3, ft3, ft6\n\t" // **** relu **** - "fmax.s ft4, ft4, ft6\n\t" // **** relu **** - "fmax.s ft5, ft5, ft6\n\t" // **** relu **** - - "3:\n\t" - - "fsw ft2, 0(%4)\n\t" - "fsw ft3, 4(%4)\n\t" - "fsw ft4, 0(%5)\n\t" - "fsw ft5, 4(%5)\n\t" - - :"=r"(pa0), // %0 - "=r"(pa1), // %1 - "=r"(pb0), // %2 - "=r"(pb1), // %3 - "=r"(pc0), // %4 - "=r"(pc1), // %5 - "=r"(k8), // %6 - "=r"(k_tail), // %7 - "=r"(bias), // %8 - "=r"(load_stride), // %9 - "=r"(fuse_relu) // %10 - :"0"(pa0), - "1"(pa1), - "2"(pb0), - "3"(pb1), - 
"4"(pc0), - "5"(pc1), - "6"(k8), - "7"(k_tail), - "8"(bias), - "9"(load_stride), - "10"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "t0", "t1", "t6" - ); - pb += 2 * k; - pc0 += 2; - pc1 += 2; - } - if(n1 > 0) { - float *pa0 = sa; - float *pa1 = pa0 + 1; - int k8 = k >> 3; - int k_tail = k & 7; - int load_stride = 8; - asm volatile( - "fmv.w.x ft4, zero\n\t" // for fuse relu - "mv t5, %5\n\t" // t5 = k8 - "vsetvli zero, zero, e32, m2\n\t" - "vxor.vv v6, v6, v6\n\t" // clear - "vxor.vv v8, v8, v8\n\t" // clear - "flw ft0, 0(%7)\n\t" // ft0 = *bias - "flw ft1, 4(%7)\n\t" // ft1 = *(bias + 1) - "vfmv.s.f v10, ft0\n\t" // v10[0] = ft0 - "vfmv.s.f v12, ft1\n\t" // v12[0] = ft1 - - "beqz %6, 1f\n\t" // k_tail == 0 ? - // Processing k_tail - "slli t0, %6, 2\n\t" // t0 = k_tail * 4 - "slli t1, t0, 1\n\t" // t1 = t0 * 2 - "vsetvli zero, %6, e32, m2\n\t" - "vlsw.v v0, (%0), %8\n\t" - "add %0, %0, t1\n\t" - "vlsw.v v2, (%1), %8\n\t" - "addi %1, %0, 4\n\t" - - "vlw.v v4, (%2)\n\t" - "add %2, %2, t0\n\t" - - "vfmacc.vv v6, v0, v4\n\t" - "vfmacc.vv v8, v2, v4\n\t" - "beqz t5, 2f\n\t" // k8 == 0 ? 
- "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subkernel_m2n1k8 - "vlsw.v v0, (%0), %8\n\t" - "addi %0, %0, 64\n\t" - "vlsw.v v2, (%1), %8\n\t" - "addi %1, %0, 4\n\t" - - "vlw.v v4, (%2)\n\t" - "addi %2, %2, 32\n\t" - - "vfmacc.vv v6, v0, v4\n\t" - "vfmacc.vv v8, v2, v4\n\t" - "addi t5, t5, -1\n\t" - "bnez t5, 1b\n\t" - - "2:\n\t" - // end kernel_m2n1 - "vfredsum.vs v10, v6, v10\n\t" // v10[0] = v10[0] + sum(v6[0..i]) - "vfredsum.vs v12, v8, v12\n\t" // v12[0] = v12[0] + sum(v8[0..i]) - "vfmv.f.s ft2, v10\n\t" - "vfmv.f.s ft3, v12\n\t" - - "beqz %9, 3f\n\t" - // fuse relu - "fmax.s ft2, ft3, ft4\n\t" // **** relu **** - "fmax.s ft2, ft3, ft4\n\t" // **** relu **** - - "3:\n\t" - "fsw ft2, 0(%3)\n\t" - "fsw ft3, 0(%4)\n\t" - - :"=r"(pa0), // %0 - "=r"(pa1), // %1 - "=r"(pb), // %2 - "=r"(pc0), // %3 - "=r"(pc1), // %4 - "=r"(k8), // %5 - "=r"(k_tail), // %6 - "=r"(bias), // %7 - "=r"(load_stride), // %8 - "=r"(fuse_relu) // %9 - :"0"(pa0), - "1"(pa1), - "2"(pb), - "3"(pc0), - "4"(pc1), - "5"(k8), - "6"(k_tail), - "7"(bias), - "8"(load_stride), - "9"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "ft0", "ft1", "ft2", "ft3", "ft4", "t0", "t1", "t5" - ); - } -#else - for(int i = 0; i < n4; i++) { - pa = sa; - pc0[0] = pc0[1] = pc0[2] = pc0[3] = *bias; - pc1[0] = pc1[1] = pc1[2] = pc1[3] = *(bias + 1); - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; - - pc0[0] += pa[2] * pb[4]; pc1[0] += pa[3] * pb[4]; - pc0[1] += pa[2] * pb[5]; pc1[1] += pa[3] * pb[5]; - pc0[2] += pa[2] * pb[6]; pc1[2] += pa[3] * pb[6]; - pc0[3] += pa[2] * pb[7]; pc1[3] += pa[3] * pb[7]; - - pc0[0] += pa[4] * pb[8]; pc1[0] += pa[5] * pb[8]; - pc0[1] += pa[4] * pb[9]; 
pc1[1] += pa[5] * pb[9]; - pc0[2] += pa[4] * pb[10]; pc1[2] += pa[5] * pb[10]; - pc0[3] += pa[4] * pb[11]; pc1[3] += pa[5] * pb[11]; - - pc0[0] += pa[6] * pb[12]; pc1[0] += pa[7] * pb[12]; - pc0[1] += pa[6] * pb[13]; pc1[1] += pa[7] * pb[13]; - pc0[2] += pa[6] * pb[14]; pc1[2] += pa[7] * pb[14]; - pc0[3] += pa[6] * pb[15]; pc1[3] += pa[7] * pb[15]; - - pc0[0] += pa[8] * pb[16]; pc1[0] += pa[9] * pb[16]; - pc0[1] += pa[8] * pb[17]; pc1[1] += pa[9] * pb[17]; - pc0[2] += pa[8] * pb[18]; pc1[2] += pa[9] * pb[18]; - pc0[3] += pa[8] * pb[19]; pc1[3] += pa[9] * pb[19]; - - pc0[0] += pa[10] * pb[20]; pc1[0] += pa[11] * pb[20]; - pc0[1] += pa[10] * pb[21]; pc1[1] += pa[11] * pb[21]; - pc0[2] += pa[10] * pb[22]; pc1[2] += pa[11] * pb[22]; - pc0[3] += pa[10] * pb[23]; pc1[3] += pa[11] * pb[23]; - - pc0[0] += pa[12] * pb[24]; pc1[0] += pa[13] * pb[24]; - pc0[1] += pa[12] * pb[25]; pc1[1] += pa[13] * pb[25]; - pc0[2] += pa[12] * pb[26]; pc1[2] += pa[13] * pb[26]; - pc0[3] += pa[12] * pb[27]; pc1[3] += pa[13] * pb[27]; - - pc0[0] += pa[14] * pb[28]; pc1[0] += pa[15] * pb[28]; - pc0[1] += pa[14] * pb[29]; pc1[1] += pa[15] * pb[29]; - pc0[2] += pa[14] * pb[30]; pc1[2] += pa[15] * pb[30]; - pc0[3] += pa[14] * pb[31]; pc1[3] += pa[15] * pb[31]; - - pa += 16; - pb += 32; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; - - pc0[0] += pa[2] * pb[4]; pc1[0] += pa[3] * pb[4]; - pc0[1] += pa[2] * pb[5]; pc1[1] += pa[3] * pb[5]; - pc0[2] += pa[2] * pb[6]; pc1[2] += pa[3] * pb[6]; - pc0[3] += pa[2] * pb[7]; pc1[3] += pa[3] * pb[7]; - - pc0[0] += pa[4] * pb[8]; pc1[0] += pa[5] * pb[8]; - pc0[1] += pa[4] * pb[9]; pc1[1] += pa[5] * pb[9]; - pc0[2] += pa[4] * pb[10]; pc1[2] += pa[5] * pb[10]; - pc0[3] += pa[4] * pb[11]; pc1[3] += pa[5] * pb[11]; - - pc0[0] += pa[6] * pb[12]; pc1[0] += pa[7] * pb[12]; - 
pc0[1] += pa[6] * pb[13]; pc1[1] += pa[7] * pb[13]; - pc0[2] += pa[6] * pb[14]; pc1[2] += pa[7] * pb[14]; - pc0[3] += pa[6] * pb[15]; pc1[3] += pa[7] * pb[15]; - - pa += 8; - pb += 16; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; - - pc0[0] += pa[2] * pb[4]; pc1[0] += pa[3] * pb[4]; - pc0[1] += pa[2] * pb[5]; pc1[1] += pa[3] * pb[5]; - pc0[2] += pa[2] * pb[6]; pc1[2] += pa[3] * pb[6]; - pc0[3] += pa[2] * pb[7]; pc1[3] += pa[3] * pb[7]; - - pa += 4; - pb += 8; - } - if(j < k) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; - - pa += 2; - pb += 4; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc0[1] = pc0[1] > 0 ? pc0[1] : 0; - pc0[2] = pc0[2] > 0 ? pc0[2] : 0; - pc0[3] = pc0[3] > 0 ? pc0[3] : 0; - - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - pc1[1] = pc1[1] > 0 ? pc1[1] : 0; - pc1[2] = pc1[2] > 0 ? pc1[2] : 0; - pc1[3] = pc1[3] > 0 ? 
pc1[3] : 0; - } - pc0 += 4; - pc1 += 4; - } - if(n2 > 0) { - pa = sa; - pc0[0] = pc0[1] = *bias; - pc1[0] = pc1[1] = *(bias + 1); - float *pb0 = pb; - float *pb1 = pb0 + k; - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; - - pc0[0] += pa[2] * pb0[1]; pc1[0] += pa[3] * pb0[1]; - pc0[1] += pa[2] * pb1[1]; pc1[1] += pa[3] * pb1[1]; - - pc0[0] += pa[4] * pb0[2]; pc1[0] += pa[5] * pb0[2]; - pc0[1] += pa[4] * pb1[2]; pc1[1] += pa[5] * pb1[2]; - - pc0[0] += pa[6] * pb0[3]; pc1[0] += pa[7] * pb0[3]; - pc0[1] += pa[6] * pb1[3]; pc1[1] += pa[7] * pb1[3]; - - pc0[0] += pa[8] * pb0[4]; pc1[0] += pa[9] * pb0[4]; - pc0[1] += pa[8] * pb1[4]; pc1[1] += pa[9] * pb1[4]; - - pc0[0] += pa[10] * pb0[5]; pc1[0] += pa[11] * pb0[5]; - pc0[1] += pa[10] * pb1[5]; pc1[1] += pa[11] * pb1[5]; - - pc0[0] += pa[12] * pb0[6]; pc1[0] += pa[13] * pb0[6]; - pc0[1] += pa[12] * pb1[6]; pc1[1] += pa[13] * pb1[6]; - - pc0[0] += pa[14] * pb0[7]; pc1[0] += pa[15] * pb0[7]; - pc0[1] += pa[14] * pb1[7]; pc1[1] += pa[15] * pb1[7]; - - pa += 16; - pb0 += 8; - pb1 += 8; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; - - pc0[0] += pa[2] * pb0[1]; pc1[0] += pa[3] * pb0[1]; - pc0[1] += pa[2] * pb1[1]; pc1[1] += pa[3] * pb1[1]; - - pc0[0] += pa[4] * pb0[2]; pc1[0] += pa[5] * pb0[2]; - pc0[1] += pa[4] * pb1[2]; pc1[1] += pa[5] * pb1[2]; - - pc0[0] += pa[6] * pb0[3]; pc1[0] += pa[7] * pb0[3]; - pc0[1] += pa[6] * pb1[3]; pc1[1] += pa[7] * pb1[3]; - - pa += 8; - pb0 += 4; - pb1 += 4; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; - - pc0[0] += pa[2] * pb0[1]; pc1[0] += pa[3] * pb0[1]; - pc0[1] += pa[2] * pb1[1]; pc1[1] += pa[3] * pb1[1]; - - pa += 4; - pb0 += 2; - pb1 += 2; - } - if(j < k) { - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] 
* pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; - - pa += 2; - pb0 += 1; - pb1 += 1; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc0[1] = pc0[1] > 0 ? pc0[1] : 0; - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - pc1[1] = pc1[1] > 0 ? pc1[1] : 0; - } - pc0 += 2; - pc1 += 2; - pb += 2 * k; - } - if(n1 > 0) { - pa = sa; - pc0[0] = *bias; - pc1[0] = *(bias + 1); - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - - pc0[0] += pa[2] * pb[1]; pc1[0] += pa[3] * pb[1]; - - pc0[0] += pa[4] * pb[2]; pc1[0] += pa[5] * pb[2]; - - pc0[0] += pa[6] * pb[3]; pc1[0] += pa[7] * pb[3]; - - pc0[0] += pa[8] * pb[4]; pc1[0] += pa[9] * pb[4]; - - pc0[0] += pa[10] * pb[5]; pc1[0] += pa[11] * pb[5]; - - pc0[0] += pa[12] * pb[6]; pc1[0] += pa[13] * pb[6]; - - pc0[0] += pa[14] * pb[7]; pc1[0] += pa[15] * pb[7]; - - pa += 16; - pb += 8; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - - pc0[0] += pa[2] * pb[1]; pc1[0] += pa[3] * pb[1]; - - pc0[0] += pa[4] * pb[2]; pc1[0] += pa[5] * pb[2]; - - pc0[0] += pa[6] * pb[3]; pc1[0] += pa[7] * pb[3]; - - pa += 8; - pb += 4; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - - pc0[0] += pa[2] * pb[1]; pc1[0] += pa[3] * pb[1]; - - pa += 4; - pb += 2; - } - if(j < k) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - - pa += 2; - pb += 1; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc1[0] = pc1[0] > 0 ? 
pc1[0] : 0; - } - pc0 += 1; - pc1 += 1; - } -#endif // __riscv_vector -} - -static inline void kernel_m4_f32(float* dst, float* sa, float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - float *pa = sa; - float *pb = sb; - float *pc0 = dst; - float *pc1 = pc0 + ldc; - float *pc2 = pc1 + ldc; - float *pc3 = pc2 + ldc; - DECOMPOSE_K - DECOMPOSE_N - -#if __riscv_vector == 128 - if(n4 > 0) { - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - "flw ft8, (%11)\n\t" - "flw ft9, 4(%11)\n\t" - "flw ft10, 8(%11)\n\t" - "flw ft11, 12(%11)\n\t" - "beqz %12, 1f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "1:\n\t" // n4 - // start kernel_m4n4 - "vfmv.v.f v24, ft8\n\t" // v24[0..3] = *bias - "vfmv.v.f v25, ft9\n\t" // v25[0..3] = *(bias + 1) - "vfmv.v.f v26, ft10\n\t" // v26[0..3] = *(bias + 2) - "vfmv.v.f v27, ft11\n\t" // v27[0..3] = *(bias + 3) - // "vlw.v v24, (%11)\n\t" // v24[0..3] = bias[0..3] - // "vlw.v v25, (%11)\n\t" // v25[0..3] = bias[0..3] - // "vlw.v v26, (%11)\n\t" // v26[0..3] = bias[0..3] - // "vlw.v v27, (%11)\n\t" // v27[0..3] = bias[0..3] - // "addi %11, %11, 16\n\t" // bias += 4 * 4 - - "mv a1, %0\n\t" // a1 = pa - "mv t0, %6\n\t" // t0 = k8 - - "flw ft0, (a1)\n\t" - "flw ft1, 4(a1)\n\t" - "flw ft2, 8(a1)\n\t" - "flw ft3, 12(a1)\n\t" // pre load pa - - "beqz t0, 3f\n\t" // k8 == 0 ? 
- - "vlw.v v1, (%1)\n\t" // pre load pb - "addi %1, %1, 16\n\t" - - "2:\n\t" - // start subkernel_m4n4k8 - - "vlw.v v2, (%1)\n\t" // load pb - "addi %1, %1, 16\n\t" - "flw ft4, 16(a1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw ft5, 20(a1)\n\t" - "vfmacc.vf v25, ft1, v1\n\t" - "flw ft6, 24(a1)\n\t" - "vfmacc.vf v26, ft2, v1\n\t" - "flw ft7, 28(a1)\n\t" - "vfmacc.vf v27, ft3, v1\n\t" // 0 - - - "vlw.v v3, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, 32(a1)\n\t" - "vfmacc.vf v24, ft4, v2\n\t" - "flw ft1, 36(a1)\n\t" - "vfmacc.vf v25, ft5, v2\n\t" - "flw ft2, 40(a1)\n\t" - "vfmacc.vf v26, ft6, v2\n\t" - "flw ft3, 44(a1)\n\t" - "vfmacc.vf v27, ft7, v2\n\t" // 1 - - - "vlw.v v4, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 48(a1)\n\t" - "vfmacc.vf v24, ft0, v3\n\t" - "flw ft5, 52(a1)\n\t" - "vfmacc.vf v25, ft1, v3\n\t" - "flw ft6, 56(a1)\n\t" - "vfmacc.vf v26, ft2, v3\n\t" - "flw ft7, 60(a1)\n\t" - "vfmacc.vf v27, ft3, v3\n\t" // 2 - - - "vlw.v v5, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, 64(a1)\n\t" - "vfmacc.vf v24, ft4, v4\n\t" - "flw ft1, 68(a1)\n\t" - "vfmacc.vf v25, ft5, v4\n\t" - "flw ft2, 72(a1)\n\t" - "vfmacc.vf v26, ft6, v4\n\t" - "flw ft3, 76(a1)\n\t" - "vfmacc.vf v27, ft7, v4\n\t" // 3 - - - "vlw.v v6, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 80(a1)\n\t" - "vfmacc.vf v24, ft0, v5\n\t" - "flw ft5, 84(a1)\n\t" - "vfmacc.vf v25, ft1, v5\n\t" - "flw ft6, 88(a1)\n\t" - "vfmacc.vf v26, ft2, v5\n\t" - "flw ft7, 92(a1)\n\t" - "vfmacc.vf v27, ft3, v5\n\t" // 4 - - - "vlw.v v7, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, 96(a1)\n\t" - "vfmacc.vf v24, ft4, v6\n\t" - "flw ft1, 100(a1)\n\t" - "vfmacc.vf v25, ft5, v6\n\t" - "flw ft2, 104(a1)\n\t" - "vfmacc.vf v26, ft6, v6\n\t" - "flw ft3, 108(a1)\n\t" - "vfmacc.vf v27, ft7, v6\n\t" // 5 - - - "vlw.v v8, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 112(a1)\n\t" - "vfmacc.vf v24, ft0, v7\n\t" - "flw ft5, 116(a1)\n\t" - "vfmacc.vf v25, ft1, v7\n\t" - "flw ft6, 120(a1)\n\t" - "vfmacc.vf v26, ft2, v7\n\t" - "flw 
ft7, 124(a1)\n\t" - "vfmacc.vf v27, ft3, v7\n\t" // 6 - "addi a1, a1, 128\n\t" // += 32 elements, bump pa to next k8 addr - - - "vlw.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, (a1)\n\t" - "vfmacc.vf v24, ft4, v8\n\t" - "flw ft1, 4(a1)\n\t" - "vfmacc.vf v25, ft5, v8\n\t" - "flw ft2, 8(a1)\n\t" - "vfmacc.vf v26, ft6, v8\n\t" - "flw ft3, 12(a1)\n\t" - "vfmacc.vf v27, ft7, v8\n\t" // 7 - - "addi t0, t0, -1\n\t" // k8 -- - "bnez t0, 2b\n\t" - - "addi %1, %1, -16\n\t" // pb -= 4 ********* bump pb to origin addr ************ - - "3:\n\t" - "beqz %7, 4f\n\t" // k4 == 0 ? - // start subkernel_m4n4k4 - "vlw.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 16(a1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw ft5, 20(a1)\n\t" - "vfmacc.vf v25, ft1, v1\n\t" - "flw ft6, 24(a1)\n\t" - "vfmacc.vf v26, ft2, v1\n\t" - "flw ft7, 28(a1)\n\t" - "vfmacc.vf v27, ft3, v1\n\t" // 0 - - - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, 32(a1)\n\t" - "vfmacc.vf v24, ft4, v2\n\t" - "flw ft1, 36(a1)\n\t" - "vfmacc.vf v25, ft5, v2\n\t" - "flw ft2, 40(a1)\n\t" - "vfmacc.vf v26, ft6, v2\n\t" - "flw ft3, 44(a1)\n\t" - "vfmacc.vf v27, ft7, v2\n\t" // 1 - - - "vlw.v v3, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 48(a1)\n\t" - "vfmacc.vf v24, ft0, v3\n\t" - "flw ft5, 52(a1)\n\t" - "vfmacc.vf v25, ft1, v3\n\t" - "flw ft6, 56(a1)\n\t" - "vfmacc.vf v26, ft2, v3\n\t" - "flw ft7, 60(a1)\n\t" - "vfmacc.vf v27, ft3, v3\n\t" // 2 - "addi a1, a1, 64\n\t" // += 16 elements, bump pa to next k addr - - - "vlw.v v4, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, (a1)\n\t" - "vfmacc.vf v24, ft4, v4\n\t" - "flw ft1, 4(a1)\n\t" - "vfmacc.vf v25, ft5, v4\n\t" - "flw ft2, 8(a1)\n\t" - "vfmacc.vf v26, ft6, v4\n\t" - "flw ft3, 12(a1)\n\t" - "vfmacc.vf v27, ft7, v4\n\t" // 3 - - "4:\n\t" - "beqz %8, 5f\n\t" // k2 == 0 ? 
- // start subkernel_m4n4k2 - - "vlw.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - - "flw ft4, 16(a1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw ft5, 20(a1)\n\t" - "vfmacc.vf v25, ft1, v1\n\t" - "flw ft6, 24(a1)\n\t" - "vfmacc.vf v26, ft2, v1\n\t" - "flw ft7, 28(a1)\n\t" - "vfmacc.vf v27, ft3, v1\n\t" // 0 - "addi a1, a1, 32\n\t" // += 8 elements, bump pa to next k addr - - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, (a1)\n\t" - "vfmacc.vf v24, ft4, v2\n\t" - "flw ft1, 4(a1)\n\t" - "vfmacc.vf v25, ft5, v2\n\t" - "flw ft2, 8(a1)\n\t" - "vfmacc.vf v26, ft6, v2\n\t" - "flw ft3, 12(a1)\n\t" - "vfmacc.vf v27, ft7, v2\n\t" // 1 - - "5:\n\t" - "beqz %9, 6f\n\t" // k1 == 0 ? - // start subkernel_m4n4k1 - "vlw.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - - "vfmacc.vf v24, ft0, v1\n\t" - "vfmacc.vf v25, ft1, v1\n\t" - "vfmacc.vf v26, ft2, v1\n\t" - "vfmacc.vf v27, ft3, v1\n\t" // 0 - - "6:\n\t" - "beqz %12, 7f\n\t" - // fused relu - "vfmax.vv v24, v24, v0\n\t" // **** relu **** - "vfmax.vv v25, v25, v0\n\t" // **** relu **** - "vfmax.vv v26, v26, v0\n\t" // **** relu **** - "vfmax.vv v27, v27, v0\n\t" // **** relu **** - - "7:\n\t" - // end kernel_m4n4 - "vsw.v v24, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vsw.v v25, (%3)\n\t" - "addi %3, %3, 16\n\t" - "vsw.v v26, (%4)\n\t" - "addi %4, %4, 16\n\t" - "vsw.v v27, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "addi %10, %10, -1\n\t" - "bnez %10, 1b\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc0), // %2 - "=r"(pc1), // %3 - "=r"(pc2), // %4 - "=r"(pc3), // %5 - "=r"(k8), // %6 - "=r"(k4), // %7 - "=r"(k2), // %8 - "=r"(k1), // %9 - "=r"(n4), // %10 - "=r"(bias), // %11 - "=r"(fuse_relu) // %12 - :"0"(pa), - "1"(pb), - "2"(pc0), - "3"(pc1), - "4"(pc2), - "5"(pc3), - "6"(k8), - "7"(k4), - "8"(k2), - "9"(k1), - "10"(n4), - "11"(bias), - "12"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "v26", "v27", "a1", "t0", - "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10", 
"ft11" - ); - } - if(n2 > 0) { - float *pa = sa; - float *pb0 = pb; - float *pb1 = pb0 + k; - float *pc00 = pc0; - float *pc11 = pc00 + 1; - asm volatile( - "slli t1, %10, 2\n\t" - "vsetvli zero, zero, e32, m1\n\t" - // "flw ft8, (%9)\n\t" - // "flw ft9, 4(%9)\n\t" - // "addi %9, %9, 8\n\t" - - "vlw.v v24, (%9)\n\t" // v24[0..3] = bias[0]..bias[3] - "vlw.v v25, (%9)\n\t" // v25[0..3] = bias[0]..bias[3] - // "vfmv.v.f v24, ft8\n\t" // v24[0..3] = bias[0]; - // "vfmv.v.f v25, ft9\n\t" // v25[0..3] = bias[1]; - - "flw ft0, (%1)\n\t" // pre load pb0 - "flw fa0, (%2)\n\t" // pre load pb1 - - "beqz %11, 0f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "0:\n\t" - "mv t0, %5\n\t" // t0 = k8 - "beqz t0, 2f\n\t" // k8 == 0 ? - - "1:\n\t" - // start subkernel_m4n2k8 - "vlw.v v1, (%0)\n\t" // load pa - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw fa1, 4(%2)\n\t" - "vfmacc.vf v25, fa0, v1\n\t" // 0 - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 8(%1)\n\t" - "vfmacc.vf v24, ft1, v2\n\t" - "flw fa0, 8(%2)\n\t" - "vfmacc.vf v25, fa1, v2\n\t" // 1 - - - "vlw.v v3, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 12(%1)\n\t" - "vfmacc.vf v24, ft0, v3\n\t" - "flw fa1, 12(%2)\n\t" - "vfmacc.vf v25, fa0, v3\n\t" // 2 - - - "vlw.v v4, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 16(%1)\n\t" - "vfmacc.vf v24, ft1, v4\n\t" - "flw fa0, 16(%2)\n\t" - "vfmacc.vf v25, fa1, v4\n\t" // 3 - - - "vlw.v v5, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 20(%1)\n\t" - "vfmacc.vf v24, ft0, v5\n\t" - "flw fa1, 20(%2)\n\t" - "vfmacc.vf v25, fa0, v5\n\t" // 4 - - - "vlw.v v6, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 24(%1)\n\t" - "vfmacc.vf v24, ft1, v6\n\t" - "flw fa0, 24(%2)\n\t" - "vfmacc.vf v25, fa1, v6\n\t" // 5 - - - "vlw.v v7, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 28(%1)\n\t" - "vfmacc.vf v24, ft0, v7\n\t" - "flw fa1, 28(%2)\n\t" - "vfmacc.vf v25, fa0, v7\n\t" // 6 - "addi %1, %1, 
32\n\t" // += 8 elements, bump pb0 to next k8 addr - "addi %2, %2, 32\n\t" // += 8 elements, bump pb1 to next k8 addr - - - "vlw.v v8, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v24, ft1, v8\n\t" - "flw fa0, (%2)\n\t" - "vfmacc.vf v25, fa1, v8\n\t" // 7 - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "2:\n\t" - "beqz %6, 3f\n\t" // k4 == 0 ? - // start subkernel_m4n2k4 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw fa1, 4(%2)\n\t" - "vfmacc.vf v25, fa0, v1\n\t" // 0 - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 8(%1)\n\t" - "vfmacc.vf v24, ft1, v2\n\t" - "flw fa0, 8(%2)\n\t" - "vfmacc.vf v25, fa1, v2\n\t" // 1 - - - "vlw.v v3, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 12(%1)\n\t" - "vfmacc.vf v24, ft0, v3\n\t" - "flw fa1, 12(%2)\n\t" - "vfmacc.vf v25, fa0, v3\n\t" // 2 - "addi %1, %1, 16\n\t" // += 4 elements, bump pb0 to next k addr - "addi %2, %2, 16\n\t" // += 4 elements, bump pb1 to next k addr - - - "vlw.v v4, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v24, ft1, v4\n\t" - "flw fa0, (%2)\n\t" - "vfmacc.vf v25, fa1, v4\n\t" // 3 - - "3:\n\t" - "beqz %7, 4f\n\t" // k2 == 0 ? - // start subkernel_m4n2k2 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw fa1, 4(%2)\n\t" - "vfmacc.vf v25, fa0, v1\n\t" // 0 - "addi %1, %1, 8\n\t" // += 2 elements, bump pb0 to next k addr - "addi %2, %2, 8\n\t" // += 2 elements, bump pb1 to next k addr - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v24, ft1, v2\n\t" - "flw fa0, (%2)\n\t" - "vfmacc.vf v25, fa1, v2\n\t" // 1 - - "4:\n\t" - "beqz %8, 5f\n\t" // k1 == 0 ? 
- // start subkernel_m4n2k1 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - - "vfmacc.vf v24, ft0, v1\n\t" - "vfmacc.vf v25, fa0, v1\n\t" // 0 - - "5:\n\t" - "beqz %11, 6f\n\t" - // fused relu - "vfmax.vv v24, v24, v0\n\t" // **** relu **** - "vfmax.vv v25, v25, v0\n\t" // **** relu **** - - "6:\n\t" - "vssw.v v24, (%3), t1\n\t" - "vssw.v v25, (%4), t1\n\t" - - :"=r"(pa), // %0 - "=r"(pb0), // %1 - "=r"(pb1), // %2 - "=r"(pc00), // %3 - "=r"(pc11), // %4 - "=r"(k8), // %5 - "=r"(k4), // %6 - "=r"(k2), // %7 - "=r"(k1), // %8 - "=r"(bias), // %9 - "=r"(ldc), // %10 - "=r"(fuse_relu) // %11 - :"0"(pa), - "1"(pb0), - "2"(pb1), - "3"(pc00), - "4"(pc11), - "5"(k8), - "6"(k4), - "7"(k2), - "8"(k1), - "9"(bias), - "10"(ldc), - "11"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", - "t0", "t1", "ft0", "ft1", "fa0", "fa1" - ); - pb += 2 * k; - pc0 += 2; - pc1 += 2; - pc2 += 2; - pc3 += 2; - } - if(n1 > 0) { - pa = sa; - float *pc00 = pc0; - asm volatile( - "slli t1, %8, 2\n\t" // t1 = ldc * 4 - "vsetvli zero, zero, e32, m1\n\t" - // "flw ft8, 0(%7)\n\t" - // "vfmv.v.f v16, ft8\n\t" - "vlw.v v16, (%7)\n\t" // v24[0..3] = bias[0]..bias[3] - "flw ft0, (%1)\n\t" // pre load pb - - "beqz %9, 0f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "0:\n\t" - "beqz %3, 2f\n\t" // k8 == 0 ? 
- - "1:\n\t" - // start subkernel_m4n1k8 - "vlw.v v1, (%0)\n\t" // load pa - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v16, ft0, v1\n\t" // 0 - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 8(%1)\n\t" - "vfmacc.vf v16, ft1, v2\n\t" // 1 - - - "vlw.v v3, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 12(%1)\n\t" - "vfmacc.vf v16, ft0, v3\n\t" // 2 - - - "vlw.v v4, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 16(%1)\n\t" - "vfmacc.vf v16, ft1, v4\n\t" // 3 - - - "vlw.v v5, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 20(%1)\n\t" - "vfmacc.vf v16, ft0, v5\n\t" // 4 - - - "vlw.v v6, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 24(%1)\n\t" - "vfmacc.vf v16, ft1, v6\n\t" // 5 - - - "vlw.v v7, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 28(%1)\n\t" - "vfmacc.vf v16, ft0, v7\n\t" // 6 - "addi %1, %1, 32\n\t" // += 8 elements, bump pb to next k8 addr - - - "vlw.v v8, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v16, ft1, v8\n\t" // 7 - - "addi %3, %3, -1\n\t" - "bnez %3, 1b\n\t" - - "2:\n\t" - "beqz %4, 3f\n\t" // k4 == 0 ? - // start subkernel_m4n1k4 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v16, ft0, v1\n\t" // 0 - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 8(%1)\n\t" - "vfmacc.vf v16, ft1, v2\n\t" // 1 - - - "vlw.v v3, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 12(%1)\n\t" - "vfmacc.vf v16, ft0, v3\n\t" // 2 - "addi %1, %1, 16\n\t" // += 4 elements, bump pb to next k addr - - - "vlw.v v4, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v16, ft1, v4\n\t" // 3 - - "3:\n\t" - "beqz %5, 4f\n\t" // k2 == 0 ? 
- // start subkernel_m4n1k2 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v16, ft0, v1\n\t" // 0 - "addi %1, %1, 8\n\t" // += 2 elements, bump pb to next k addr - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v16, ft1, v2\n\t" // 1 - - "4:\n\t" - "beqz %6, 5f\n\t" // k1 == 0 ? - // start subkernel_m4n2k1 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - - "vfmacc.vf v16, ft0, v1\n\t" // 0 - - "5:\n\t" - "beqz %9, 6f\n\t" - // fused relu - "vfmax.vv v16, v16, v0\n\t" // **** relu **** - - "6:\n\t" - "vssw.v v16, (%2), t1\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc00), // %2 - "=r"(k8), // %3 - "=r"(k4), // %4 - "=r"(k2), // %5 - "=r"(k1), // %6 - "=r"(bias), // %7 - "=r"(ldc), // %8 - "=r"(fuse_relu) // %9 - :"0"(pa), - "1"(pb), - "2"(pc00), - "3"(k8), - "4"(k4), - "5"(k2), - "6"(k1), - "7"(bias), - "8"(ldc), - "9"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", - "t0", "t1", "ft0", "ft1" - ); - } -#else - for(int i = 0; i < n4; i++) { - pa = sa; - pc0[0] = pc0[1] = pc0[2] = pc0[3] = *bias; - pc1[0] = pc1[1] = pc1[2] = pc1[3] = *(bias + 1); - pc2[0] = pc2[1] = pc2[2] = pc2[3] = *(bias + 2); - pc3[0] = pc3[1] = pc3[2] = pc3[3] = *(bias + 3); - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; pc2[1] += pa[2] * pb[1]; pc3[1] += pa[3] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; pc2[2] += pa[2] * pb[2]; pc3[2] += pa[3] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; pc2[3] += pa[2] * pb[3]; pc3[3] += pa[3] * pb[3]; - - pc0[0] += pa[4] * pb[4]; pc1[0] += pa[5] * pb[4]; pc2[0] += pa[6] * pb[4]; pc3[0] += pa[7] * pb[4]; - pc0[1] += pa[4] * pb[5]; pc1[1] += pa[5] * pb[5]; pc2[1] += pa[6] * pb[5]; pc3[1] += pa[7] * pb[5]; - pc0[2] += pa[4] * pb[6]; pc1[2] += pa[5] * pb[6]; pc2[2] += pa[6] * 
pb[6]; pc3[2] += pa[7] * pb[6]; - pc0[3] += pa[4] * pb[7]; pc1[3] += pa[5] * pb[7]; pc2[3] += pa[6] * pb[7]; pc3[3] += pa[7] * pb[7]; - - pc0[0] += pa[8] * pb[8]; pc1[0] += pa[9] * pb[8]; pc2[0] += pa[10] * pb[8]; pc3[0] += pa[11] * pb[8]; - pc0[1] += pa[8] * pb[9]; pc1[1] += pa[9] * pb[9]; pc2[1] += pa[10] * pb[9]; pc3[1] += pa[11] * pb[9]; - pc0[2] += pa[8] * pb[10]; pc1[2] += pa[9] * pb[10]; pc2[2] += pa[10] * pb[10]; pc3[2] += pa[11] * pb[10]; - pc0[3] += pa[8] * pb[11]; pc1[3] += pa[9] * pb[11]; pc2[3] += pa[10] * pb[11]; pc3[3] += pa[11] * pb[11]; - - pc0[0] += pa[12] * pb[12]; pc1[0] += pa[13] * pb[12]; pc2[0] += pa[14] * pb[12]; pc3[0] += pa[15] * pb[12]; - pc0[1] += pa[12] * pb[13]; pc1[1] += pa[13] * pb[13]; pc2[1] += pa[14] * pb[13]; pc3[1] += pa[15] * pb[13]; - pc0[2] += pa[12] * pb[14]; pc1[2] += pa[13] * pb[14]; pc2[2] += pa[14] * pb[14]; pc3[2] += pa[15] * pb[14]; - pc0[3] += pa[12] * pb[15]; pc1[3] += pa[13] * pb[15]; pc2[3] += pa[14] * pb[15]; pc3[3] += pa[15] * pb[15]; - - pc0[0] += pa[16] * pb[16]; pc1[0] += pa[17] * pb[16]; pc2[0] += pa[18] * pb[16]; pc3[0] += pa[19] * pb[16]; - pc0[1] += pa[16] * pb[17]; pc1[1] += pa[17] * pb[17]; pc2[1] += pa[18] * pb[17]; pc3[1] += pa[19] * pb[17]; - pc0[2] += pa[16] * pb[18]; pc1[2] += pa[17] * pb[18]; pc2[2] += pa[18] * pb[18]; pc3[2] += pa[19] * pb[18]; - pc0[3] += pa[16] * pb[19]; pc1[3] += pa[17] * pb[19]; pc2[3] += pa[18] * pb[19]; pc3[3] += pa[19] * pb[19]; - - pc0[0] += pa[20] * pb[20]; pc1[0] += pa[21] * pb[20]; pc2[0] += pa[22] * pb[20]; pc3[0] += pa[23] * pb[20]; - pc0[1] += pa[20] * pb[21]; pc1[1] += pa[21] * pb[21]; pc2[1] += pa[22] * pb[21]; pc3[1] += pa[23] * pb[21]; - pc0[2] += pa[20] * pb[22]; pc1[2] += pa[21] * pb[22]; pc2[2] += pa[22] * pb[22]; pc3[2] += pa[23] * pb[22]; - pc0[3] += pa[20] * pb[23]; pc1[3] += pa[21] * pb[23]; pc2[3] += pa[22] * pb[23]; pc3[3] += pa[23] * pb[23]; - - pc0[0] += pa[24] * pb[24]; pc1[0] += pa[25] * pb[24]; pc2[0] += pa[26] * pb[24]; pc3[0] += pa[27] * pb[24]; - 
pc0[1] += pa[24] * pb[25]; pc1[1] += pa[25] * pb[25]; pc2[1] += pa[26] * pb[25]; pc3[1] += pa[27] * pb[25]; - pc0[2] += pa[24] * pb[26]; pc1[2] += pa[25] * pb[26]; pc2[2] += pa[26] * pb[26]; pc3[2] += pa[27] * pb[26]; - pc0[3] += pa[24] * pb[27]; pc1[3] += pa[25] * pb[27]; pc2[3] += pa[26] * pb[27]; pc3[3] += pa[27] * pb[27]; - - pc0[0] += pa[28] * pb[28]; pc1[0] += pa[29] * pb[28]; pc2[0] += pa[30] * pb[28]; pc3[0] += pa[31] * pb[28]; - pc0[1] += pa[28] * pb[29]; pc1[1] += pa[29] * pb[29]; pc2[1] += pa[30] * pb[29]; pc3[1] += pa[31] * pb[29]; - pc0[2] += pa[28] * pb[30]; pc1[2] += pa[29] * pb[30]; pc2[2] += pa[30] * pb[30]; pc3[2] += pa[31] * pb[30]; - pc0[3] += pa[28] * pb[31]; pc1[3] += pa[29] * pb[31]; pc2[3] += pa[30] * pb[31]; pc3[3] += pa[31] * pb[31]; - - pa += 32; - pb += 32; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; pc2[1] += pa[2] * pb[1]; pc3[1] += pa[3] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; pc2[2] += pa[2] * pb[2]; pc3[2] += pa[3] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; pc2[3] += pa[2] * pb[3]; pc3[3] += pa[3] * pb[3]; - - pc0[0] += pa[4] * pb[4]; pc1[0] += pa[5] * pb[4]; pc2[0] += pa[6] * pb[4]; pc3[0] += pa[7] * pb[4]; - pc0[1] += pa[4] * pb[5]; pc1[1] += pa[5] * pb[5]; pc2[1] += pa[6] * pb[5]; pc3[1] += pa[7] * pb[5]; - pc0[2] += pa[4] * pb[6]; pc1[2] += pa[5] * pb[6]; pc2[2] += pa[6] * pb[6]; pc3[2] += pa[7] * pb[6]; - pc0[3] += pa[4] * pb[7]; pc1[3] += pa[5] * pb[7]; pc2[3] += pa[6] * pb[7]; pc3[3] += pa[7] * pb[7]; - - pc0[0] += pa[8] * pb[8]; pc1[0] += pa[9] * pb[8]; pc2[0] += pa[10] * pb[8]; pc3[0] += pa[11] * pb[8]; - pc0[1] += pa[8] * pb[9]; pc1[1] += pa[9] * pb[9]; pc2[1] += pa[10] * pb[9]; pc3[1] += pa[11] * pb[9]; - pc0[2] += pa[8] * pb[10]; pc1[2] += pa[9] * pb[10]; pc2[2] += pa[10] * pb[10]; pc3[2] += pa[11] * pb[10]; - pc0[3] += pa[8] * pb[11]; pc1[3] 
+= pa[9] * pb[11]; pc2[3] += pa[10] * pb[11]; pc3[3] += pa[11] * pb[11]; - - pc0[0] += pa[12] * pb[12]; pc1[0] += pa[13] * pb[12]; pc2[0] += pa[14] * pb[12]; pc3[0] += pa[15] * pb[12]; - pc0[1] += pa[12] * pb[13]; pc1[1] += pa[13] * pb[13]; pc2[1] += pa[14] * pb[13]; pc3[1] += pa[15] * pb[13]; - pc0[2] += pa[12] * pb[14]; pc1[2] += pa[13] * pb[14]; pc2[2] += pa[14] * pb[14]; pc3[2] += pa[15] * pb[14]; - pc0[3] += pa[12] * pb[15]; pc1[3] += pa[13] * pb[15]; pc2[3] += pa[14] * pb[15]; pc3[3] += pa[15] * pb[15]; - - pa += 16; - pb += 16; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; pc2[1] += pa[2] * pb[1]; pc3[1] += pa[3] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; pc2[2] += pa[2] * pb[2]; pc3[2] += pa[3] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; pc2[3] += pa[2] * pb[3]; pc3[3] += pa[3] * pb[3]; - - pc0[0] += pa[4] * pb[4]; pc1[0] += pa[5] * pb[4]; pc2[0] += pa[6] * pb[4]; pc3[0] += pa[7] * pb[4]; - pc0[1] += pa[4] * pb[5]; pc1[1] += pa[5] * pb[5]; pc2[1] += pa[6] * pb[5]; pc3[1] += pa[7] * pb[5]; - pc0[2] += pa[4] * pb[6]; pc1[2] += pa[5] * pb[6]; pc2[2] += pa[6] * pb[6]; pc3[2] += pa[7] * pb[6]; - pc0[3] += pa[4] * pb[7]; pc1[3] += pa[5] * pb[7]; pc2[3] += pa[6] * pb[7]; pc3[3] += pa[7] * pb[7]; - - pa += 8; - pb += 8; - } - if(j < k) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; pc2[1] += pa[2] * pb[1]; pc3[1] += pa[3] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; pc2[2] += pa[2] * pb[2]; pc3[2] += pa[3] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; pc2[3] += pa[2] * pb[3]; pc3[3] += pa[3] * pb[3]; - - pa += 4; - pb += 4; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc0[1] = pc0[1] > 0 ? pc0[1] : 0; - pc0[2] = pc0[2] > 0 ? 
pc0[2] : 0; - pc0[3] = pc0[3] > 0 ? pc0[3] : 0; - - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - pc1[1] = pc1[1] > 0 ? pc1[1] : 0; - pc1[2] = pc1[2] > 0 ? pc1[2] : 0; - pc1[3] = pc1[3] > 0 ? pc1[3] : 0; - - pc2[0] = pc2[0] > 0 ? pc2[0] : 0; - pc2[1] = pc2[1] > 0 ? pc2[1] : 0; - pc2[2] = pc2[2] > 0 ? pc2[2] : 0; - pc2[3] = pc2[3] > 0 ? pc2[3] : 0; - - pc3[0] = pc3[0] > 0 ? pc3[0] : 0; - pc3[1] = pc3[1] > 0 ? pc3[1] : 0; - pc3[2] = pc3[2] > 0 ? pc3[2] : 0; - pc3[3] = pc3[3] > 0 ? pc3[3] : 0; - } - pc0 += 4; - pc1 += 4; - pc2 += 4; - pc3 += 4; - } - if(n2 > 0) { - pa = sa; - pc0[0] = pc0[1] = *bias; - pc1[0] = pc1[1] = *(bias + 1); - pc2[0] = pc2[1] = *(bias + 2); - pc3[0] = pc3[1] = *(bias + 3); - float *pb0 = pb; - float *pb1 = pb0 + k; - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; pc2[0] += pa[2] * pb0[0]; pc3[0] += pa[3] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; pc2[1] += pa[2] * pb1[0]; pc3[1] += pa[3] * pb1[0]; - - pc0[0] += pa[4] * pb0[1]; pc1[0] += pa[5] * pb0[1]; pc2[0] += pa[6] * pb0[1]; pc3[0] += pa[7] * pb0[1]; - pc0[1] += pa[4] * pb1[1]; pc1[1] += pa[5] * pb1[1]; pc2[1] += pa[6] * pb1[1]; pc3[1] += pa[7] * pb1[1]; - - pc0[0] += pa[8] * pb0[2]; pc1[0] += pa[9] * pb0[2]; pc2[0] += pa[10] * pb0[2]; pc3[0] += pa[11] * pb0[2]; - pc0[1] += pa[8] * pb1[2]; pc1[1] += pa[9] * pb1[2]; pc2[1] += pa[10] * pb1[2]; pc3[1] += pa[11] * pb1[2]; - - pc0[0] += pa[12] * pb0[3]; pc1[0] += pa[13] * pb0[3]; pc2[0] += pa[14] * pb0[3]; pc3[0] += pa[15] * pb0[3]; - pc0[1] += pa[12] * pb1[3]; pc1[1] += pa[13] * pb1[3]; pc2[1] += pa[14] * pb1[3]; pc3[1] += pa[15] * pb1[3]; - - pc0[0] += pa[16] * pb0[4]; pc1[0] += pa[17] * pb0[4]; pc2[0] += pa[18] * pb0[4]; pc3[0] += pa[19] * pb0[4]; - pc0[1] += pa[16] * pb1[4]; pc1[1] += pa[17] * pb1[4]; pc2[1] += pa[18] * pb1[4]; pc3[1] += pa[19] * pb1[4]; - - pc0[0] += pa[20] * pb0[5]; pc1[0] += pa[21] * pb0[5]; pc2[0] += pa[22] * pb0[5]; pc3[0] += pa[23] * pb0[5]; - pc0[1] += pa[20] * 
pb1[5]; pc1[1] += pa[21] * pb1[5]; pc2[1] += pa[22] * pb1[5]; pc3[1] += pa[23] * pb1[5]; - - pc0[0] += pa[24] * pb0[6]; pc1[0] += pa[25] * pb0[6]; pc2[0] += pa[26] * pb0[6]; pc3[0] += pa[27] * pb0[6]; - pc0[1] += pa[24] * pb1[6]; pc1[1] += pa[25] * pb1[6]; pc2[1] += pa[26] * pb1[6]; pc3[1] += pa[27] * pb1[6]; - - pc0[0] += pa[28] * pb0[7]; pc1[0] += pa[29] * pb0[7]; pc2[0] += pa[30] * pb0[7]; pc3[0] += pa[31] * pb0[7]; - pc0[1] += pa[28] * pb1[7]; pc1[1] += pa[29] * pb1[7]; pc2[1] += pa[30] * pb1[7]; pc3[1] += pa[31] * pb1[7]; - - pa += 32; - pb0 += 8; - pb1 += 8; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; pc2[0] += pa[2] * pb0[0]; pc3[0] += pa[3] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; pc2[1] += pa[2] * pb1[0]; pc3[1] += pa[3] * pb1[0]; - - pc0[0] += pa[4] * pb0[1]; pc1[0] += pa[5] * pb0[1]; pc2[0] += pa[6] * pb0[1]; pc3[0] += pa[7] * pb0[1]; - pc0[1] += pa[4] * pb1[1]; pc1[1] += pa[5] * pb1[1]; pc2[1] += pa[6] * pb1[1]; pc3[1] += pa[7] * pb1[1]; - - pc0[0] += pa[8] * pb0[2]; pc1[0] += pa[9] * pb0[2]; pc2[0] += pa[10] * pb0[2]; pc3[0] += pa[11] * pb0[2]; - pc0[1] += pa[8] * pb1[2]; pc1[1] += pa[9] * pb1[2]; pc2[1] += pa[10] * pb1[2]; pc3[1] += pa[11] * pb1[2]; - - pc0[0] += pa[12] * pb0[3]; pc1[0] += pa[13] * pb0[3]; pc2[0] += pa[14] * pb0[3]; pc3[0] += pa[15] * pb0[3]; - pc0[1] += pa[12] * pb1[3]; pc1[1] += pa[13] * pb1[3]; pc2[1] += pa[14] * pb1[3]; pc3[1] += pa[15] * pb1[3]; - - pa += 16; - pb0 += 4; - pb1 += 4; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; pc2[0] += pa[2] * pb0[0]; pc3[0] += pa[3] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; pc2[1] += pa[2] * pb1[0]; pc3[1] += pa[3] * pb1[0]; - - pc0[0] += pa[4] * pb0[1]; pc1[0] += pa[5] * pb0[1]; pc2[0] += pa[6] * pb0[1]; pc3[0] += pa[7] * pb0[1]; - pc0[1] += pa[4] * pb1[1]; pc1[1] += pa[5] * pb1[1]; pc2[1] += pa[6] * pb1[1]; pc3[1] += pa[7] * pb1[1]; - - pa += 8; - pb0 += 2; - pb1 += 
2; - } - if(j < k) { - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; pc2[0] += pa[2] * pb0[0]; pc3[0] += pa[3] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; pc2[1] += pa[2] * pb1[0]; pc3[1] += pa[3] * pb1[0]; - - pa += 4; - pb0 += 1; - pb1 += 1; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc0[1] = pc0[1] > 0 ? pc0[1] : 0; - - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - pc1[1] = pc1[1] > 0 ? pc1[1] : 0; - - pc2[0] = pc2[0] > 0 ? pc2[0] : 0; - pc2[1] = pc2[1] > 0 ? pc2[1] : 0; - - pc3[0] = pc3[0] > 0 ? pc3[0] : 0; - pc3[1] = pc3[1] > 0 ? pc3[1] : 0; - } - pc0 += 2; - pc1 += 2; - pc2 += 2; - pc3 += 2; - pb += 2 * k; - } - if(n1 > 0) { - pa = sa; - pc0[0] = *bias; - pc1[0] = *(bias + 1); - pc2[0] = *(bias + 2); - pc3[0] = *(bias + 3); - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - - pc0[0] += pa[4] * pb[1]; pc1[0] += pa[5] * pb[1]; pc2[0] += pa[6] * pb[1]; pc3[0] += pa[7] * pb[1]; - - pc0[0] += pa[8] * pb[2]; pc1[0] += pa[9] * pb[2]; pc2[0] += pa[10] * pb[2]; pc3[0] += pa[11] * pb[2]; - - pc0[0] += pa[12] * pb[3]; pc1[0] += pa[13] * pb[3]; pc2[0] += pa[14] * pb[3]; pc3[0] += pa[15] * pb[3]; - - pc0[0] += pa[16] * pb[4]; pc1[0] += pa[17] * pb[4]; pc2[0] += pa[18] * pb[4]; pc3[0] += pa[19] * pb[4]; - - pc0[0] += pa[20] * pb[5]; pc1[0] += pa[21] * pb[5]; pc2[0] += pa[22] * pb[5]; pc3[0] += pa[23] * pb[5]; - - pc0[0] += pa[24] * pb[6]; pc1[0] += pa[25] * pb[6]; pc2[0] += pa[26] * pb[6]; pc3[0] += pa[27] * pb[6]; - - pc0[0] += pa[28] * pb[7]; pc1[0] += pa[29] * pb[7]; pc2[0] += pa[30] * pb[7]; pc3[0] += pa[31] * pb[7]; - - pa += 32; - pb += 8; - - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - - pc0[0] += pa[4] * pb[1]; pc1[0] += pa[5] * pb[1]; pc2[0] += pa[6] * pb[1]; pc3[0] += pa[7] * pb[1]; - - pc0[0] += pa[8] * pb[2]; pc1[0] += pa[9] * pb[2]; pc2[0] += 
pa[10] * pb[2]; pc3[0] += pa[11] * pb[2]; - - pc0[0] += pa[12] * pb[3]; pc1[0] += pa[13] * pb[3]; pc2[0] += pa[14] * pb[3]; pc3[0] += pa[15] * pb[3]; - - pa += 16; - pb += 4; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - - pc0[0] += pa[4] * pb[1]; pc1[0] += pa[5] * pb[1]; pc2[0] += pa[6] * pb[1]; pc3[0] += pa[7] * pb[1]; - - pa += 8; - pb += 2; - } - if(j < k) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - - pa += 4; - pb += 1; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - - pc2[0] = pc2[0] > 0 ? pc2[0] : 0; - - pc3[0] = pc3[0] > 0 ? pc3[0] : 0; - } - pc0 += 1; - pc1 += 1; - pc2 += 1; - pc3 += 1; - } -#endif // __riscv_vector -} - - -static inline void kernel_m4_f32_1(float* dst, float* sa, float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 - - "flw fs0, 0(%2)\n\t" - "flw fs1, 4(%2)\n\t" - "flw fs2, 8(%2)\n\t" - "flw fs3, 12(%2)\n\t" - - // init output addr - "slli t5, %6, 2\n\t" // t5_tmp = ldx * 4 - "mv a0, %3\n\t" - "add a1, a0, t5\n\t" - "add a2, a1, t5\n\t" - "add a3, a2, t5\n\t" - - "srai t0, %5, 2\n\t" // t0 = n >> 2 (n4) - "beqz t0, 4f\n\t" - - "1:\n\t" // m4n4 - // start kernel_m4n4 - "vfmv.v.f v24, fs0\n\t" - "vfmv.v.f v25, fs1\n\t" - "vfmv.v.f v26, fs2\n\t" - "vfmv.v.f v27, fs3\n\t" // init acc = bias - - "mv t6, %0\n\t" // t6 hold kernel 4 lines start addr - "mv t5, %4\n\t" // t5 = k (k > 0) - - "2:\n\t" - // start subkernel_m4n4k1 - "vle.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw fa0, 0(t6)\n\t" - "flw fa1, 4(t6)\n\t" - "flw fa2, 8(t6)\n\t" - "flw fa3, 12(t6)\n\t" - "addi t6, t6, 16\n\t" - - "vfmacc.vf v24, fa0, v1\n\t" - "vfmacc.vf v25, fa1, v1\n\t" - "vfmacc.vf v26, fa2, v1\n\t" - "vfmacc.vf v27, fa3, v1\n\t" - - "addi t5, t5, -1\n\t" - "bnez t5, 2b\n\t" - - 
"3:\n\t" // end kernel_m4n4 - - "vse.v v24, (a0)\n\t" - "addi a0, a0, 16\n\t" - "vse.v v25, (a1)\n\t" - "addi a1, a1, 16\n\t" - "vse.v v26, (a2)\n\t" - "addi a2, a2, 16\n\t" - "vse.v v27, (a3)\n\t" - "addi a3, a3, 16\n\t" - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "4:\n\t" // m4n2 - "andi t0, %5, 3\n\t" // n & 3 - "srai t0, t0, 1\n\t" // (n & 3) >> 2 - "beqz t0, 7f\n\t" // jump to m4n1 - // start kernel_m4n2 - "vle.v v24, (%2)\n\t" - "vle.v v25, (%2)\n\t" // init acc = bias - - // init addr for pa, pb and pc - "slli t0, %4, 2\n\t" // t0_tmp = k * 4 - - "mv t6, %0\n\t" // t6 hold pa(kernel) 2 lines start addr - - "mv a4, %1\n\t" - "add a5, a4, t0\n\t" // a4-a5 hold pb(input) 2 cols addr - - "addi a1, a0, 4\n\t" // a0-a1 hold pc(output) addr - - "mv t5, %4\n\t" // t5 = k - - "5:\n\t" - // start subkernel_m4n2k1 - "vle.v v1, (t6)\n\t" - "addi t6, t6, 16\n\t" - "flw fa0, 0(a4)\n\t" - "vfmacc.vf v24, fa0, v1\n\t" - "flw fa1, 0(a5)\n\t" - "vfmacc.vf v25, fa1, v1\n\t" - - "addi a4, a4, 4\n\t" - "addi a5, a5, 4\n\t" - - "addi t5, t5, -1\n\t" - "bnez t5, 5b\n\t" - - "6:\n\t" // end kernel_m4n2 - "slli t0, %6, 2\n\t" // t0_tmp = ldx * 4 (store_stride) - - "vsse.v v24, (a0), t0\n\t" - "vsse.v v25, (a1), t0\n\t" - - "addi a0, a0, 8\n\t" // updata output start addr ( +2 cols) - "slli t0, %4, 3\n\t" // t_tmp = k * 2 * 4 - "add %1, %1, t0\n\t" // updata pb start addr - - - "7:\n\t" // m4n1 - "andi t0, %5, 1\n\t" // n & 1 - "beqz t0, 10f\n\t" // jump to ending - // start kernel_m8n1 - - "vle.v v24, (%2)\n\t" // init out_tmp = bias - - // init addr for pa, pb and pc - "mv t6, %0\n\t" // t6 hold pa(kernel) 8 lines start addr - "mv a4, %1\n\t" // a4 hold pb(input) 1 cols addr - // a0 hold pc(output) addr - - "mv t5, %4\n\t" // t5 = k - - "8:\n\t" - // start subkernel_m8n1k8 - "vle.v v1, (t6)\n\t" - "addi t6, t6, 16\n\t" - "flw fa0, 0(a4)\n\t" - "vfmacc.vf v24, fa0, v1\n\t" // 0 - - "addi a4, a4, 4\n\t" - - "addi t5, t5, -1\n\t" - "bnez t5, 8b\n\t" - - "9:\n\t" // end 
kernel_m8n1 - "slli t0, %6, 2\n\t" // t0_tmp = ldx * 4 (store_stride) - - "vsse.v v24, (a0), t0\n\t" - - "10:\n\t" // ending - - - :"=r"(sa), // %0 - "=r"(sb), // %1 - "=r"(bias), // %2 - "=r"(dst), // %3 - "=r"(k), // %4 - "=r"(n), // %5 - "=r"(ldc) // %6 - :"0"(sa), - "1"(sb), - "2"(bias), - "3"(dst), - "4"(k), - "5"(n), - "6"(ldc) - :"v1", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", - "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t5", "t6", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7" - ); - -} - - -void csi_c906_sgemm_kernel_f32(float* dst, const float* sa, const float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - float* pa = (float *)sa; - float* pb = (float *)sb; - float* pc = dst; - - bool flag_bias = 1; // default: conv2d layer include bias - if (bias == NULL) { - flag_bias = 0; - bias = (float *)csi_mem_alloc(m * 4); - } - float *bias_tmp = bias; - - const int mm = (m >> 2) << 2; - - for (int i = 0; i < mm; i += 4) { - kernel_m4_f32_1(pc + i * ldc, pa + i * k, pb, m, k, n, ldc, bias_tmp + i, fuse_relu); - } - - pa += mm * k; - pc += mm * ldc; - bias_tmp += mm; - - switch (m - mm) { - case 3: - kernel_m2_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); - pc += 2 * ldc; - pa += 2 * k; - bias_tmp += 2; - kernel_m1_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); - break; - case 2: - kernel_m2_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); - break; - case 1: - kernel_m1_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); - break; - case 0: - break; - default: - break; - } - if (!flag_bias) { - csi_mem_free(bias); - bias = NULL; - } -} diff --git a/source/c906_opt/shl_c906_u8_to_f32.S b/source/c906_opt/shl_c906_u8_to_f32.S new file mode 100644 index 00000000..243dda3c --- /dev/null +++ b/source/c906_opt/shl_c906_u8_to_f32.S @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void shl_c906_u8_to_f32(const uint8_t *input, + float *output, + int32_t offset, + float *scale, + uint32_t length) + + Algorithm works as follows: + (1) + + register definition: + a0: input addr + a1: output addr + a2: offset + a3: scale point + a4: element length + + note: vector extension 0.7.1 [support flexible vlen] + + *************************************************************************************************/ + .file "shl_c906_u8_to_f32.S" + .section .text.shl_c906_u8_to_f32, "ax", @progbits + .align 5 + .global shl_c906_u8_to_f32 + .type shl_c906_u8_to_f32, @function + +shl_c906_u8_to_f32: + csrr t0, vlenb // t0 = vlen/8 + slli t1, t0, 1 + flw fa0, (a3) + slli t2, t0, 2 + vsetvli zero, zero, e32, m4 + vfmv.v.f v28, fa0 + +.L2: + bgt t1, a4, .L1 + vsetvli zero, zero, e8, m1 + vle.v v0, (a0) + add a0, a0, t0 + vle.v v1, (a0) + add a0, a0, t0 + + sub a4, a4, t1 + bgt t1, a4, .L2End + +.L2Loop: + vsetvli zero, zero, e16, m2 + vwaddu.vx v2, v0, zero + vwaddu.vx v4, v1, zero + + vsetvli zero, zero, e8, m1 + vle.v v0, (a0) + add a0, a0, t0 + vle.v v1, (a0) + add a0, a0, t0 + + vsetvli zero, zero, e32, m4 + vwsub.vx v8, v2, a2 + vwsub.vx v12, v4, a2 + vfcvt.f.x.v v16, v8 + vfcvt.f.x.v v20, v12 + vfmul.vv v8, v16, v28 + vfmul.vv 
v12, v20, v28 + vse.v v8, (a1) + add a1, a1, t2 + vse.v v12, (a1) + add a1, a1, t2 + + sub a4, a4, t1 + bgt a4, t1, .L2Loop // xxx: >= + +.L2End: + vsetvli zero, zero, e16, m2 + vwaddu.vx v2, v0, zero + vwaddu.vx v4, v1, zero + vsetvli zero, zero, e32, m4 + vwsub.vx v8, v2, a2 + vwsub.vx v12, v4, a2 + + vfcvt.f.x.v v16, v8 + vfcvt.f.x.v v20, v12 + + vfmul.vv v8, v16, v28 + vfmul.vv v12, v20, v28 + + vse.v v8, (a1) + add a1, a1, t2 + vse.v v12, (a1) + add a1, a1, t2 + +.L1: + beqz a4, .End + +.L1Loop: + vsetvli t0, a4, e8, m1 + slli t1, t0, 2 + vle.v v0, (a0) + add a0, a0, t0 + vsetvli t0, a4, e16, m2 + vwaddu.vx v2, v0, zero + vsetvli t0, a4, e32, m4 + vwsub.vx v4, v2, a2 + vfcvt.f.x.v v8, v4 + vfmul.vv v4, v8, v28 + vse.v v4, (a1) + add a1, a1, t1 + + sub a4, a4, t0 + bgtz a4, .L1Loop + +.End: + ret + .end diff --git a/source/c906_opt/split.c b/source/c906_opt/split.c index d8b72298..bcd84383 100644 --- a/source/c906_opt/split.c +++ b/source/c906_opt/split.c @@ -18,11 +18,10 @@ /* CSI-NN2 version 1.9.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_split_f32(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int shl_c906_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { int32_t inner_size = 1; @@ -56,16 +55,15 @@ int csi_c906_split_f32(struct csi_tensor *input, for (int out = 0; out < out_size; out++) { int in_index = out * input->dim[params->axis] * inner_size + s_index * inner_size; int out_index = out * inner_size; - csi_c906_memcpy(output_i_data + out_index, input_data + in_index, p_size * sizeof(float)); + shl_c906_memcpy(output_i_data + out_index, input_data + in_index, + p_size * sizeof(float)); } } return CSINN_TRUE; } - -int csi_c906_split_fp16(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int shl_c906_split_fp16(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) 
{ int32_t inner_size = 1; int32_t out_size = 1; @@ -98,7 +96,8 @@ int csi_c906_split_fp16(struct csi_tensor *input, for (int out = 0; out < out_size; out++) { int in_index = out * input->dim[params->axis] * inner_size + s_index * inner_size; int out_index = out * inner_size; - csi_c906_memcpy(output_i_data + out_index, input_data + in_index, p_size * sizeof(__fp16)); + shl_c906_memcpy(output_i_data + out_index, input_data + in_index, + p_size * sizeof(__fp16)); } } return CSINN_TRUE; diff --git a/source/c906_opt/sub.c b/source/c906_opt/sub.c index f091f48f..c03a526f 100644 --- a/source/c906_opt/sub.c +++ b/source/c906_opt/sub.c @@ -16,51 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" static void element_sub_f32(float *input0, float *input1, float *output, int size) { asm volatile( - "1:\n\t" - "vsetvli t0, %3, e32, m2\n\t" - "vle.v v8, (%1)\n\t" - "sub %3, %3, t0\n\t" - "slli t0, t0, 2\n\t" // element: 4 bytes - "add %1, %1, t0\n\t" - "vle.v v12, (%2)\n\t" - "add %2, %2, t0\n\t" - "vfsub.vv v16, v8, v12\n\t" - "vse.v v16, (%0)\n\t" - "add %0, %0, t0\n\t" - "bnez %3, 1b\n\t" - - :"=r"(output), // %0 - "=r"(input0), // %1 - "=r"(input1), // %2 - "=r"(size) // %3 - :"0"(output), - "1"(input0), - "2"(input1), - "3"(size) - : "v8", "v9", "v12", "v13", "v16", "v17", "t0" - ); + "1:\n\t" + "vsetvli t0, %3, e32, m2\n\t" + "vle.v v8, (%1)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 2\n\t" // element: 4 bytes + "add %1, %1, t0\n\t" + "vle.v v12, (%2)\n\t" + "add %2, %2, t0\n\t" + "vfsub.vv v16, v8, v12\n\t" + "vse.v v16, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, 1b\n\t" + + : "=r"(output), // %0 + "=r"(input0), // %1 + "=r"(input1), // %2 + "=r"(size) // %3 + : "0"(output), "1"(input0), "2"(input1), "3"(size) + : "v8", "v9", "v12", "v13", "v16", "v17", "t0"); } -int csi_c906_sub_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - 
struct diso_params *params) +int shl_c906_sub_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // HACK: special case // example: [1, 64, 55, 55] + [1, 64, 1, 1] = [1, 64, 55, 55] @@ -98,28 +91,24 @@ int csi_c906_sub_f32(struct csi_tensor *input0, // example: [1, 3, 224, 224] + [1] = [1, 3, 224, 224] if (in_size1 == 1) { asm volatile( - "flw ft0, 0(%2)\n\t" - "1:\n\t" - "vsetvli t0, %3, e32, m2\n\t" - "vle.v v8, (%1)\n\t" - "sub %3, %3, t0\n\t" - "slli t0, t0, 2\n\t" // element: 4 bytes - "add %1, %1, t0\n\t" - "vfsub.vf v16, v8, ft0\n\t" - "vse.v v16, (%0)\n\t" - "add %0, %0, t0\n\t" - "bnez %3, 1b\n\t" + "flw ft0, 0(%2)\n\t" + "1:\n\t" + "vsetvli t0, %3, e32, m2\n\t" + "vle.v v8, (%1)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 2\n\t" // element: 4 bytes + "add %1, %1, t0\n\t" + "vfsub.vf v16, v8, ft0\n\t" + "vse.v v16, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, 1b\n\t" - :"=r"(output_data), // %0 - "=r"(input0_data), // %1 - "=r"(input1_data), // %2 - "=r"(out_size) // %3 - :"0"(output_data), - "1"(input0_data), - "2"(input1_data), - "3"(out_size) - : "v8", "v9", "v16", "v17", "t0", "ft0" - ); + : "=r"(output_data), // %0 + "=r"(input0_data), // %1 + "=r"(input1_data), // %2 + "=r"(out_size) // %3 + : "0"(output_data), "1"(input0_data), "2"(input1_data), "3"(out_size) + : "v8", "v9", "v16", "v17", "t0", "ft0"); } // example: [1, 3, 224, 224] + [1, 3, 224, 224] = [1, 3, 224, 224] else if (in_size0 == in_size1) { @@ -135,31 +124,31 @@ int csi_c906_sub_f32(struct csi_tensor *input0, } // example: [1, 3, 
224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); - - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_sub_f32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } - // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] + // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or [1, 3, 224, 224] + [224, 224] = + // [1, 3, 224, 224] else { int inner_size = in_size1; int outer_size = out_size / in_size1; @@ -173,48 +162,40 @@ int csi_c906_sub_f32(struct csi_tensor *input0, return CSINN_TRUE; } - static void element_sub_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int size) { asm volatile( - "1:\n\t" - "vsetvli t0, %3, e16, m2\n\t" - "vle.v v8, (%1)\n\t" - "sub %3, %3, t0\n\t" - "slli t0, t0, 
1\n\t" // element: 2 bytes - "add %1, %1, t0\n\t" - "vle.v v12, (%2)\n\t" - "add %2, %2, t0\n\t" - "vfsub.vv v16, v8, v12\n\t" - "vse.v v16, (%0)\n\t" - "add %0, %0, t0\n\t" - "bnez %3, 1b\n\t" - - :"=r"(output), // %0 - "=r"(input0), // %1 - "=r"(input1), // %2 - "=r"(size) // %3 - :"0"(output), - "1"(input0), - "2"(input1), - "3"(size) - : "v8", "v9", "v12", "v13", "v16", "v17", "t0" - ); + "1:\n\t" + "vsetvli t0, %3, e16, m2\n\t" + "vle.v v8, (%1)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 1\n\t" // element: 2 bytes + "add %1, %1, t0\n\t" + "vle.v v12, (%2)\n\t" + "add %2, %2, t0\n\t" + "vfsub.vv v16, v8, v12\n\t" + "vse.v v16, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, 1b\n\t" + + : "=r"(output), // %0 + "=r"(input0), // %1 + "=r"(input1), // %2 + "=r"(size) // %3 + : "0"(output), "1"(input0), "2"(input1), "3"(size) + : "v8", "v9", "v12", "v13", "v16", "v17", "t0"); } - -int csi_c906_sub_fp16(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_sub_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); if ((input1->dim[2] == 1) && (input1->dim[3] == 1) && (input1->dim[1] == input0->dim[1])) { int inner_size = input0->dim[2] * input0->dim[3]; @@ -249,28 +230,24 @@ int csi_c906_sub_fp16(struct csi_tensor *input0, if (in_size1 == 1) { asm volatile( - "flh ft0, 0(%2)\n\t" - "1:\n\t" - "vsetvli t0, %3, e16, m2\n\t" - "vle.v v8, (%1)\n\t" - "sub %3, %3, t0\n\t" - "slli t0, t0, 1\n\t" // element: 4 bytes - "add %1, %1, t0\n\t" - 
"vfsub.vf v16, v8, ft0\n\t" - "vse.v v16, (%0)\n\t" - "add %0, %0, t0\n\t" - "bnez %3, 1b\n\t" + "flh ft0, 0(%2)\n\t" + "1:\n\t" + "vsetvli t0, %3, e16, m2\n\t" + "vle.v v8, (%1)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 1\n\t" // element: 2 bytes + "add %1, %1, t0\n\t" + "vfsub.vf v16, v8, ft0\n\t" + "vse.v v16, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, 1b\n\t" - :"=r"(output_data), // %0 - "=r"(input0_data), // %1 - "=r"(input1_data), // %2 - "=r"(out_size) // %3 - :"0"(output_data), - "1"(input0_data), - "2"(input1_data), - "3"(out_size) - : "v8", "v9", "v16", "v17", "t0", "ft0" - ); + : "=r"(output_data), // %0 + "=r"(input0_data), // %1 + "=r"(input1_data), // %2 + "=r"(out_size) // %3 + : "0"(output_data), "1"(input0_data), "2"(input1_data), "3"(out_size) + : "v8", "v9", "v16", "v17", "t0", "ft0"); } else if (in_size0 == in_size1) { element_sub_fp16(input0_data, input1_data, output_data, out_size); } else { @@ -281,29 +258,28 @@ int csi_c906_sub_fp16(struct csi_tensor *input0, } } if (!flag) { + __fp16 *in0_data_b = shl_mem_alloc(out_size * 2); + __fp16 *in1_data_b = shl_mem_alloc(out_size * 2); - __fp16 *in0_data_b = csi_mem_alloc(out_size * 2); - __fp16 *in1_data_b = csi_mem_alloc(out_size * 2); - - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim,
output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_sub_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } else { int inner_size = in_size1; int outer_size = out_size / in_size1; diff --git a/source/c906_opt/sum.c b/source/c906_opt/sum.c index 9514bd96..57e0ee90 100644 --- a/source/c906_opt/sum.c +++ b/source/c906_opt/sum.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" // reduce_sum -int csi_c906_sum_stride_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_c906_sum_stride_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/transpose.c b/source/c906_opt/transpose.c index 93d39de9..4237a173 100644 --- a/source/c906_opt/transpose.c +++ b/source/c906_opt/transpose.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_transpose_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params) +int shl_c906_transpose_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { if (params->permute_num == 4 && params->permute[0] == 0 && params->permute[1] == 2 && params->permute[2] == 1 && params->permute[3] == 3) { @@ -77,5 +77,5 @@ int csi_c906_transpose_fp16(struct csi_tensor *input, struct csi_tensor *output, } return CSINN_TRUE; } - return csi_ref_siso_callback_base(input, output, params, csi_ref_transpose); + return shl_ref_siso_callback_base(input, output, params, shl_ref_transpose); } diff --git a/source/c906_opt/utils.c b/source/c906_opt/utils.c index 6a352b79..3489f974 100644 --- a/source/c906_opt/utils.c +++ b/source/c906_opt/utils.c @@ -16,14 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" // constrains: The destination address and source address copy do not overlap // notice: riscv gnu compiler tool-chain c-library memcpy may not use vector inst // now gcc version: gcc version 10.2.0 (T-HEAD RISCV Tools V2.0.1 B20210512) -void csi_c906_memcpy(void *dst, const void *src, size_t n) +void shl_c906_memcpy(void *dst, const void *src, size_t n) { asm volatile( "1:\n\t" @@ -56,7 +56,7 @@ void csi_c906_memcpy(void *dst, const void *src, size_t n) pad_top: origin pad top pad_left: origin pad left */ -void csi_c906_pad_input(const float *input, float *input_padded, int inc, int inh, int inw, +void shl_c906_pad_input(const float *input, float *input_padded, int inc, int inh, int inw, int padded_h, int padded_w, int pad_top, int pad_left) { int padded_hw = padded_h * padded_w; @@ -192,8 +192,7 @@ void csi_c906_pad_input(const float *input, float *input_padded, int inc, int in #endif // 
__riscv_vector } - -void csi_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, +void shl_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, int padded_h, int padded_w, int pad_top, int pad_left) { int padded_hw = padded_h * padded_w; @@ -315,7 +314,7 @@ void csi_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, wino_h: winograd conv out_h, alignment with 2/4/6 wino_w: winograd conv out_w, alignment with 2/4/6 */ -void csi_c906_crop_output(float *output_trans, float *output, int out_c, int out_h, int out_w, +void shl_c906_crop_output(float *output_trans, float *output, int out_c, int out_h, int out_w, int wino_h, int wino_w) { int resi_h = wino_h - out_h; @@ -333,8 +332,8 @@ void csi_c906_crop_output(float *output_trans, float *output, int out_c, int out } } -void csi_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, int out_h, int out_w, - int wino_h, int wino_w) +void shl_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, int out_h, + int out_w, int wino_h, int wino_w) { int resi_h = wino_h - out_h; int resi_w = wino_w - out_w; @@ -370,17 +369,9 @@ void csi_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, 0: NX - 非精确异常 */ -void csi_c906_reset_fcsr() -{ - asm volatile( - "csrrw x0, fcsr, zero\n\t" - : - : - :"memory" - ); -} +void shl_c906_reset_fcsr() { asm volatile("csrrw x0, fcsr, zero\n\t" : : : "memory"); } -int csi_c906_get_fcsr() +int shl_c906_get_fcsr() { int f_flag = 0; asm volatile( diff --git a/source/c908_opt/avgpool.c b/source/c908_opt/avgpool.c new file mode 100644 index 00000000..54688307 --- /dev/null +++ b/source/c908_opt/avgpool.c @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +int shl_c908_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(float); + + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp32 + : shl_rvv_global_avgpool2d_fp32; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? 
shl_rvv_avgpool2x2s2_packn_fp32 + : shl_rvv_avgpool2x2s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp32 + : shl_rvv_avgpool2x2s2_p1_fp32; + } + } else if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down must be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp32 + : shl_rvv_avgpool3x3s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp32 + : shl_rvv_avgpool3x3s2_p1_fp32; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ?
shl_rvv_avgpool3x3s1_packn_fp32 + : shl_rvv_avgpool3x3s1_p1_fp32; + } + } + } + + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_f32; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp16 + : shl_rvv_global_avgpool2d_fp16; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp16 + : shl_rvv_avgpool2x2s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_avgpool2x2s2_packn_fp16 + : shl_rvv_avgpool2x2s2_p1_fp16; + } + } else if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down must be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp16 + : shl_rvv_avgpool3x3s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp16 + : shl_rvv_avgpool3x3s2_p1_fp16; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s1_packn_fp16 + : shl_rvv_avgpool3x3s1_p1_fp16; + } + } + } + + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // global avgpool2d +
if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_int8 + : shl_ref_global_avgpool2d_quant; + return CSINN_TRUE; + } + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // fixme: consider ncxhwx + } +} + +int shl_c908_avgpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + return CSINN_FALSE; +} diff --git a/source/c908_opt/convolution.c b/source/c908_opt/convolution.c new file mode 100644 index 00000000..6897836d --- /dev/null +++ b/source/c908_opt/convolution.c @@ -0,0 +1,408 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(float); + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packn_fp32; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_fp32; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b4f3s1_packn_fp32; + } else { + shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b6f3s1_packn_fp32; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); 
+ cb->exec = shl_c908_conv_im2col_gemm_packn_fp32; + } + } + + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_fp32; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_pack1ton_fp32; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packnto1_fp32; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packnto1_fp32; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_fp32(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_fp32; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_fp32(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_fp32; + } + } + return CSINN_TRUE; +} + +int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = 
params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packn_fp16; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_fp16; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b4f3s1_packn_fp16; + } else { + shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b6f3s1_packn_fp16; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_fp16; + } + } + + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_fp16; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + cb->exec = 
shl_c908_conv_im2col_gemm_pack1ton_fp16; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packnto1_fp16; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packnto1_fp16; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_fp16; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_fp16; + } + } + return CSINN_TRUE; +} + +int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + 
shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packn_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_int8; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_int8(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b4f3s1_packn_int8; + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_int8; + } + } + + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_int8; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_pack1ton_int8; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); + cb->exec = 
shl_c908_conv1x1s1_gemm_packnto1_int8; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packnto1_int8; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_int8; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_int8; + } + } + + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + // trick for winograd b4f3 + if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { + real_scale = real_scale / 576.0f; + } + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + + // enable fuse zeropoint to bias for gemm + if (params->conv_extra.conv_mode == CSINN_GEMM) { + if (!params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + bias->data = bias_data; + } + int kernel_inner = in_c * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner + j] * input_zp; + } + bias_data[oc] -= tmp; + } + } + } + + // recover fuse zeropoint to bias for winograd + if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { + if (params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = 
(int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + int kernel_inner = in_c * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner + j] * input_zp; + } + bias_data[oc] += tmp; + } + } + } + return CSINN_TRUE; +} + +int shl_c908_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + // xxx: only int4 support nhwc layout now + if (input->layout == CSINN_LAYOUT_NHWC) { + out_c = kernel->dim[0]; + in_c = kernel->dim[3]; + in_h = input->dim[1]; + in_w = input->dim[2]; + kernel_h = kernel->dim[1]; + kernel_w = kernel->dim[2]; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (input->dtype == CSINN_DTYPE_INT4) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(kernel, params); + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = + input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + cb->exec = shl_rvv_conv1x1s1_gemm_int4; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + if (input->dtype == CSINN_DTYPE_INT4) { + params->conv_extra.kernel_tm = 
csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_int4(kernel, params); + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = + input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + cb->exec = shl_rvv_conv_im2col_gemm_int4; + } + } + return CSINN_TRUE; + } + return CSINN_FALSE; +} diff --git a/source/c908_opt/convolution_1x1_fp16.c b/source/c908_opt/convolution_1x1_fp16.c new file mode 100644 index 00000000..71b44b72 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp16.c @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // out_ch + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_c908_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + const int vlen = csrr_vlenb() * 8; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *pa = kernel_data + g * m * k; + __fp16 *pb = pb_reorder; + __fp16 *pc = output_data; + if (vlen == 128) { + // pack + shl_c908_reorder_input_z24_fp16(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x24_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z32_fp16_v256(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x32_fp16_v256(pc, pa, pb, 
bias_data + g * m, m, k, n, n); + } + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp16_pack1ton.c b/source/c908_opt/convolution_1x1_fp16_pack1ton.c new file mode 100644 index 00000000..48b32f06 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp16_pack1ton.c @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + shl_rvv_reorder_input_pack1ton_fp16(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_fp16(input_ncxhwx, in_ptr, k, 1, n, n); + + // gemm + shl_c908_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + false); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp16_packn.c b/source/c908_opt/convolution_1x1_fp16_packn.c new file mode 100644 index 00000000..6ded7e80 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp16_packn.c @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + // pack + shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + // GEMM + shl_c908_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + false); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp16_packnto1.c b/source/c908_opt/convolution_1x1_fp16_packnto1.c new file mode 100644 index 00000000..6f549ccc --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp16_packnto1.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + // GEMM + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, + n, false); + + shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp32.c b/source/c908_opt/convolution_1x1_fp32.c new file mode 100644 index 00000000..a9f66b05 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp32.c @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // out_ch / group + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); + for (int g = 0; g < group; g++) { + shl_c908_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_c908_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + + const int vlen = csrr_vlenb() * 8; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *pa = kernel_data + g * m * k; + float *pb = pb_reorder; + float *pc = output_data; + if (vlen == 128) { + // pack + shl_c908_reorder_input_z12_fp32(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x12_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z16_fp32_v256(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x16_fp32_v256(pc, pa, pb, bias_data + g 
* m, m, k, n, n); + } + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp32_pack1ton.c b/source/c908_opt/convolution_1x1_fp32_pack1ton.c new file mode 100644 index 00000000..55404df5 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp32_pack1ton.c @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *input_ncxhwx = (float *)shl_mem_alloc(k * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + shl_rvv_reorder_input_pack1ton_fp32(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_fp32(input_ncxhwx, in_ptr, k, 1, n, n); + + // gemm + // shl_rvv_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + // n); + shl_c908_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + false); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp32_packn.c b/source/c908_opt/convolution_1x1_fp32_packn.c new file mode 100644 index 00000000..083e6132 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp32_packn.c @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + + // float *input_ncxhwx = (float *)shl_mem_alloc(k * n * sizeof(float)); + // float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // shl_rvv_reorder_input_pack1ton_fp32(input_data, input_ncxhwx, k, out_h, out_w); + + // pack + shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + // GEMM + // shl_rvv_ncxhwx_gemm_12xpack2n_fp32(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_c908_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + false); + + // shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + // shl_mem_free(input_ncxhwx); + // shl_mem_free(output_ncxhwx); + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp32_packnto1.c b/source/c908_opt/convolution_1x1_fp32_packnto1.c new file mode 100644 index 00000000..53b7e1b3 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp32_packnto1.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + // GEMM + shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, + n, false); + + shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_int8.c b/source/c908_opt/convolution_1x1_int8.c new file mode 100644 index 00000000..607570de --- /dev/null +++ b/source/c908_opt/convolution_1x1_int8.c @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // out_ch + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + int k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); + int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; + + for (int g = 0; g < group; g++) { + shl_c908_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, k); + } +} + +int shl_c908_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + // int8_t *kernel_data = (int8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + int32_t k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + const int vlen = csrr_vlenb() * 8; + + int j = 0; + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + int8_t *pa = kernel_data + g * m * k4; + int8_t *pb = pb_reorder; + int8_t *pc = output_data; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + if (vlen == 128) { + // pack + shl_c908_reorder_input_z8_int8(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x8_int8(pc, pa, pb, bias_data + g * m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z16_int8_v256(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x16_int8_v256(pc, pa, pb, bias_data + g * m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); + } + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_int8_pack1ton.c b/source/c908_opt/convolution_1x1_int8_pack1ton.c new file mode 100644 index 00000000..93ee4223 --- /dev/null +++ b/source/c908_opt/convolution_1x1_int8_pack1ton.c @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/*************************************************************************************
+ * Reorder kernel_data in place, which means the original kernel_data is destroyed.
+ * The reason to do this is that the packing process must not consume more memory.
+ *
+ * Thin wrapper: forwards to the im2col/gemm pack1ton kernel reorder routine.
+ **************************************************************************************/
+void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel,
+                                                          struct csinn_conv2d_params *params)
+{
+    shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params);
+}
+
+/*************************************************************************************
+ * Interleave a planar int8 input into channel groups of `vl` channels.
+ *
+ * src: planar input, channel stride is inh * inw elements
+ * dst: destination; for every pixel `vl` channel values are written and the
+ *      write pointer then advances by vl4 (vl rounded up to a multiple of 4)
+ * inc: number of channels to convert; the last group may be shorter than packn
+ *
+ * The (vl4 - vl) trailing bytes of each pixel are never written here.
+ * NOTE(review): presumably the caller relies on dst being zero-filled by the
+ * allocator so that this padding reads as 0 - confirm against shl_mem_alloc.
+ **************************************************************************************/
+static void reorder_input_pack1ton_align4_int8(const int8_t *src, int8_t *dst, int inc, int inh,
+                                               int inw)
+{
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+    int vl = vsetvl_e8mf2(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    while (inc > 0) {
+        vl = vsetvl_e8mf2(inc);          // channels handled in this group
+        int vl4 = ((vl - 1) & -4) + 4;   // vl rounded up to a multiple of 4
+        int8_t *in_ptr = (int8_t *)src;
+        for (int i = 0; i < inh; i++) {
+            for (int j = 0; j < inw; j++) {
+                // strided load: one pixel from each of the vl channel planes
+                vint8mf2_t _tmp = vlse8_v_i8mf2(in_ptr, in_size * sizeof(int8_t), vl);
+                in_ptr++;
+                vse8_v_i8mf2(dst, _tmp, vl);
+                dst += vl4;
+            }
+        }
+        src += in_size * vl;
+        inc -= vl;
+    }
+}
+
+/*************************************************************************************
+ * 1x1 convolution computed as a GEMM over the pack1ton input reorder.
+ *
+ * For every batch and every group the function
+ *   1. copies the per-output-channel multiplier/shift out of kernel->qinfo
+ *      (one entry per channel when quant_channel > 1, entry 0 for all channels
+ *      when quant_channel == 1),
+ *   2. interleaves the planar input (reorder_input_pack1ton_align4_int8),
+ *   3. packs it for the gemm (shl_rvv_reorder_input_z12_pack1ton_int8),
+ *   4. runs shl_c908_ncxhwx_gemm_12xpackn_int8 directly into output->data.
+ *
+ * bias->data may be NULL, in which case a NULL bias pointer is handed to the gemm.
+ * The reordered kernel is read from params->conv_extra.kernel_tm.
+ *
+ * Returns CSINN_TRUE.
+ **************************************************************************************/
+int shl_c908_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                          struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                          struct csinn_conv2d_params *params)
+{
+    int8_t *input_data = (int8_t *)input->data;
+    int8_t *output_data = (int8_t *)output->data;
+    int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data;
+    int32_t *bias_data = (int32_t *)bias->data;
+
+    int32_t group = params->group;
+    int32_t batch = input->dim[0];
+    int32_t in_c = input->dim[1];
+    int32_t out_c = kernel->dim[0];
+    int32_t out_h = output->dim[2];
+    int32_t out_w = output->dim[3];
+
+    int32_t m = out_c / group;
+    int32_t k = in_c / group;
+    int32_t n = out_h * out_w;
+    int32_t k4 = ((k - 1) & -4) + 4;  // k rounded up to a multiple of 4
+
+    // Scratch buffers; every one of them is released before returning.
+    // (The former output_ncxhwx buffer was allocated but never used nor freed
+    // and has been removed: the gemm writes straight into output->data.)
+    int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t));
+    int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t));
+    int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+    int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+
+    for (int i = 0; i < batch; i++) {
+        // j walks kernel->qinfo across all groups and restarts for every batch
+        for (int g = 0, j = 0; g < group; g++) {
+            int8_t *kernel_ptr = kernel_data + g * m * k4;
+            int8_t *in_ptr = pb_reorder;
+            int8_t *out_ptr = output_data;
+            int32_t *bias_ptr = bias_data ? (bias_data + g * m) : NULL;
+
+            if (kernel->quant_channel > 1) {
+                for (int c = 0; c < m; c++, j++) {
+                    multiplier[c] = kernel->qinfo[j].multiplier;
+                    shift[c] = kernel->qinfo[j].shift;
+                }
+            } else if (kernel->quant_channel == 1) {
+                for (int c = 0; c < m; c++) {
+                    multiplier[c] = kernel->qinfo[0].multiplier;
+                    shift[c] = kernel->qinfo[0].shift;
+                }
+            }
+
+            reorder_input_pack1ton_align4_int8(input_data, input_ncxhwx, k, out_h, out_w);
+
+            // reorder(pack)
+            shl_rvv_reorder_input_z12_pack1ton_int8(input_ncxhwx, in_ptr, k4, 1, n, n);
+
+            // gemm
+            shl_c908_ncxhwx_gemm_12xpackn_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k4, n,
+                                               output->qinfo->zero_point, multiplier, shift);
+            input_data += k * n;
+            output_data += m * n;
+        }
+    }
+    shl_mem_free(multiplier);
+    shl_mem_free(shift);
+    shl_mem_free(pb_reorder);
+    shl_mem_free(input_ncxhwx);
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_1x1_int8_packn.c b/source/c908_opt/convolution_1x1_int8_packn.c
new file mode 100644
index 00000000..a89159cd
--- /dev/null
+++ b/source/c908_opt/convolution_1x1_int8_packn.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/* Kernel reorder for the packn 1x1 path: delegates to the im2col/gemm packn reorder. */
+void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel,
+                                                       struct csinn_conv2d_params *params)
+{
+    shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params);
+}
+
+/*
+ * 1x1 int8 convolution for the packn path, executed as one GEMM per (batch, group).
+ *
+ * Per group: the requantization multiplier/shift of each output channel is copied
+ * from kernel->qinfo, the input slice is packed by shl_rvv_reorder_input_z12_packn_int8
+ * and shl_c908_ncxhwx_gemm_12xpackn_int8 writes the result straight into output->data.
+ * The reordered weights come from params->conv_extra.kernel_tm.
+ *
+ * Always returns CSINN_TRUE.
+ */
+int shl_c908_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                       struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                       struct csinn_conv2d_params *params)
+{
+    int8_t *src = (int8_t *)input->data;
+    int8_t *dst = (int8_t *)output->data;
+    int8_t *weights = (int8_t *)params->conv_extra.kernel_tm->data;
+    int32_t *bias_data = (int32_t *)bias->data;
+
+    const int32_t group = params->group;
+    const int32_t batch = input->dim[0];
+
+    /* dimensions handed to the gemm helper for a single group */
+    const int32_t m = kernel->dim[0] / group;
+    const int32_t k = input->dim[1] / group;
+    const int32_t n = output->dim[2] * output->dim[3];
+
+    int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t));
+    int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+    int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+
+    for (int b = 0; b < batch; b++) {
+        int q = 0; /* running index into kernel->qinfo, restarted for each batch */
+
+        for (int g = 0; g < group; g++) {
+            if (kernel->quant_channel > 1) {
+                /* per-channel quantization: consume m consecutive qinfo entries */
+                for (int c = 0; c < m; c++) {
+                    multiplier[c] = kernel->qinfo[q].multiplier;
+                    shift[c] = kernel->qinfo[q].shift;
+                    q++;
+                }
+            } else if (kernel->quant_channel == 1) {
+                /* per-tensor quantization: entry 0 is shared by every channel */
+                for (int c = 0; c < m; c++) {
+                    multiplier[c] = kernel->qinfo[0].multiplier;
+                    shift[c] = kernel->qinfo[0].shift;
+                }
+            }
+
+            shl_rvv_reorder_input_z12_packn_int8(src, pb_reorder, k, n, n);
+
+            /* bias_data != NULL with fusing zp to bias */
+            shl_c908_ncxhwx_gemm_12xpackn_int8(dst, weights + g * m * k, pb_reorder,
+                                               bias_data + g * m, m, k, n,
+                                               output->qinfo->zero_point, multiplier, shift);
+
+            src += k * n;
+            dst += m * n;
+        }
+    }
+
+    shl_mem_free(pb_reorder);
+    shl_mem_free(multiplier);
+    shl_mem_free(shift);
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_1x1_int8_packnto1.c b/source/c908_opt/convolution_1x1_int8_packnto1.c
new file mode 100644
index 00000000..c997c0a8
--- /dev/null
+++ b/source/c908_opt/convolution_1x1_int8_packnto1.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/* Kernel reorder for the packnto1 1x1 path: delegates to the im2col/gemm packnto1 reorder. */
+void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel,
+                                                          struct csinn_conv2d_params *params)
+{
+    shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params);
+}
+
+/*
+ * 1x1 int8 convolution for the packnto1 path.
+ *
+ * Each (batch, group) step packs the input slice, runs the 12xpackn gemm into a
+ * temporary buffer and then converts that buffer into output->data with
+ * shl_rvv_reorder_input_packnto1_int8. Requantization parameters are taken from
+ * kernel->qinfo, the reordered weights from params->conv_extra.kernel_tm.
+ *
+ * Always returns CSINN_TRUE.
+ */
+int shl_c908_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                          struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                          struct csinn_conv2d_params *params)
+{
+    int8_t *in_cursor = (int8_t *)input->data;
+    int8_t *out_cursor = (int8_t *)output->data;
+    int8_t *weights = (int8_t *)params->conv_extra.kernel_tm->data;
+    int32_t *bias_data = (int32_t *)bias->data;
+
+    const int32_t group = params->group;
+    const int32_t batch = input->dim[0];
+    const int32_t out_h = output->dim[2];
+    const int32_t out_w = output->dim[3];
+
+    /* dimensions handed to the gemm helper for a single group */
+    const int32_t m = kernel->dim[0] / group;
+    const int32_t k = input->dim[1] / group;
+    const int32_t n = out_h * out_w;
+
+    int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t));
+    int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+    int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+
+    /* gemm result of one group before it is converted into output->data */
+    int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t));
+
+    for (int b = 0; b < batch; b++) {
+        int q = 0; /* running index into kernel->qinfo, restarted for each batch */
+
+        for (int g = 0; g < group; g++) {
+            const int per_channel = kernel->quant_channel > 1;
+
+            if (per_channel || kernel->quant_channel == 1) {
+                for (int c = 0; c < m; c++) {
+                    /* per-channel: next qinfo entry; per-tensor: always entry 0 */
+                    const int idx = per_channel ? q++ : 0;
+                    multiplier[c] = kernel->qinfo[idx].multiplier;
+                    shift[c] = kernel->qinfo[idx].shift;
+                }
+            }
+
+            shl_rvv_reorder_input_z12_packn_int8(in_cursor, pb_reorder, k, n, n);
+
+            /* bias_data != NULL with fusing zp to bias */
+            shl_c908_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, weights + g * m * k, pb_reorder,
+                                               bias_data + g * m, m, k, n,
+                                               output->qinfo->zero_point, multiplier, shift);
+
+            shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, out_cursor, m, out_h, out_w);
+
+            in_cursor += k * n;
+            out_cursor += m * n;
+        }
+    }
+
+    shl_mem_free(pb_reorder);
+    shl_mem_free(multiplier);
+    shl_mem_free(shift);
+    shl_mem_free(output_ncxhwx);
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_3x3_fp16.c b/source/c908_opt/convolution_3x3_fp16.c
new file mode 100644
index 00000000..aba2070c
--- /dev/null
+++ b/source/c908_opt/convolution_3x3_fp16.c
@@ -0,0 +1,2834 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/*************************************************************
+    note: VLEN = 128
+*************************************************************/
+
+/******************************************************************************************
+ * Pad the input for the winograd input transform and change the memory layout.
+ * input layout: [n c h w]
+ * input_padded layout: [n, c/8, h, w, 8]
+ * constraint: input channel % 8 = 0
+ *
+ * input:        planar source, channel stride is inh * inw elements
+ * input_padded: destination, written sequentially one packed pixel (vl values) at a time
+ * inc:          channel count; only full groups of packn channels are processed
+ * padded_h/w:   size of the padded plane; bottom/right padding is whatever remains
+ *               after pad_top + inh rows and pad_left + inw columns
+ ******************************************************************************************/
+static void winograd_pad_input_pack1to8_fp16(const __fp16 *input, __fp16 *input_padded, int inc,
+                                             int inh, int inw, int padded_h, int padded_w,
+                                             int pad_top, int pad_left)
+{
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int vl = vsetvl_e16m1(packn);
+
+    int padded_hw = padded_h * padded_w;  // NOTE(review): not used below
+    const int in_size = inh * inw;  // per-channel size
+
+    __fp16 *pad_ptr = input_padded;
+    __fp16 *inp_ptr = (__fp16 *)input;
+    int pad_down = padded_h - pad_top - inh;    // remain to pad on h (pad_down)
+    int pad_right = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+
+    vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl);
+
+    int c = 0;
+    for (; c + packn - 1 < inc; c += packn) {
+        inp_ptr = (__fp16 *)input + c * in_size;
+        // pad h_top
+        for (int i = 0; i < pad_top * padded_w; i++) {
+            vse16_v_f16m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+        // pad h_mid
+        for (int i = 0; i < inh; i++) {
+            // pad w_left
+            for (int j = 0; j < pad_left; j++) {
+                vse16_v_f16m1(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+            // pad w_mid
+            for (int j = 0; j < inw; j++) {
+                // strided load: the same pixel from vl consecutive channel planes
+                vfloat16m1_t _tmp = vlse16_v_f16m1(inp_ptr, in_size * sizeof(__fp16), vl);
+                inp_ptr++;
+                vse16_v_f16m1(pad_ptr, _tmp, vl);
+                pad_ptr += vl;
+            }
+            // pad w_end
+            for (int j = 0; j < pad_right; j++) {
+                vse16_v_f16m1(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+        }
+        // pad h_bottom
+        for (int i = 0; i < pad_down * padded_w; i++) {
+            vse16_v_f16m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+    }
+}
+
+/******************************************************************************************
+ * Crop the winograd output transform result to the output size and change the layout.
+ * winograd output transform layout: [n, c/8, h, w, 8]
+ * output layout: [n, c, h, w]
+ * constraint: output channel % 8 = 0
+ *
+ * output_trans: packed source whose planes are wino_h x wino_w packed pixels
+ * output:       planar destination, channel stride is out_h * out_w elements
+ * Only the first out_h rows and the first out_w pixels of each row are copied.
+ * Only full groups of packn channels are processed.
+ ******************************************************************************************/
+static void winograd_crop_output_pack8to1_fp16(const __fp16 *output_trans, __fp16 *output,
+                                               int out_c, int out_h, int out_w, int wino_h,
+                                               int wino_w)
+{
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int vl = vsetvl_e16m1(packn);
+    const int out_size = out_h * out_w;  // per-channel size
+    const int crop_size = wino_h * wino_w;
+
+    __fp16 *out_tm_ptr = (__fp16 *)output_trans;
+    __fp16 *out_ptr = output;
+
+    int c = 0;
+    for (; c + packn - 1 < out_c; c += packn) {
+        out_tm_ptr = (__fp16 *)output_trans + c * crop_size;
+        out_ptr = output + c * out_size;
+
+        for (int h = 0; h < out_h; h++) {
+            // start of row h inside the (wider) transformed plane
+            __fp16 *crop_ptr = out_tm_ptr + h * wino_w * vl;
+            for (int w = 0; w < out_w; w++) {
+                vfloat16m1_t _tmp = vle16_v_f16m1(crop_ptr, vl);
+                crop_ptr += vl;
+                // strided store: scatter the vl lanes into vl consecutive channel planes
+                vsse16_v_f16m1(out_ptr, out_size * sizeof(__fp16), _tmp, vl);
+                out_ptr++;
+            }
+        }
+    }
+}
+
+/******************************************************************************************
+ * Same crop as winograd_crop_output_pack8to1_fp16, but working on groups of
+ * pack2n = 2 * (vlenb / sizeof(__fp16)) channels with LMUL = 2 loads and stores.
+ * Only full groups of pack2n channels are processed.
+ ******************************************************************************************/
+static void winograd_crop_output_pack16to1_fp16(const __fp16 *output_trans, __fp16 *output,
+                                                int out_c, int out_h, int out_w, int wino_h,
+                                                int wino_w)
+{
+    const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2;
+    const int vl = vsetvl_e16m2(pack2n);
+    const int out_size = out_h * out_w;  // per-channel size
+    const int crop_size = wino_h * wino_w;
+
+    __fp16 *out_tm_ptr = (__fp16 *)output_trans;
+    __fp16 *out_ptr = output;
+
+    int c = 0;
+    for (; c + pack2n - 1 < out_c; c += pack2n) {
+        out_tm_ptr = (__fp16 *)output_trans + c * crop_size;
+        out_ptr = output + c * out_size;
+
+        for (int h = 0; h < out_h; h++) {
+            // start of row h inside the (wider) transformed plane
+            __fp16 *crop_ptr = out_tm_ptr + h * wino_w * vl;
+            for (int w = 0; w < out_w; w++) {
+                vfloat16m2_t _tmp = vle16_v_f16m2(crop_ptr, vl);
+                crop_ptr += vl;
+                // strided store: scatter the vl lanes into vl consecutive channel planes
+                vsse16_v_f16m2(out_ptr, out_size * sizeof(__fp16), _tmp, vl);
+                out_ptr++;
+            }
+        }
+    }
+}
+
+/******************************************************************************************
+ * Winograd input transform on 6x6 tiles of a packed, padded input.
+ *
+ * src:   padded input, h x w packed pixels per channel group
+ * dst:   transformed input; each channel group owns 36 planes of `tiles` packed
+ *        pixels, one plane per position of the transformed 6x6 tile
+ * ch:    channel count; only full groups of packn channels are processed
+ * blk_h, blk_w: number of tiles vertically / horizontally; tiles start every 4 pixels
+ *
+ * The transform is applied in two passes that use the same six linear combinations:
+ * first on every row of the tile (results stored transposed in tmp), then on every
+ * row of tmp.
+ ******************************************************************************************/
+static inline void wg_b4f3s1_trans_input_pack8_fp16(const __fp16 *src, __fp16 *dst, int ch, int h,
+                                                    int w, int blk_h, int blk_w)
+{
+    /* input transform matrix
+    BT = {
+        { 4  0 -5  0  1  0 };
+        { 0 -4 -4  1  1  0 };
+        { 0  4 -4 -1  1  0 };
+        { 0 -2 -1  2  1  0 };
+        { 0  2 -1 -2  1  0 };
+        { 0  4  0 -5  0  1 }
+    };
+    */
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int vl = vsetvl_e16m1(packn);
+    int tiles = blk_h * blk_w;
+    for (int q = 0; q + packn - 1 < ch; q += packn) {
+        const __fp16 *img0 = src + q * h * w;    // feature map after padding - q channel
+        __fp16 *img0_tm = dst + q * 36 * tiles;  // transform and interleave - q channel
+
+        // intermediate of the first pass, indexed [combination][row][lane]
+        __fp16 tmp[6][6][packn];
+
+        for (int i = 0; i < blk_h; i++) {
+            for (int j = 0; j < blk_w; j++) {
+                // after padding 6*6 start addr
+                const __fp16 *r0 = img0 + (i * w * 4 + j * 4) * packn;
+                // input_tm1 6*6 block start addr
+                __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn;
+
+                // pass 1: combine the six pixels of each tile row
+                for (int m = 0; m < 6; m++) {
+                    vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl);
+                    vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl);
+                    vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl);
+                    vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl);
+                    vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl);
+                    vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl);
+
+                    vfloat16m1_t _tmp0m =
+                        vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl);
+                    vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r04, _r03, vl), -4.f,
+                                                          vfadd_vv_f16m1(_r01, _r02, vl), vl);
+                    vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r03, vl), 4.f,
+                                                          vfsub_vv_f16m1(_r01, _r02, vl), vl);
+                    vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), -2.f,
+                                                          vfsub_vv_f16m1(_r01, _r03, vl), vl);
+                    vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), 2.f,
+                                                          vfsub_vv_f16m1(_r01, _r03, vl), vl);
+                    vfloat16m1_t _tmp5m =
+                        vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl);
+
+                    vse16_v_f16m1(tmp[0][m], _tmp0m, vl);
+                    vse16_v_f16m1(tmp[1][m], _tmp1m, vl);
+                    vse16_v_f16m1(tmp[2][m], _tmp2m, vl);
+                    vse16_v_f16m1(tmp[3][m], _tmp3m, vl);
+                    vse16_v_f16m1(tmp[4][m], _tmp4m, vl);
+                    vse16_v_f16m1(tmp[5][m], _tmp5m, vl);
+                    r0 += w * packn;  // next row of the padded input
+                }
+
+                // pass 2: same combinations over tmp; six results per iteration, each
+                // written to its own plane (planes are tiles * packn elements apart)
+                for (int m = 0; m < 6; m++) {
+                    __fp16 *r0_tm0 = r0_tm;
+                    __fp16 *r0_tm1 = r0_tm0 + tiles * packn;
+                    __fp16 *r0_tm2 = r0_tm1 + tiles * packn;
+                    __fp16 *r0_tm3 = r0_tm2 + tiles * packn;
+                    __fp16 *r0_tm4 = r0_tm3 + tiles * packn;
+                    __fp16 *r0_tm5 = r0_tm4 + tiles * packn;
+
+                    vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl);
+                    vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl);
+                    vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl);
+                    vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl);
+                    vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl);
+                    vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl);
+
+                    vfloat16m1_t _r0tm0 =
+                        vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl);
+                    vfloat16m1_t _r0tm1 = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp04, _tmp03, vl), -4.f,
+                                                          vfadd_vv_f16m1(_tmp01, _tmp02, vl), vl);
+                    vfloat16m1_t _r0tm2 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp03, vl), 4.f,
+                                                          vfsub_vv_f16m1(_tmp01, _tmp02, vl), vl);
+                    vfloat16m1_t _r0tm3 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), -2.f,
+                                                          vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl);
+                    vfloat16m1_t _r0tm4 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), 2.f,
+                                                          vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl);
+                    vfloat16m1_t _r0tm5 =
+                        vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl);
+
+                    vse16_v_f16m1(r0_tm0, _r0tm0, vl);
+                    vse16_v_f16m1(r0_tm1, _r0tm1, vl);
+                    vse16_v_f16m1(r0_tm2, _r0tm2, vl);
+                    vse16_v_f16m1(r0_tm3, _r0tm3, vl);
+                    vse16_v_f16m1(r0_tm4, _r0tm4, vl);
+                    vse16_v_f16m1(r0_tm5, _r0tm5, vl);
+                    r0_tm += tiles * packn * 6;  // skip the six planes just written
+                }
+            }
+        }
+    }
+}
+
+// TODO: remove
useless code for unsatisfactory performance
+/******************************************************************************************
+ * Winograd output transform: turns every transformed 6x6 tile into a 4x4 output
+ * tile and adds the bias.
+ *
+ * src:   gemm result; each channel group owns 36 planes of `tiles` packed pixels
+ * bias:  per-channel bias, may be NULL (treated as 0)
+ * dst:   packed output of (4 * blk_h) x (4 * blk_w) pixels per channel group
+ * ch:    channel count; only full groups of packn channels are processed
+ * blk_h, blk_w: number of tiles vertically / horizontally
+ *
+ * The transform runs in two passes using the same four linear combinations: first
+ * over the six planes of each tile row group (results in tmp), then over tmp.
+ ******************************************************************************************/
+static inline void wg_b4f3s1_trans_output_pack8_fp16(const __fp16 *src, const __fp16 *bias,
+                                                     __fp16 *dst, int ch, int blk_h, int blk_w)
+{
+    /* output transform matrix
+    AT = {
+        { 1  1  1  1  1  0 },
+        { 0  1 -1  2 -2  0 },
+        { 0  1  1  4  4  0 },
+        { 0  1 -1  8 -8  1 }
+    };
+    */
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int vl = vsetvl_e16m1(packn);
+    int tiles = blk_h * blk_w;
+    for (int p = 0; p + packn - 1 < ch; p += packn) {
+        const __fp16 *out0_tm = src + p * 36 * tiles;    // before output transform (after dot), channel p
+        __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w;  // output after transform, channel p
+
+        // intermediate of the first pass, indexed [combination][step][lane]
+        __fp16 tmp[4][6][packn];
+
+        vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl);
+
+        for (int i = 0; i < blk_h; i++) {
+            for (int j = 0; j < blk_w; j++) {
+                // 6*6 start address; one tile occupies packn elements per plane, so the
+                // offset is scaled by packn like every other stride in this function
+                // (the former literal 8 equals packn only when VLEN = 128)
+                const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn;
+                const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1;
+                const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2;
+                const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3;
+                const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4;
+                const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5;
+
+                __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn;  // out 4*4 addr
+
+                // pass 1: six planes in, four combinations out, six times
+                for (int m = 0; m < 6; m++) {
+                    vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl);
+                    vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl);
+                    vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl);
+                    vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl);
+                    vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl);
+                    vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl);
+
+                    vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl);
+                    vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_r01, _r02, vl);
+
+                    vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl);
+                    vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl);
+
+                    vfloat16m1_t _tmp0m =
+                        vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl);
+                    vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl);
+                    vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl);
+                    vfloat16m1_t _tmp3m =
+                        vfmacc_vf_f16m1(vfadd_vv_f16m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl);
+
+                    vse16_v_f16m1(tmp[0][m], _tmp0m, vl);
+                    vse16_v_f16m1(tmp[1][m], _tmp1m, vl);
+                    vse16_v_f16m1(tmp[2][m], _tmp2m, vl);
+                    vse16_v_f16m1(tmp[3][m], _tmp3m, vl);
+
+                    // advance all six pointers to the next group of six planes
+                    output0_tm_0 += tiles * packn * 6;
+                    output0_tm_1 += tiles * packn * 6;
+                    output0_tm_2 += tiles * packn * 6;
+                    output0_tm_3 += tiles * packn * 6;
+                    output0_tm_4 += tiles * packn * 6;
+                    output0_tm_5 += tiles * packn * 6;
+                }
+
+                // pass 2: same combinations over tmp, add bias, store one output row
+                for (int m = 0; m < 4; m++) {
+                    vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl);
+                    vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl);
+                    vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl);
+                    vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl);
+                    vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl);
+                    vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl);
+
+                    vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_tmp01, _tmp02, vl);
+                    vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_tmp01, _tmp02, vl);
+
+                    vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_tmp03, _tmp04, vl);
+                    vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_tmp03, _tmp04, vl);
+
+                    vfloat16m1_t _out00 =
+                        vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp02a, vl), _tmp02b, vl);
+                    vfloat16m1_t _out01 = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl);
+                    vfloat16m1_t _out02 = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl);
+                    vfloat16m1_t _out03 =
+                        vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl);
+
+                    _out00 = vfadd_vv_f16m1(_bias, _out00, vl);
+                    _out01 = vfadd_vv_f16m1(_bias, _out01, vl);
+                    _out02 = vfadd_vv_f16m1(_bias, _out02, vl);
+                    _out03 = vfadd_vv_f16m1(_bias, _out03, vl);
+
+                    vse16_v_f16m1(output0, _out00, vl);
+                    vse16_v_f16m1(output0 + packn * 1, _out01, vl);
+                    vse16_v_f16m1(output0 + packn * 2, _out02, vl);
+                    vse16_v_f16m1(output0 + packn * 3, _out03, vl);
+
+                    output0 += blk_w * 4 * packn;  // next output row
+                }
+            }
+        }
+    }
+}
+
+// TODO: remove useless
code for unsatisfactory performance
+/******************************************************************************************
+ * Reorder the transformed winograd input for the batch gemm.
+ *
+ * src:   transformed input; for every group of 8 channels there are `area` planes
+ *        of `tiles` packed pixels (8 values each)
+ * dst:   reordered input; plane r starts at dst + r * tiles * ch
+ * ch:    channel count, consumed in groups of 8 (ch / 8 iterations)
+ * tiles: number of tiles per plane
+ * area:  number of planes per channel group
+ *
+ * Within a plane the tiles are consumed in blocks of 16, 8, 4, 2 and finally 1.
+ * For each block the data is rewritten so that the values of one channel for all
+ * tiles of the block are contiguous. The vector length is fixed to 8 elements.
+ ******************************************************************************************/
+static inline void wg_bxf3s1_reorder_input_tile16_fp16(const __fp16 *src, __fp16 *dst, int ch,
+                                                       int tiles, int area)
+{
+    int vl = vsetvl_e16m1(8);
+    for (int r = 0; r < area; r++) {
+        __fp16 *img_tm2 = dst + r * tiles * ch;  // input_tm2 r channel data
+
+        int t = 0;
+        // blocks of 16 tiles
+        for (; t + 15 < tiles; t += 16) {
+            const __fp16 *tm1 = src;
+
+            tm1 += (r * tiles + t) * 8;
+            for (int q = 0; q < ch / 8; q++) {
+                vfloat16m1_t _b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7;
+                vfloat16m1_t _b8, _b9, _b10, _b11, _b12, _b13, _b14, _b15;
+
+                // segment loads de-interleave 8 tiles x 8 channels: _bK holds channel K
+                // of tiles t..t+7, _b(K+8) holds channel K of tiles t+8..t+15
+                vlseg8e16_v_f16m1(&_b0, &_b1, &_b2, &_b3, &_b4, &_b5, &_b6, &_b7, tm1, vl);
+                vlseg8e16_v_f16m1(&_b8, &_b9, &_b10, &_b11, &_b12, &_b13, &_b14, &_b15, tm1 + 64,
+                                  vl);
+
+                // store channel by channel, 16 tiles per channel
+                vse16_v_f16m1(img_tm2, _b0, vl);
+                img_tm2 += vl;  // += 8
+                vse16_v_f16m1(img_tm2, _b8, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b1, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b9, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b2, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b10, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b3, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b11, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b4, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b12, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b5, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b13, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b6, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b14, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b7, vl);
+                img_tm2 += vl;
+                vse16_v_f16m1(img_tm2, _b15, vl);
+                img_tm2 += vl;
+
+                tm1 += area * tiles * 8;  // same plane/tile in the next channel group
+                // img_tm2 += 16 * 8;
+            }
+        }
+        // blocks of 8 tiles: plain loads, interleaving segment store
+        for (; t + 7 < tiles; t += 8) {
+            const __fp16 *tm1 = src;
+            tm1 += (r * tiles + t) * 8;
+            for (int q = 0; q < ch / 8; q++) {
+                vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl);
+                vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + 8 * 1, vl);
+                vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + 8 * 2, vl);
+                vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + 8 * 3, vl);
+                vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + 8 * 4, vl);
+                vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + 8 * 5, vl);
+                vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + 8 * 6, vl);
+                vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + 8 * 7, vl);
+
+                vsseg8e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7,
+                                  vl);
+                tm1 += area * tiles * 8;
+                img_tm2 += 8 * 8;
+            }
+        }
+        // blocks of 4 tiles
+        for (; t + 3 < tiles; t += 4) {
+            const __fp16 *tm1 = src;
+            tm1 += (r * tiles + t) * 8;
+            for (int q = 0; q < ch / 8; q++) {
+                vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl);
+                vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + 8 * 1, vl);
+                vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + 8 * 2, vl);
+                vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + 8 * 3, vl);
+
+                vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl);
+                tm1 += area * tiles * 8;
+                img_tm2 += 4 * 8;
+            }
+        }
+        // blocks of 2 tiles
+        for (; t + 1 < tiles; t += 2) {
+            const __fp16 *tm1 = src;
+            tm1 += (r * tiles + t) * 8;
+            for (int q = 0; q < ch / 8; q++) {
+                vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl);
+                vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + 8 * 1, vl);
+
+                vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl);
+                tm1 += area * tiles * 8;
+                img_tm2 += 2 * 8;
+            }
+        }
+        // remaining single tiles: straight copy
+        for (; t < tiles; t++) {
+            const __fp16 *tm1 = src;
+            tm1 += (r * tiles + t) * 8;
+            for (int q = 0; q < ch / 8; q++) {
+                vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl);
+
+                vse16_v_f16m1(img_tm2, _tmp0, vl);
+                tm1 += area * tiles * 8;
+                img_tm2 += 1 * 8;
+            }
+        }
+    }
+}
+
+// TODO: remove useless code for unsatisfactory performance
+static inline void wg_bxf3s1_batch_gemm_m8n16_fp16(const __fp16 *input, const __fp16 *kernel,
+                                                   __fp16 *output, int in_ch, int out_ch, int tiles,
+                                                   int area)
+{
+    for (int p = 0; p + 7 < out_ch; p += 8) {
+        __fp16 *output0_tm = output + p * area * tiles;        // 8 channel dot output
+        const __fp16 *kernel0_tm = kernel + p * area * in_ch;  // 8 channel kernel
+
+        for (int r = 0; r < area; r++) {
+            const __fp16 *img0 = input + r * tiles * in_ch;  // r-th channel of img_tm2
+
+            int t = 0;
+            for (; t + 15 < tiles; t += 16) {
+                const
__fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + "flh fa4, 8(%[input_ptr])\n\t" + "flh fa5, 10(%[input_ptr])\n\t" + "flh fa6, 12(%[input_ptr])\n\t" + "flh fa7, 14(%[input_ptr])\n\t" + + "1:\n\t" // m8n16k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v16, fa0, v2\n\t" + "flh ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v17, fa1, v2\n\t" + "flh ft1, 18(%[input_ptr])\n\t" + "vfmacc.vf v18, fa2, v2\n\t" + "flh ft2, 20(%[input_ptr])\n\t" + "vfmacc.vf v19, fa3, v2\n\t" + "flh ft3, 22(%[input_ptr])\n\t" + "vfmacc.vf v20, fa4, v2\n\t" + "flh ft4, 24(%[input_ptr])\n\t" + "vfmacc.vf v21, fa5, v2\n\t" + "flh ft5, 26(%[input_ptr])\n\t" + "vfmacc.vf v22, fa6, v2\n\t" + "flh ft6, 28(%[input_ptr])\n\t" + "vfmacc.vf v23, fa7, v2\n\t" + "flh ft7, 30(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flh fa0, 32(%[input_ptr])\n\t" + "vfmacc.vf v25, ft1, v2\n\t" + "flh fa1, 34(%[input_ptr])\n\t" + "vfmacc.vf v26, ft2, v2\n\t" + "flh fa2, 36(%[input_ptr])\n\t" + "vfmacc.vf v27, ft3, v2\n\t" + "flh fa3, 38(%[input_ptr])\n\t" + "vfmacc.vf v28, ft4, v2\n\t" + "flh fa4, 40(%[input_ptr])\n\t" + "vfmacc.vf v29, 
ft5, v2\n\t" + "flh fa5, 42(%[input_ptr])\n\t" + "vfmacc.vf v30, ft6, v2\n\t" + "flh fa6, 44(%[input_ptr])\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 46(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 50(%[input_ptr])\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 52(%[input_ptr])\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 54(%[input_ptr])\n\t" + "vfmacc.vf v20, fa4, v4\n\t" + "flh ft4, 56(%[input_ptr])\n\t" + "vfmacc.vf v21, fa5, v4\n\t" + "flh ft5, 58(%[input_ptr])\n\t" + "vfmacc.vf v22, fa6, v4\n\t" + "flh ft6, 60(%[input_ptr])\n\t" + "vfmacc.vf v23, fa7, v4\n\t" + "flh ft7, 62(%[input_ptr])\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // input_ptr += 32 + + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v25, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v26, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v27, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + "vfmacc.vf v28, ft4, v4\n\t" + "flh fa4, 8(%[input_ptr])\n\t" + "vfmacc.vf v29, ft5, v4\n\t" + "flh fa5, 10(%[input_ptr])\n\t" + "vfmacc.vf v30, ft6, v4\n\t" + "flh fa6, 12(%[input_ptr])\n\t" + "vfmacc.vf v31, ft7, v4\n\t" + "flh fa7, 14(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v17, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v19, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v21, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + 
"vse16.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v23, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v25, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v27, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v29, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", + "ft3", "ft4", "ft5", "ft6", "ft7", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + "flh fa4, 8(%[input_ptr])\n\t" + "flh fa5, 10(%[input_ptr])\n\t" + "flh 
fa6, 12(%[input_ptr])\n\t" + "flh fa7, 14(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v24, fa0, v2\n\t" + "flh ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v25, fa1, v2\n\t" + "flh ft1, 18(%[input_ptr])\n\t" + "vfmacc.vf v26, fa2, v2\n\t" + "flh ft2, 20(%[input_ptr])\n\t" + "vfmacc.vf v27, fa3, v2\n\t" + "flh ft3, 22(%[input_ptr])\n\t" + "vfmacc.vf v28, fa4, v2\n\t" + "flh ft4, 24(%[input_ptr])\n\t" + "vfmacc.vf v29, fa5, v2\n\t" + "flh ft5, 26(%[input_ptr])\n\t" + "vfmacc.vf v30, fa6, v2\n\t" + "flh ft6, 28(%[input_ptr])\n\t" + "vfmacc.vf v31, fa7, v2\n\t" + "flh ft7, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v25, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v26, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v27, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + "vfmacc.vf v28, ft4, v4\n\t" + "flh fa4, 8(%[input_ptr])\n\t" + "vfmacc.vf v29, ft5, v4\n\t" + "flh fa5, 10(%[input_ptr])\n\t" + "vfmacc.vf v30, ft6, v4\n\t" + "flh fa6, 12(%[input_ptr])\n\t" + "vfmacc.vf v31, ft7, v4\n\t" + "flh fa7, 14(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v25, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v27, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v29, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 
16\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", + "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v28, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v29, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v30, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v31, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v28, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v29, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v30, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v31, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], 
%[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v29, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n2k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v30, fa0, v2\n\t" + "flh ft0, 4(%[input_ptr])\n\t" + "vfmacc.vf v31, fa1, v2\n\t" + "flh ft1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v30, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v31, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), 
[output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "fa1", "ft0", + "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n1k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v31, fa0, v2\n\t" + "flh ft0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v31, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v31", "fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_b6f3s1_trans_input_pack8_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const 
__fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + __fp16 tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 8*8 start addr + const __fp16 *r0 = img0 + (i * w * 6 + j * 6) * packn; + // input_tm1 8*8 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); + + vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f16m1(_r04, _r02, vl), vl); + vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f16m1(_r03, _r05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = + vfmacc_vf_f16m1(_r06, 4.f, vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + 
vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[7][m], _tmp7m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + vse16_v_f16m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + __fp16 *r0_tm6 = r0_tm5 + tiles * packn; + __fp16 *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); + vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp06, 
0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = vfmacc_vf_f16m1( + _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm7, _r0tm7, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + vse16_v_f16m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +// TODO: remove useless code for unsatisfactory performance +static inline void wg_b6f3s1_trans_output_pack8_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[6][8][packn]; + + vfloat16m1_t _bias = bias ? 
vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + const __fp16 *output0_tm_6 = output0_tm_0 + tiles * packn * 6; + const __fp16 *output0_tm_7 = output0_tm_0 + tiles * packn * 7; + + __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 
2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _tmp5m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * packn * 8; + output0_tm_1 += tiles * packn * 8; + output0_tm_2 += tiles * packn * 8; + output0_tm_3 += tiles * packn * 8; + output0_tm_4 += tiles * packn * 8; + output0_tm_5 += tiles * packn * 8; + output0_tm_6 += tiles * packn * 8; + output0_tm_7 += tiles * packn * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); + + vfloat16m1_t _output00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _output02 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _output04 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, 
_tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _output01 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _output03 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _output05 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f16m1(_bias, _output00, vl); + _output01 = vfadd_vv_f16m1(_bias, _output01, vl); + _output02 = vfadd_vv_f16m1(_bias, _output02, vl); + _output03 = vfadd_vv_f16m1(_bias, _output03, vl); + _output04 = vfadd_vv_f16m1(_bias, _output04, vl); + _output05 = vfadd_vv_f16m1(_bias, _output05, vl); + + vse16_v_f16m1(output0, _output00, vl); + vse16_v_f16m1(output0 + packn * 2, _output02, vl); + vse16_v_f16m1(output0 + packn * 4, _output04, vl); + vse16_v_f16m1(output0 + packn * 1, _output01, vl); + vse16_v_f16m1(output0 + packn * 3, _output03, vl); + vse16_v_f16m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_pack16_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2; + const int vl = vsetvl_e16m2(pack2n); + int tiles = blk_h * blk_w; + for (int p = 0; p + pack2n - 1 < ch; p += pack2n) { + const __fp16 *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[4][6][pack2n]; + + vfloat16m2_t _bias = bias ? 
vle16_v_f16m2(bias + p, vl) : vfmv_v_f_f16m2(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * pack2n; // 6*6 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * pack2n * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * pack2n * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * pack2n * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * pack2n * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * pack2n * 5; + + __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * pack2n; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat16m2_t _r00 = vle16_v_f16m2(output0_tm_0, vl); + vfloat16m2_t _r01 = vle16_v_f16m2(output0_tm_1, vl); + vfloat16m2_t _r02 = vle16_v_f16m2(output0_tm_2, vl); + vfloat16m2_t _r03 = vle16_v_f16m2(output0_tm_3, vl); + vfloat16m2_t _r04 = vle16_v_f16m2(output0_tm_4, vl); + vfloat16m2_t _r05 = vle16_v_f16m2(output0_tm_5, vl); + + vfloat16m2_t _tmp02a = vfadd_vv_f16m2(_r01, _r02, vl); + vfloat16m2_t _tmp13a = vfsub_vv_f16m2(_r01, _r02, vl); + + vfloat16m2_t _tmp02b = vfadd_vv_f16m2(_r03, _r04, vl); + vfloat16m2_t _tmp13b = vfsub_vv_f16m2(_r03, _r04, vl); + + vfloat16m2_t _tmp0m = + vfadd_vv_f16m2(vfadd_vv_f16m2(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat16m2_t _tmp1m = vfmacc_vf_f16m2(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m2_t _tmp2m = vfmacc_vf_f16m2(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m2_t _tmp3m = + vfmacc_vf_f16m2(vfadd_vv_f16m2(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse16_v_f16m2(tmp[0][m], _tmp0m, vl); + vse16_v_f16m2(tmp[1][m], _tmp1m, vl); + vse16_v_f16m2(tmp[2][m], _tmp2m, vl); + vse16_v_f16m2(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * pack2n * 6; + output0_tm_1 += tiles * pack2n * 6; + output0_tm_2 += tiles * pack2n * 6; + output0_tm_3 += tiles * pack2n * 6; + output0_tm_4 += tiles * pack2n * 6; + output0_tm_5 += tiles * pack2n * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat16m2_t _tmp00 = 
vle16_v_f16m2(tmp[m][0], vl); + vfloat16m2_t _tmp01 = vle16_v_f16m2(tmp[m][1], vl); + vfloat16m2_t _tmp02 = vle16_v_f16m2(tmp[m][2], vl); + vfloat16m2_t _tmp03 = vle16_v_f16m2(tmp[m][3], vl); + vfloat16m2_t _tmp04 = vle16_v_f16m2(tmp[m][4], vl); + vfloat16m2_t _tmp05 = vle16_v_f16m2(tmp[m][5], vl); + + vfloat16m2_t _tmp02a = vfadd_vv_f16m2(_tmp01, _tmp02, vl); + vfloat16m2_t _tmp13a = vfsub_vv_f16m2(_tmp01, _tmp02, vl); + + vfloat16m2_t _tmp02b = vfadd_vv_f16m2(_tmp03, _tmp04, vl); + vfloat16m2_t _tmp13b = vfsub_vv_f16m2(_tmp03, _tmp04, vl); + + vfloat16m2_t _out00 = vfadd_vv_f16m2( + _bias, vfadd_vv_f16m2(vfadd_vv_f16m2(_tmp00, _tmp02a, vl), _tmp02b, vl), + vl); + vfloat16m2_t _out01 = + vfadd_vv_f16m2(_bias, vfmacc_vf_f16m2(_tmp13a, 2.f, _tmp13b, vl), vl); + vfloat16m2_t _out02 = + vfadd_vv_f16m2(_bias, vfmacc_vf_f16m2(_tmp02a, 4.f, _tmp02b, vl), vl); + vfloat16m2_t _out03 = vfadd_vv_f16m2( + _bias, + vfmacc_vf_f16m2(vfadd_vv_f16m2(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl), vl); + + vse16_v_f16m2(output0, _out00, vl); + vse16_v_f16m2(output0 + pack2n * 1, _out01, vl); + vse16_v_f16m2(output0 + pack2n * 2, _out02, vl); + vse16_v_f16m2(output0 + pack2n * 3, _out03, vl); + + output0 += blk_w * 4 * pack2n; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_fp16(const __fp16 *src, __fp16 *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + __fp16 *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + 
vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + + vse16_v_f16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + __fp16 *output0_tm = output + p * area * tiles; // 8 channel dot output + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = 
kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flh fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v2\n\t" + "flh fa1, 18(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v2\n\t" + "flh fa2, 20(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v2\n\t" + "flh fa3, 22(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 24(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flh ft1, 26(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flh ft2, 28(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flh ft3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi 
t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v26, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v28, fa2, v2\n\t" + 
"flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v30, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, fa0, v2\n\t" + "flh ft0, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, fa1, v2\n\t" + "flh ft1, 6(%[input_ptr])\n\t" + "addi 
%[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v30, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, fa0, v2\n\t" + "flh ft0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", 
"fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m32n8_fp16_v256(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int p = 0; p + 31 < out_ch; p += 32) { + __fp16 *output0_tm = output + p * area * tiles; // 8 channel dot output + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 32; + + asm volatile( + "li t0, 32\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + "vfmacc.vf v16, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flh fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v2\n\t" + "flh fa1, 18(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v2\n\t" + "flh fa2, 20(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v2\n\t" + "flh fa3, 22(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + 
"vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 24(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flh ft1, 26(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flh ft2, 28(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flh ft3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 32 + + "vse16.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 32; + + asm volatile( + "li t0, 32\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // 
clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + "vfmacc.vf v24, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v26, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v28, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v30, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 32 + + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 32; + + asm volatile( + "li t0, 32\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, 
%[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + "vfmacc.vf v28, fa0, v2\n\t" + "flh ft0, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, fa1, v2\n\t" + "flh ft1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + "vfmacc.vf v28, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v30, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 32 + + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 32; + + asm volatile( + "li t0, 32\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + "vfmacc.vf v30, fa0, v2\n\t" + "flh ft0, 2(%[input_ptr])\n\t" + "addi 
%[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 32 + + "vfmacc.vf v30, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 32 + + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "ft0", "t0"); + } + } + } +} + +/* + * Winograd b6f3 output transform for channels packed in groups of pack2n. + * Each 8x8 tile read from src is reduced to a 6x6 tile written to dst, and bias is added + * to every output element (a zero vector is used when bias is NULL). + * Channels past the last full group of pack2n are not processed here. + */ +static inline void wg_b6f3s1_trans_output_pack16_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + The arithmetic in this function uses the coefficients of the second matrix. + */ + const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2; + const int vl = vsetvl_e16m2(pack2n); + int tiles = blk_h * blk_w; + for (int p = 0; p + pack2n - 1 < ch; p += pack2n) { + const __fp16 *out0_tm = src + p * 64 * tiles; // p-th channel, before output transform (after dot) + __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w; // p-th channel, output after transform + + __fp16 tmp[6][8][pack2n]; + + vfloat16m2_t _bias = bias ?
vle16_v_f16m2(bias + p, vl) : vfmv_v_f_f16m2(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * pack2n; // start address of this 8x8 tile + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * pack2n * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * pack2n * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * pack2n * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * pack2n * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * pack2n * 5; + const __fp16 *output0_tm_6 = output0_tm_0 + tiles * pack2n * 6; + const __fp16 *output0_tm_7 = output0_tm_0 + tiles * pack2n * 7; + + __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * pack2n; // out 6*6 addr + + // first pass: for each of the 8 groups m, fold 8 loaded vectors into 6 values kept in tmp[0..5][m] + for (int m = 0; m < 8; m++) { + vfloat16m2_t _r00 = vle16_v_f16m2(output0_tm_0, vl); + vfloat16m2_t _r01 = vle16_v_f16m2(output0_tm_1, vl); + vfloat16m2_t _r02 = vle16_v_f16m2(output0_tm_2, vl); + vfloat16m2_t _r03 = vle16_v_f16m2(output0_tm_3, vl); + vfloat16m2_t _r04 = vle16_v_f16m2(output0_tm_4, vl); + vfloat16m2_t _r05 = vle16_v_f16m2(output0_tm_5, vl); + vfloat16m2_t _r06 = vle16_v_f16m2(output0_tm_6, vl); + vfloat16m2_t _r07 = vle16_v_f16m2(output0_tm_7, vl); + + vfloat16m2_t _tmp024a = vfadd_vv_f16m2(_r01, _r02, vl); + vfloat16m2_t _tmp135a = vfsub_vv_f16m2(_r01, _r02, vl); + + vfloat16m2_t _tmp024b = vfadd_vv_f16m2(_r03, _r04, vl); + vfloat16m2_t _tmp135b = vfsub_vv_f16m2(_r03, _r04, vl); + + vfloat16m2_t _tmp024c = vfadd_vv_f16m2(_r05, _r06, vl); + vfloat16m2_t _tmp135c = vfsub_vv_f16m2(_r05, _r06, vl); + + vfloat16m2_t _tmp0m = + vfadd_vv_f16m2(vfadd_vv_f16m2(_r00, _tmp024a, vl), + vfmacc_vf_f16m2(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m2_t _tmp2m = vfmacc_vf_f16m2( + vfmacc_vf_f16m2(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m2_t _tmp4m = vfmacc_vf_f16m2( +
vfmacc_vf_f16m2(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m2_t _tmp3m = vfmacc_vf_f16m2( + vfmacc_vf_f16m2(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m2_t _tmp5m = + vfadd_vv_f16m2(vfadd_vv_f16m2(_r07, _tmp135a, vl), + vfmacc_vf_f16m2(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse16_v_f16m2(tmp[0][m], _tmp0m, vl); + vse16_v_f16m2(tmp[2][m], _tmp2m, vl); + vse16_v_f16m2(tmp[4][m], _tmp4m, vl); + vse16_v_f16m2(tmp[1][m], _tmp1m, vl); + vse16_v_f16m2(tmp[3][m], _tmp3m, vl); + vse16_v_f16m2(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * pack2n * 8; + output0_tm_1 += tiles * pack2n * 8; + output0_tm_2 += tiles * pack2n * 8; + output0_tm_3 += tiles * pack2n * 8; + output0_tm_4 += tiles * pack2n * 8; + output0_tm_5 += tiles * pack2n * 8; + output0_tm_6 += tiles * pack2n * 8; + output0_tm_7 += tiles * pack2n * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat16m2_t _tmp00 = vle16_v_f16m2(tmp[m][0], vl); + vfloat16m2_t _tmp01 = vle16_v_f16m2(tmp[m][1], vl); + vfloat16m2_t _tmp02 = vle16_v_f16m2(tmp[m][2], vl); + vfloat16m2_t _tmp03 = vle16_v_f16m2(tmp[m][3], vl); + vfloat16m2_t _tmp04 = vle16_v_f16m2(tmp[m][4], vl); + vfloat16m2_t _tmp05 = vle16_v_f16m2(tmp[m][5], vl); + vfloat16m2_t _tmp06 = vle16_v_f16m2(tmp[m][6], vl); + vfloat16m2_t _tmp07 = vle16_v_f16m2(tmp[m][7], vl); + + vfloat16m2_t _tmp024a = vfadd_vv_f16m2(_tmp01, _tmp02, vl); + vfloat16m2_t _tmp135a = vfsub_vv_f16m2(_tmp01, _tmp02, vl); + + vfloat16m2_t _tmp024b = vfadd_vv_f16m2(_tmp03, _tmp04, vl); + vfloat16m2_t _tmp135b = vfsub_vv_f16m2(_tmp03, _tmp04, vl); + + vfloat16m2_t _tmp024c = vfadd_vv_f16m2(_tmp05, _tmp06, vl); + vfloat16m2_t _tmp135c = vfsub_vv_f16m2(_tmp05, _tmp06, vl); + + vfloat16m2_t _output00 = + vfadd_vv_f16m2(vfadd_vv_f16m2(_tmp00, _tmp024a, vl), + vfmacc_vf_f16m2(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m2_t _output02 = vfmacc_vf_f16m2( + vfmacc_vf_f16m2(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m2_t _output04 = vfmacc_vf_f16m2( + 
vfmacc_vf_f16m2(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m2_t _output01 = vfmacc_vf_f16m2( + vfmacc_vf_f16m2(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m2_t _output03 = vfmacc_vf_f16m2( + vfmacc_vf_f16m2(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m2_t _output05 = + vfadd_vv_f16m2(vfadd_vv_f16m2(_tmp07, _tmp135a, vl), + vfmacc_vf_f16m2(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f16m2(_bias, _output00, vl); + _output01 = vfadd_vv_f16m2(_bias, _output01, vl); + _output02 = vfadd_vv_f16m2(_bias, _output02, vl); + _output03 = vfadd_vv_f16m2(_bias, _output03, vl); + _output04 = vfadd_vv_f16m2(_bias, _output04, vl); + _output05 = vfadd_vv_f16m2(_bias, _output05, vl); + + vse16_v_f16m2(output0, _output00, vl); + vse16_v_f16m2(output0 + pack2n * 2, _output02, vl); + vse16_v_f16m2(output0 + pack2n * 4, _output04, vl); + vse16_v_f16m2(output0 + pack2n * 1, _output01, vl); + vse16_v_f16m2(output0 + pack2n * 3, _output03, vl); + vse16_v_f16m2(output0 + pack2n * 5, _output05, vl); + + output0 += blk_w * 6 * pack2n; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/8, 36, I, 8] + * constrain: output channel % 8 = 0 + * input channel % 8 = 0 + * // TODO: remove useless code for unsatisfactory performance + ******************************************************************************************/ +void shl_c908_wg_b4f3s1_trans_kernel_pack8_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + + // kernel transform matrix: G + const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, 
-1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/8, 6*6, I, 8] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 8 * 36 * inch * 8 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + for (int oc = 0; oc + 7 < outch; oc += 8) { + const __fp16 *k0 = kernel_tm + (oc + 0) * inch * 36; + const __fp16 *k1 = kernel_tm + (oc + 1) * inch * 36; + const __fp16 *k2 = kernel_tm + (oc + 2) * inch * 36; + const __fp16 *k3 = kernel_tm + (oc + 3) * inch * 36; + const __fp16 *k4 = kernel_tm + (oc + 4) * inch * 36; + const __fp16 *k5 = kernel_tm + (oc + 5) * inch * 36; + const __fp16 *k6 = kernel_tm + (oc + 6) * inch * 36; + const __fp16 *k7 = kernel_tm + (oc + 7) * inch * 36; + + __fp16 *g0 = kernel_tm_packn + oc * inch * 36; + + for (int t = 0; t < 36; t++) { + __fp16 *g00 = g0 + t * inch * 8; + + for (int ic = 0; ic < inch; ic++) { + const __fp16 *k00 = k0 + ic * 36; + const __fp16 *k10 = k1 + ic * 36; + 
const __fp16 *k20 = k2 + ic * 36; + const __fp16 *k30 = k3 + ic * 36; + const __fp16 *k40 = k4 + ic * 36; + const __fp16 *k50 = k5 + ic * 36; + const __fp16 *k60 = k6 + ic * 36; + const __fp16 *k70 = k7 + ic * 36; + + g00[0] = k00[t]; + g00[1] = k10[t]; + g00[2] = k20[t]; + g00[3] = k30[t]; + g00[4] = k40[t]; + g00[5] = k50[t]; + g00[6] = k60[t]; + g00[7] = k70[t]; + g00 += 8; + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 8 = 0 + * input channel % 8 = 0 + * // TODO: remove useless code for unsatisfactory performance + ******************************************************************************************/ +int shl_c908_wg_b4f3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + /****************************** bias *****************************/ + bool flag_bias = 1; // default: conv2d layer 
include bias + if (bias_data == NULL) { + flag_bias = 0; + // NOTE(review): this substitute bias buffer is never written in this function; + // presumably shl_mem_alloc returns zeroed memory - confirm + bias_data = (__fp16 *)shl_mem_alloc(out_c * sizeof(__fp16)); + } + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/8 h w 8] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 36, tiles, 8] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_input_pack8_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/16, in_c, 16] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile16_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 36, tiles, 8] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_m8n16_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/8, out_h4, out_w4, 8] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_output_pack8_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) +
winograd_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + if (!flag_bias) { + shl_mem_free(bias_data); + bias_data = NULL; + } + return CSINN_TRUE; +} + +// TODO: remove useless code for unsatisfactory performance +void shl_c908_wg_b6f3s1_trans_kernel_pack8_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + // kernel transform matrix: G + const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + 
__fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 8 * inch * 8 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + for (int oc = 0; oc + 7 < outch; oc += 8) { + const __fp16 *k0 = kernel_tm + (oc + 0) * inch * 64; + const __fp16 *k1 = kernel_tm + (oc + 1) * inch * 64; + const __fp16 *k2 = kernel_tm + (oc + 2) * inch * 64; + const __fp16 *k3 = kernel_tm + (oc + 3) * inch * 64; + const __fp16 *k4 = kernel_tm + (oc + 4) * inch * 64; + const __fp16 *k5 = kernel_tm + (oc + 5) * inch * 64; + const __fp16 *k6 = kernel_tm + (oc + 6) * inch * 64; + const __fp16 *k7 = kernel_tm + (oc + 7) * inch * 64; + + __fp16 *g0 = kernel_tm_packn + oc * inch * 64; + + for (int t = 0; t < 64; t++) { + __fp16 *g00 = g0 + t * inch * 8; + + for (int ic = 0; ic < inch; ic++) { + const __fp16 *k00 = k0 + ic * 64; + const __fp16 *k10 = k1 + ic * 64; + const __fp16 *k20 = k2 + ic * 64; + const __fp16 *k30 = k3 + ic * 64; + const __fp16 *k40 = k4 + ic * 64; + const __fp16 *k50 = k5 + ic * 64; + const __fp16 *k60 = k6 + ic * 64; + const __fp16 *k70 = k7 + ic * 64; + + g00[0] = k00[t]; + g00[1] = k10[t]; + g00[2] = k20[t]; + g00[3] = k30[t]; + g00[4] = k40[t]; + g00[5] = k50[t]; + g00[6] = k60[t]; + g00[7] = k70[t]; + g00 += 8; + } + } + } + shl_mem_free(kernel_tm); +} + +// TODO: remove useless code for unsatisfactory performance +int shl_c908_wg_b6f3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = 
kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + /****************************** bias *****************************/ + bool flag_bias = 1; // default: conv2d layer include bias + if (bias_data == NULL) { + flag_bias = 0; + bias_data = (__fp16 *)shl_mem_alloc(out_c * sizeof(__fp16)); + } + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/8 h w 8] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 64, tiles, 8] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_input_pack8_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/16, in_c, 16] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile16_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 64, tiles, 8] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * 64 * tiles * 
8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_m8n16_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/8, out_h6, out_w6, 8] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_output_pack8_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + if (!flag_bias) { + shl_mem_free(bias_data); + bias_data = NULL; + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * constrain: output channel % 16 = 0 + * input channel % 8 = 0 + ******************************************************************************************/ +void shl_c908_wg_b4f3s1_trans_kernel_pack16_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + + // kernel transform matrix: G + const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 
+ q * 36; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/16, 6*6, I, 16] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 16 * 36 * inch * 16 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2; + + for (int oc = 0; oc < outch / pack2n; oc++) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch * pack2n; + + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + + // visit every input channel: inch is only required to be a multiple of 8, so + // stepping in whole groups of pack2n would leave trailing channels unpacked + // (same element order as before whenever inch % pack2n == 0) + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + const __fp16 *k00 = kernel_tm + (oc * pack2n + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 16 = 0 + * input channel % 8 = 0 + ******************************************************************************************/ +int shl_c908_wg_b4f3s1_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16
*)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/8 h w 8] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 36, tiles, 8] + // sized in pack8 units because the pack8 input transform fills it and in_c is only + // required to be a multiple of 8; in_c / 16 * 16 under-allocates when in_c % 16 == 8 + __fp16 *input_tm1_buf = + (__fp16 *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_input_pack8_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile8_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf:
[out_c/16, 36, tiles, 16] + const int vlen = csrr_vlenb() * 8; + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c / 16 * 36 * tiles * 16 * sizeof(__fp16)); + if (vlen == 128) { + wg_bxf3s1_batch_gemm_m16n8_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m32n8_fp16_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 36); + } + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/16, out_h4, out_w4, 16] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 16 * tiles * 4 * 4 * 16 * sizeof(__fp16)); + wg_b4f3s1_trans_output_pack16_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, + block_h, block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack16to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +void shl_c908_wg_b6f3s1_trans_kernel_pack16_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + // kernel transform matrix: G + const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch 
* 9 + q * 9; + __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transposed matrix tmp = (g * GT)T + __fp16 tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + // [O, I, 8, 8] --> [O/16, 8*8, I, 16] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 16 * inch * 16 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2; + + for (int oc = 0; oc < outch / pack2n; oc++) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch * pack2n; + + for (int k = 0; k < 64; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + + for (int ic = 0; ic < inch / pack2n; ic++) { + for (int i = 0; i < pack2n; i++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = + kernel_tm + (oc * pack2n + j) * 64 * inch + (ic * pack2n + i) * 64; + *g00++ = k00[k]; + } + } + } + } + } + shl_mem_free(kernel_tm); +} + +int shl_c908_wg_b6f3s1_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int 
in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/8 h w 8] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 64, tiles, 8] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_input_pack8_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile8_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/16, 64, tiles, 16] + const int vlen = csrr_vlenb() * 8; + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c / 16 * 64 * tiles * 16 * sizeof(__fp16)); + if (vlen == 128) { + 
wg_bxf3s1_batch_gemm_m16n8_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m32n8_fp16_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 64); + } + + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/16, out_h6, out_w6, 16] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 16 * tiles * 6 * 6 * 16 * sizeof(__fp16)); + wg_b6f3s1_trans_output_pack16_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, + block_h, block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack16to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +void shl_c908_conv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + /* todo: direct conv2d */ +} + +void shl_c908_conv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + /* todo: direct conv2d */ +} diff --git a/source/c908_opt/convolution_3x3_fp16_packn.c b/source/c908_opt/convolution_3x3_fp16_packn.c new file mode 100644 index 00000000..e3743df2 --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp16_packn.c @@ -0,0 +1,1044 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ +#ifdef NNN + +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +static void winograd_pad_input_packn_fp16(const __fp16 *input, __fp16 *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, int pad_top, + int pad_left) +{ + shl_rvv_pad_input_packn_fp16(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left); +} + +static void winograd_crop_output_packn_fp16(const __fp16 *output_trans, __fp16 *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + __fp16 *out_tm_ptr = (__fp16 *)output_trans + c * crop_size; + __fp16 *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + __fp16 *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _tmp = vle16_v_f16m1(crop_ptr, vl); + crop_ptr += packn; + vse16_v_f16m1(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + 
{ 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + __fp16 tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 6*6 start addr + const __fp16 *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + + vfloat16m1_t _tmp0m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r04, _r03, vl), -4.f, + vfadd_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r03, vl), 4.f, + vfsub_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), -2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), 2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp5m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + __fp16 *r0_tm0 = r0_tm; + 
__fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _r0tm0 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat16m1_t _r0tm1 = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm2 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm3 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm4 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm5 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 
tiles; // pre-transform output (after dot) of channel p + __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w; // transformed output of channel p + + __fp16 tmp[4][6][packn]; + + vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 start addr + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * ch * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * ch * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * ch * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * ch * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * ch * 5; + + __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _tmp3m = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * ch * 6; + output0_tm_1 += tiles * ch * 6; + output0_tm_2 += tiles * ch * 6; + output0_tm_3 += tiles * ch * 6; + output0_tm_4 
+= tiles * ch * 6; + output0_tm_5 += tiles * ch * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _out00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _out01 = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _out02 = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _out03 = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f16m1(_bias, _out00, vl); + _out01 = vfadd_vv_f16m1(_bias, _out01, vl); + _out02 = vfadd_vv_f16m1(_bias, _out02, vl); + _out03 = vfadd_vv_f16m1(_bias, _out03, vl); + + vse16_v_f16m1(output0, _out00, vl); + vse16_v_f16m1(output0 + packn * 1, _out01, vl); + vse16_v_f16m1(output0 + packn * 2, _out02, vl); + vse16_v_f16m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_fp16(const __fp16 *src, __fp16 *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + __fp16 *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _a0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _a1 = vle16_v_f16m1(tm1 + packn * 1, vl); + 
vfloat16m1_t _a2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _a3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _a4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _a5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _a6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _a7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vfloat16m1_t _a8 = vle16_v_f16m1(tm1 + packn * 8, vl); + vfloat16m1_t _a9 = vle16_v_f16m1(tm1 + packn * 9, vl); + vfloat16m1_t _a10 = vle16_v_f16m1(tm1 + packn * 10, vl); + vfloat16m1_t _a11 = vle16_v_f16m1(tm1 + packn * 11, vl); + + vsse16_v_f16m1(img_tm2, 12 * sizeof(__fp16), _a0, vl); + vsse16_v_f16m1(img_tm2 + 1, 12 * sizeof(__fp16), _a1, vl); + vsse16_v_f16m1(img_tm2 + 2, 12 * sizeof(__fp16), _a2, vl); + vsse16_v_f16m1(img_tm2 + 3, 12 * sizeof(__fp16), _a3, vl); + vsse16_v_f16m1(img_tm2 + 4, 12 * sizeof(__fp16), _a4, vl); + vsse16_v_f16m1(img_tm2 + 5, 12 * sizeof(__fp16), _a5, vl); + vsse16_v_f16m1(img_tm2 + 6, 12 * sizeof(__fp16), _a6, vl); + vsse16_v_f16m1(img_tm2 + 7, 12 * sizeof(__fp16), _a7, vl); + vsse16_v_f16m1(img_tm2 + 8, 12 * sizeof(__fp16), _a8, vl); + vsse16_v_f16m1(img_tm2 + 9, 12 * sizeof(__fp16), _a9, vl); + vsse16_v_f16m1(img_tm2 + 10, 12 * sizeof(__fp16), _a10, vl); + vsse16_v_f16m1(img_tm2 + 11, 12 * sizeof(__fp16), _a11, vl); + + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_f16m1(img_tm2, 
_tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + + vse16_v_f16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_pack2nx12_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int r = 0; r < area; r++) { + const __fp16 *kernel_ptr = kernel + r * out_ch * in_ch; + const __fp16 *input_ptr = input + r * tiles * in_ch; + __fp16 *output_ptr = output + r * tiles * out_ch; + + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_ptr, kernel_ptr, input_ptr, NULL, out_ch, in_ch, + tiles, false); + } +} + +static inline void wg_b6f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 
-1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + __fp16 tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 8*8 start addr + const __fp16 *r0 = img0 + (i * w * 6 + j * 6) * packn; + // input_tm1 8*8 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); + + vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f16m1(_r04, _r02, vl), vl); + vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f16m1(_r03, _r05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + 
vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = + vfmacc_vf_f16m1(_r06, 4.f, vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[7][m], _tmp7m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + vse16_v_f16m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + __fp16 *r0_tm6 = r0_tm5 + tiles * packn; + __fp16 *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); + vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); + + vfloat16m1_t _tmp12a = + 
vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = vfmacc_vf_f16m1( + _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm7, _r0tm7, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + vse16_v_f16m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = 
vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * tiles; // pre-transform output (after dot) of channel p + __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w; // transformed output of channel p + + __fp16 tmp[6][8][packn]; + + vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 start addr + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * ch * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * ch * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * ch * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * ch * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * ch * 5; + const __fp16 *output0_tm_6 = output0_tm_0 + tiles * ch * 6; + const __fp16 *output0_tm_7 = output0_tm_0 + tiles * ch * 7; + + __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + 
vfloat16m1_t _tmp2m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _tmp5m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * ch * 8; + output0_tm_1 += tiles * ch * 8; + output0_tm_2 += tiles * ch * 8; + output0_tm_3 += tiles * ch * 8; + output0_tm_4 += tiles * ch * 8; + output0_tm_5 += tiles * ch * 8; + output0_tm_6 += tiles * ch * 8; + output0_tm_7 += tiles * ch * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); + + vfloat16m1_t _output00 = + 
vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _output02 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _output04 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _output01 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _output03 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _output05 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f16m1(_bias, _output00, vl); + _output01 = vfadd_vv_f16m1(_bias, _output01, vl); + _output02 = vfadd_vv_f16m1(_bias, _output02, vl); + _output03 = vfadd_vv_f16m1(_bias, _output03, vl); + _output04 = vfadd_vv_f16m1(_bias, _output04, vl); + _output05 = vfadd_vv_f16m1(_bias, _output05, vl); + + vse16_v_f16m1(output0, _output00, vl); + vse16_v_f16m1(output0 + packn * 2, _output02, vl); + vse16_v_f16m1(output0 + packn * 4, _output04, vl); + vse16_v_f16m1(output0 + packn * 1, _output01, vl); + vse16_v_f16m1(output0 + packn * 3, _output03, vl); + vse16_v_f16m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [36, O/pack2n, I, pack2n] --> [36, O/packn, I, packn] + * constraint: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 
*)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + + // kernel transform matrix: G + const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transposed matrix tmp = (g * GT)T + __fp16 tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd42 + // [O, I, 6, 6] --> [6*6, O/pack2n, I, pack2n] / [6*6, O/packn, I, packn] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(36 * outch / 4 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + for (int k = 0; k < 36; k++) { + __fp16 *g0 = kernel_tm_packn + k * outch * inch; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } 
+ for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_packn_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); 
+ + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile12_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(36 * out_c / 8 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_pack2nx12_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [64, O/pack2n, I, pack2n] --> [64, O/pack, I, 
packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + // kernel transform matrix: G + const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const __fp16 ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for 
(int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + // [O, I, 8, 8] --> [8*8, O/pack2n, I, pack2n] / [8*8, O/packn, I, packn] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + for (int k = 0; k < 64; k++) { + __fp16 *g0 = kernel_tm_packn + k * outch * inch; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + 
+ int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_packn_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile12_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [64, out_c/packn, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(64 * out_c / 8 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_pack2nx12_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform 
output *****************************/ + // output_tm1_buf: [out_c/packn, out_h6, out_w6, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +#endif \ No newline at end of file diff --git a/source/c908_opt/convolution_3x3_fp16_packn_1.c b/source/c908_opt/convolution_3x3_fp16_packn_1.c new file mode 100644 index 00000000..928e5b5b --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp16_packn_1.c @@ -0,0 +1,2310 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +/* CSI-NN2 version 2.0.x */ +// #ifdef NNN + +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ + +/****************************************************************************************** + * padding input for winograd input transform + * input layout: [n c/packn h w packn] + * input_padded layout: [n c/packn h w packn] + * constrain: input channel % packn = 0 + * packn = vlen / sizeof(__fp16) + ******************************************************************************************/ +static void winograd_pad_input_packn_fp16(const __fp16 *input, __fp16 *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, int pad_top, + int pad_left) +{ + shl_rvv_pad_input_packn_fp16(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left); +} + +static void winograd_crop_output_packn_fp16(const __fp16 *output_trans, __fp16 *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + __fp16 *out_tm_ptr = (__fp16 *)output_trans + c * crop_size; + __fp16 *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + __fp16 *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _tmp = vle16_v_f16m1(crop_ptr, vl); + crop_ptr += packn; + vse16_v_f16m1(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 
} + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + __fp16 tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 6*6 start addr + const __fp16 *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + + vfloat16m1_t _tmp0m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r04, _r03, vl), -4.f, + vfadd_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r03, vl), 4.f, + vfsub_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), -2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), 2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp5m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = 
r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _r0tm0 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat16m1_t _r0tm1 = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm2 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm3 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm4 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm5 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 36 * tiles; // 
dot output before the output transform, channel p + __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w; // output after the transform, channel p + + __fp16 tmp[4][6][packn]; + + vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 start addr + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _tmp3m = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6;
+ output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _out00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _out01 = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _out02 = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _out03 = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f16m1(_bias, _out00, vl); + _out01 = vfadd_vv_f16m1(_bias, _out01, vl); + _out02 = vfadd_vv_f16m1(_bias, _out02, vl); + _out03 = vfadd_vv_f16m1(_bias, _out03, vl); + + vse16_v_f16m1(output0, _out00, vl); + vse16_v_f16m1(output0 + packn * 1, _out01, vl); + vse16_v_f16m1(output0 + packn * 2, _out02, vl); + vse16_v_f16m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_fp16(const __fp16 *src, __fp16 *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + __fp16 *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _a0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _a1 = vle16_v_f16m1(tm1 + 
packn * 1, vl); + vfloat16m1_t _a2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _a3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _a4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _a5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _a6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _a7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vfloat16m1_t _a8 = vle16_v_f16m1(tm1 + packn * 8, vl); + vfloat16m1_t _a9 = vle16_v_f16m1(tm1 + packn * 9, vl); + vfloat16m1_t _a10 = vle16_v_f16m1(tm1 + packn * 10, vl); + vfloat16m1_t _a11 = vle16_v_f16m1(tm1 + packn * 11, vl); + + vsse16_v_f16m1(img_tm2, 12 * sizeof(__fp16), _a0, vl); + vsse16_v_f16m1(img_tm2 + 1, 12 * sizeof(__fp16), _a1, vl); + vsse16_v_f16m1(img_tm2 + 2, 12 * sizeof(__fp16), _a2, vl); + vsse16_v_f16m1(img_tm2 + 3, 12 * sizeof(__fp16), _a3, vl); + vsse16_v_f16m1(img_tm2 + 4, 12 * sizeof(__fp16), _a4, vl); + vsse16_v_f16m1(img_tm2 + 5, 12 * sizeof(__fp16), _a5, vl); + vsse16_v_f16m1(img_tm2 + 6, 12 * sizeof(__fp16), _a6, vl); + vsse16_v_f16m1(img_tm2 + 7, 12 * sizeof(__fp16), _a7, vl); + vsse16_v_f16m1(img_tm2 + 8, 12 * sizeof(__fp16), _a8, vl); + vsse16_v_f16m1(img_tm2 + 9, 12 * sizeof(__fp16), _a9, vl); + vsse16_v_f16m1(img_tm2 + 10, 12 * sizeof(__fp16), _a10, vl); + vsse16_v_f16m1(img_tm2 + 11, 12 * sizeof(__fp16), _a11, vl); + + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + + 
vsseg8e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + + vse16_v_f16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_pack2nx12_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + const int vl = vsetvl_e16m1(packn); + int p = 0; + for (; p + pack2n - 1 < out_ch; p += pack2n) { + __fp16 *output0_tm = output + p * area * tiles; // pack2n channel dot output + __fp16 *output1_tm = output0_tm + packn * area * tiles; + + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // pack2n channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2, r-th channel + int t = 0; + for (; t + 11 < tiles; t += 12) {
+ const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + "flh ft4, 8(%[input_ptr])\n\t" + "flh ft5, 10(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flh fa0, 12(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flh fa1, 14(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flh fa2, 16(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flh fa3, 18(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v3\n\t" + "vfmacc.vf v24, ft4, v4\n\t" + "flh fa4, 20(%[input_ptr])\n\t" + "vfmacc.vf v13, 
ft5, v3\n\t" + "vfmacc.vf v25, ft5, v4\n\t" + "flh fa5, 22(%[input_ptr])\n\t" + "vfmacc.vf v14, fa0, v3\n\t" + "vfmacc.vf v26, fa0, v4\n\t" + "flh ft0, 24(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v3\n\t" + "vfmacc.vf v27, fa1, v4\n\t" + "flh ft1, 26(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v3\n\t" + "vfmacc.vf v28, fa2, v4\n\t" + "flh ft2, 28(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v3\n\t" + "vfmacc.vf v29, fa3, v4\n\t" + "flh ft3, 30(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v3\n\t" + "vfmacc.vf v30, fa4, v4\n\t" + "flh ft4, 32(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v3\n\t" + "vfmacc.vf v31, fa5, v4\n\t" + "flh ft5, 34(%[input_ptr])\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "vfmacc.vf v20, ft0, v6\n\t" + "flh fa0, 36(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "vfmacc.vf v21, ft1, v6\n\t" + "flh fa1, 38(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "vfmacc.vf v22, ft2, v6\n\t" + "flh fa2, 40(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "vfmacc.vf v23, ft3, v6\n\t" + "flh fa3, 42(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v5\n\t" + "vfmacc.vf v24, ft4, v6\n\t" + "flh fa4, 44(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v5\n\t" + "vfmacc.vf v25, ft5, v6\n\t" + "flh fa5, 46(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 48\n\t" + "vfmacc.vf v14, fa0, v5\n\t" + "vfmacc.vf v26, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v5\n\t" + "vfmacc.vf v27, fa1, v6\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v5\n\t" + "vfmacc.vf v28, fa2, v6\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v5\n\t" + "vfmacc.vf v29, fa3, v6\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v5\n\t" + "vfmacc.vf v30, fa4, v6\n\t" + "flh ft4, 8(%[input_ptr])\n\t" + "vfmacc.vf v19, 
fa5, v5\n\t" + "vfmacc.vf v31, fa5, v6\n\t" + "flh ft5, 10(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v16, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v17, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v18, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v19, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v24, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v25, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v26, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v27, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v28, (%[output_ptr1])\n\t" + "add %[output_ptr1], 
%[output_ptr1], %[step]\n\t" + "vse16.v v29, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v30, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v31, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "fa0", "fa1", "fa2", + "fa3", "fa4", "fa5", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], 
%[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flh fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flh fa1, 10(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flh fa2, 12(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flh fa3, 14(%[input_ptr])\n\t" + "vfmacc.vf v12, fa0, v3\n\t" + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v3\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 18(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v3\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 20(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v3\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 22(%[input_ptr])\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "vfmacc.vf v20, ft0, v6\n\t" + "flh fa0, 24(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "vfmacc.vf v21, ft1, v6\n\t" + "flh fa1, 26(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "vfmacc.vf v22, ft2, v6\n\t" + "flh fa2, 28(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "vfmacc.vf v23, ft3, v6\n\t" + "flh fa3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vfmacc.vf v12, fa0, v5\n\t" + "vfmacc.vf v24, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v5\n\t" + "vfmacc.vf v25, fa1, v6\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v5\n\t" + "vfmacc.vf v26, fa2, v6\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "vfmacc.vf v27, fa3, v6\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v 
v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v24, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v25, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v26, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v27, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + 
"vmv.v.x v11, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flh fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flh fa1, 10(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flh fa2, 12(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flh fa3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "vfmacc.vf v21, fa1, v6\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v10, fa2, v5\n\t" + "vfmacc.vf v22, fa2, v6\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v11, fa3, v5\n\t" + "vfmacc.vf v23, fa3, v6\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + 
"add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v20", + "v21", "v22", "v23", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flh fa0, 4(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flh fa1, 6(%[input_ptr])\n\t" + 
"addi %[input_ptr], %[input_ptr], 8\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "vfmacc.vf v21, fa1, v6\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v20", "v21", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v20, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + 
"flh fa0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v20", "fa0", "ft0", "t0"); + } + } + } + + for (; p + packn - 1 < out_ch; p += packn) { + __fp16 *output0_tm = output + p * area * tiles; // 4 channel dot output + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 4 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 11 < tiles; t += 12) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 
6(%[input_ptr])\n\t" + "flh ft4, 8(%[input_ptr])\n\t" + "flh ft5, 10(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flh fa0, 12(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flh fa1, 14(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flh fa2, 16(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flh fa3, 18(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v3\n\t" + "flh fa4, 20(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v3\n\t" + "flh fa5, 22(%[input_ptr])\n\t" + "vfmacc.vf v14, fa0, v3\n\t" + "flh ft0, 24(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v3\n\t" + "flh ft1, 26(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v3\n\t" + "flh ft2, 28(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v3\n\t" + "flh ft3, 30(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v3\n\t" + "flh ft4, 32(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v3\n\t" + "flh ft5, 34(%[input_ptr])\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "flh fa0, 36(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "flh fa1, 38(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "flh fa2, 40(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "flh fa3, 42(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v5\n\t" + "flh fa4, 44(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v5\n\t" + "flh fa5, 46(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 48\n\t" + "vfmacc.vf v14, fa0, v5\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v5\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v5\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v5\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v5\n\t" + "flh ft4, 8(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v5\n\t" + "flh ft5, 10(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, 
(%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v16, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v17, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v18, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v19, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", + "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 
0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flh fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flh fa1, 10(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flh fa2, 12(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flh fa3, 14(%[input_ptr])\n\t" + "vfmacc.vf v12, fa0, v3\n\t" + "flh ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v3\n\t" + "flh ft1, 18(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v3\n\t" + "flh ft2, 20(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v3\n\t" + "flh ft3, 22(%[input_ptr])\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "flh fa0, 24(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "flh fa1, 26(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "flh fa2, 28(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "flh fa3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vfmacc.vf v12, fa0, v5\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v5\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v5\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v13, (%[output_ptr0])\n\t" + "add 
%[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flh fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flh fa1, 10(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flh fa2, 12(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flh fa3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v10, fa2, v5\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v11, fa3, v5\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + 
"add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "fa0", "fa1", "fa2", + "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flh fa0, 4(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flh fa1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v5", "v8", "v9", 
"fa0", "fa1", "ft0", "ft1", "t0");
+            }
+            for (; t < tiles; t++) {
+                const __fp16 *k0 = kernel0_tm + r * in_ch * packn;
+
+                asm volatile(
+                    "vsetvli        zero, %[step], e16, m1\n\t"
+                    "srai           t0, %[inch], 1\n\t"  // t0 = in_c / 2
+
+                    "vmv.v.x        v8, zero\n\t"
+
+                    // pre-load kernel matrix
+                    "vle16.v        v3, (%[kernel_ptr])\n\t"
+                    "add            %[kernel_ptr], %[kernel_ptr], %[step]\n\t"  // kernel_ptr +=
+                                                                                // packn
+
+                    // pre-load input matrix
+                    "flh            ft0, 0(%[input_ptr])\n\t"
+
+                    "1:\n\t"  // m8n12k2
+                    "vle16.v        v5, (%[kernel_ptr])\n\t"
+                    "add            %[kernel_ptr], %[kernel_ptr], %[step]\n\t"  // kernel_ptr += 8
+
+                    "vfmacc.vf      v8, ft0, v3\n\t"
+                    "flh            fa0, 2(%[input_ptr])\n\t"
+                    "addi           %[input_ptr], %[input_ptr], 4\n\t"
+
+                    "vle16.v        v3, (%[kernel_ptr])\n\t"
+                    "add            %[kernel_ptr], %[kernel_ptr], %[step]\n\t"  // kernel_ptr +=
+                                                                                // packn
+
+                    "vfmacc.vf      v8, fa0, v5\n\t"
+                    "flh            ft0, 0(%[input_ptr])\n\t"
+
+                    "addi           t0, t0, -1\n\t"
+                    "bnez           t0, 1b\n\t"
+
+                    "vse16.v        v8, (%[output_ptr0])\n\t"
+                    "add            %[output_ptr0], %[output_ptr0], %[step]\n\t"
+
+                    : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm)
+                    : [inch] "r"(in_ch), [step] "r"(packn * 2)
+                    : "cc", "memory", "v3", "v5", "v8", "fa0", "ft0", "t0");
+            }
+        }
+    }
+}
+
+/*
+ * Winograd input transform (matrix BT below) for 8x8 tiles of fp16 data stored
+ * with packn channels interleaved.
+ *
+ * src   : padded feature map, indexed here as [ch/packn][h][w][packn]
+ * dst   : transformed tiles, written as [ch/packn][64][tiles][packn] with
+ *         tiles = blk_h * blk_w
+ * ch    : channel count; only whole groups of packn channels are processed
+ *         (loop condition q + packn - 1 < ch), a remainder is left untouched
+ * h, w  : height / width of src; tile (i, j) reads rows i*6 .. i*6+7 and
+ *         columns j*6 .. j*6+7, so the caller must supply h >= 6*blk_h + 2 and
+ *         w >= 6*blk_w + 2
+ * blk_h, blk_w : number of tiles vertically / horizontally
+ *
+ * Each tile is processed in two passes: pass 1 applies BT to every tile row and
+ * stores the result transposed in tmp; pass 2 applies BT to every row of tmp
+ * and scatters the 64 coefficients to dst with a stride of tiles * packn.
+ */
+static inline void wg_b6f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h,
+                                                    int w, int blk_h, int blk_w)
+{
+    /* input transform matrix
+    BT = {
+        { 1   0    -5.25    0    5.25     0    -1  0 };
+        { 0   1      1    -4.25  -4.25    1     1  0 };
+        { 0  -1      1     4.25  -4.25   -1     1  0 };
+        { 0  0.5    0.25  -2.5   -1.25    2     1  0 };
+        { 0 -0.5    0.25   2.5   -1.25   -2     1  0 };
+        { 0   2      4    -2.5    -5     0.5    1  0 };
+        { 0  -2      4     2.5    -5    -0.5    1  0 };
+        { 0  -1      0     5.25    0    -5.25   0  1 }
+    };
+    */
+    // csrr_vlenb() is defined elsewhere; presumably it returns the vector register size in
+    // bytes, which makes packn the number of fp16 lanes per register - TODO confirm
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int vl = vsetvl_e16m1(packn);
+    int tiles = blk_h * blk_w;
+    for (int q = 0; q + packn - 1 < ch; q += packn) {
+        const __fp16 *img0 = src + q * h * w;    // feature map after padding - q channel
+        __fp16 *img0_tm = dst + q * 64 * tiles;  // transform and interleave - q channel
+
+        // variable-length array on the stack: pass-1 results, stored transposed
+        __fp16 tmp[8][8][packn];
+
+        for (int i = 0; i < blk_h; i++) {
+            for (int j = 0; j < blk_w; j++) {
+                // after padding 8*8 start addr
+                const __fp16 *r0 = img0 + (i * w * 6 + j * 6) * packn;
+                // input_tm1 8*8 block start addr
+                __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn;
+
+                // pass 1: one tile row per iteration.
+                // vfmacc_vf_f16m1(acc, s, v, vl) yields acc + s * v per lane.
+                for (int m = 0; m < 8; m++) {
+                    vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl);
+                    vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl);
+                    vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl);
+                    vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl);
+                    vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl);
+                    vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl);
+                    vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl);
+                    vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl);
+
+                    // BT rows 0 and 7
+                    vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f,
+                                                          vfsub_vv_f16m1(_r04, _r02, vl), vl);
+                    vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f,
+                                                          vfsub_vv_f16m1(_r03, _r05, vl), vl);
+
+                    // BT rows 1 and 2: even-column part +/- odd-column part
+                    vfloat16m1_t _tmp12a =
+                        vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl);
+                    vfloat16m1_t _tmp12b =
+                        vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl);
+                    vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl);
+                    vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl);
+
+                    // BT rows 3 and 4
+                    vfloat16m1_t _tmp34a =
+                        vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl);
+                    vfloat16m1_t _tmp34b = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05,
+                        vl);
+                    vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl);
+                    vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl);
+
+                    // BT rows 5 and 6; _tmp56a = r06 + 4 * (r02 - 1.25 * r04)
+                    vfloat16m1_t _tmp56a =
+                        vfmacc_vf_f16m1(_r06, 4.f, vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl);
+                    vfloat16m1_t _tmp56b = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05,
+                        vl);
+                    vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl);
+                    vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl);
+
+                    // transposed store: result k of tile row m lands in tmp[k][m]
+                    vse16_v_f16m1(tmp[0][m], _tmp0m, vl);
+                    vse16_v_f16m1(tmp[7][m], _tmp7m, vl);
+                    vse16_v_f16m1(tmp[1][m], _tmp1m, vl);
+                    vse16_v_f16m1(tmp[2][m], _tmp2m, vl);
+                    vse16_v_f16m1(tmp[3][m], _tmp3m, vl);
+                    vse16_v_f16m1(tmp[4][m], _tmp4m, vl);
+                    vse16_v_f16m1(tmp[5][m], _tmp5m, vl);
+                    vse16_v_f16m1(tmp[6][m], _tmp6m, vl);
+
+                    r0 += w * packn;  // next row of the padded feature map
+                }
+
+                // pass 2: same transform on each row of tmp; coefficient (m * 8 + k) of the
+                // tile is written through r0_tm<k>
+                for (int m = 0; m < 8; m++) {
+                    __fp16 *r0_tm0 = r0_tm;
+                    __fp16 *r0_tm1 = r0_tm0 + tiles * packn;
+                    __fp16 *r0_tm2 = r0_tm1 + tiles * packn;
+                    __fp16 *r0_tm3 = r0_tm2 + tiles * packn;
+                    __fp16 *r0_tm4 = r0_tm3 + tiles * packn;
+                    __fp16 *r0_tm5 = r0_tm4 + tiles * packn;
+                    __fp16 *r0_tm6 = r0_tm5 + tiles * packn;
+                    __fp16 *r0_tm7 = r0_tm6 + tiles * packn;
+
+                    vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl);
+                    vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl);
+                    vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl);
+                    vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl);
+                    vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl);
+                    vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl);
+                    vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl);
+                    vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl);
+
+                    vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f,
+                                                          vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl);
+                    vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f,
+                                                          vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl);
+
+                    vfloat16m1_t _tmp12a =
+                        vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl);
+                    vfloat16m1_t _tmp12b =
+                        vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl);
+                    vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl);
+                    vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl);
+
+                    vfloat16m1_t _tmp34a = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl);
+                    vfloat16m1_t _tmp34b = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f,
+                        _tmp05, vl);
+                    vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl);
+                    vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl);
+
+                    vfloat16m1_t _tmp56a = vfmacc_vf_f16m1(
+                        _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl);
+                    vfloat16m1_t _tmp56b = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f,
+                        _tmp05, vl);
+                    vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl);
+                    vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl);
+
+                    vse16_v_f16m1(r0_tm0, _r0tm0, vl);
+                    vse16_v_f16m1(r0_tm7, _r0tm7, vl);
+                    vse16_v_f16m1(r0_tm1, _r0tm1, vl);
+                    vse16_v_f16m1(r0_tm2, _r0tm2, vl);
+                    vse16_v_f16m1(r0_tm3, _r0tm3, vl);
+                    vse16_v_f16m1(r0_tm4, _r0tm4, vl);
+                    vse16_v_f16m1(r0_tm5, _r0tm5, vl);
+                    vse16_v_f16m1(r0_tm6, _r0tm6, vl);
+
+                    r0_tm += tiles * packn * 8;  // skip the 8 coefficient planes just written
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Winograd output transform: turns each 8x8 tile of dot-product results into a
+ * 6x6 output tile and adds the per-channel bias.
+ *
+ * src   : read as [ch/packn][64][tiles][packn], tiles = blk_h * blk_w
+ * bias  : per-channel bias, packn values are loaded from bias + p; may be NULL,
+ *         in which case zero is added
+ * dst   : written as [ch/packn][6*blk_h][6*blk_w][packn]
+ * ch    : channel count; only whole groups of packn channels are processed
+ *         (loop condition p + packn - 1 < ch), a remainder is left untouched
+ * blk_h, blk_w : number of tiles vertically / horizontally
+ *
+ * The arithmetic uses the coefficients of the second AT matrix in the comment
+ * below (columns 5 and 6 of the first matrix multiplied by 32).
+ * NOTE(review): presumably the kernel transform compensates for that scaling -
+ * not visible in this function, confirm against the kernel transform.
+ */
+static inline void wg_b6f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias,
+                                                     __fp16 *dst, int ch, int blk_h, int blk_w)
+{
+    /* output transform matrix
+    AT = {
+        { 1  1  1   1    1    1      1    0 };
+        { 0  1  -1  2   -2   1/2   -1/2   0 };
+        { 0  1  1   4    4   1/4    1/4   0 };
+        { 0  1  -1  8   -8   1/8   -1/8   0 };
+        { 0  1  1   16  16   1/16  1/16   0 };
+        { 0  1  -1  32  -32  1/32  -1/32  1 }
+    };
+    AT = {
+        { 1  1  1   1    1   32    32   0 };
+        { 0  1  -1  2   -2   16   -16   0 };
+        { 0  1  1   4    4   8     8    0 };
+        { 0  1  -1  8   -8   4    -4    0 };
+        { 0  1  1   16  16   2     2    0 };
+        { 0  1  -1  32  -32  1    -1    1 }
+    };
+    */
+    // csrr_vlenb() is defined elsewhere; presumably it returns the vector register size in
+    // bytes, which makes packn the number of fp16 lanes per register - TODO confirm
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int vl = vsetvl_e16m1(packn);
+    int tiles = blk_h * blk_w;
+    for (int p = 0; p + packn - 1 < ch; p += packn) {
+        const __fp16 *out0_tm = src + p * 64 * tiles;    // before output transform / after dot: channel p
+        __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w;  // output after transform: channel p
+
+        // variable-length array on the stack: pass-1 results, stored transposed
+        __fp16 tmp[6][8][packn];
+
+        // a NULL bias is treated as an all-zero bias
+        vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl);
+
+        for (int i = 0; i < blk_h; i++) {
+            for (int j = 0; j < blk_w; j++) {
+                const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn;  // 8*8 start addr
+                const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1;
+                const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2;
+                const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3;
+                const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4;
+                const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5;
+                const __fp16 *output0_tm_6 = output0_tm_0 + tiles * packn * 6;
+                const __fp16 *output0_tm_7 = output0_tm_0 + tiles * packn * 7;
+
+                __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn;  // out 6*6 addr
+
+                // pass 1: coefficients m*8 .. m*8+7 of the tile -> 6 values, stored in tmp[k][m].
+                // vfmacc_vf_f16m1(acc, s, v, vl) yields acc + s * v per lane.
+                for (int m = 0; m < 8; m++) {
+                    vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl);
+                    vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl);
+                    vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl);
+                    vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl);
+                    vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl);
+                    vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl);
+                    vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl);
+                    vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl);
+
+                    // pairwise sums feed the even AT rows (0, 2, 4), differences the odd rows
+                    vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl);
+                    vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl);
+
+                    vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl);
+                    vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl);
+
+                    vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl);
+                    vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl);
+
+                    vfloat16m1_t _tmp0m =
+                        vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl),
+                                       vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl);
+                    vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl);
+                    vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl);
+
+                    vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl);
+                    vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl);
+                    vfloat16m1_t _tmp5m =
+                        vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl),
+                                       vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl);
+
+                    vse16_v_f16m1(tmp[0][m], _tmp0m, vl);
+                    vse16_v_f16m1(tmp[2][m], _tmp2m, vl);
+                    vse16_v_f16m1(tmp[4][m], _tmp4m, vl);
+                    vse16_v_f16m1(tmp[1][m], _tmp1m, vl);
+                    vse16_v_f16m1(tmp[3][m], _tmp3m, vl);
+                    vse16_v_f16m1(tmp[5][m], _tmp5m, vl);
+
+                    // advance every pointer by 8 coefficient planes
+                    output0_tm_0 += tiles * packn * 8;
+                    output0_tm_1 += tiles * packn * 8;
+                    output0_tm_2 += tiles * packn * 8;
+                    output0_tm_3 += tiles * packn * 8;
+                    output0_tm_4 += tiles * packn * 8;
+                    output0_tm_5 += tiles * packn * 8;
+                    output0_tm_6 += tiles * packn * 8;
+                    output0_tm_7 += tiles * packn * 8;
+                }
+
+                // pass 2: each row of tmp -> one row of 6 outputs, bias added before the store
+                for (int m = 0; m < 6; m++) {
+                    vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl);
+                    vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl);
+                    vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl);
+                    vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl);
+                    vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl);
+                    vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl);
+                    vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl);
+                    vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl);
+
+                    vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl);
+                    vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl);
+
+                    vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl);
+                    vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl);
+
+                    vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl);
+                    vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl);
+
+                    vfloat16m1_t _output00 =
+                        vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl),
+                                       vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl);
+                    vfloat16m1_t _output02 = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl);
+                    vfloat16m1_t _output04 = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl);
+
+                    vfloat16m1_t _output01 = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl);
+                    vfloat16m1_t _output03 = vfmacc_vf_f16m1(
+                        vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl);
+                    vfloat16m1_t _output05 =
+                        vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl),
+                                       vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl);
+
+                    _output00 = vfadd_vv_f16m1(_bias, _output00, vl);
+                    _output01 = vfadd_vv_f16m1(_bias, _output01, vl);
+                    _output02 = vfadd_vv_f16m1(_bias, _output02, vl);
+                    _output03 = vfadd_vv_f16m1(_bias, _output03, vl);
+                    _output04 = vfadd_vv_f16m1(_bias, _output04, vl);
+                    _output05 = vfadd_vv_f16m1(_bias, _output05, vl);
+
+                    vse16_v_f16m1(output0, _output00, vl);
+                    vse16_v_f16m1(output0 + packn * 2, _output02, vl);
+                    vse16_v_f16m1(output0 + packn * 4, _output04, vl);
+                    vse16_v_f16m1(output0 + packn * 1, _output01, vl);
+                    vse16_v_f16m1(output0 + packn * 3, _output03, vl);
+                    vse16_v_f16m1(output0 + packn * 5, _output05, vl);
+
+                    output0 += blk_w * 6 * packn;  // next output row
+                }
+            }
+        }
+    }
+}
+
+/******************************************************************************************
+ * kernel layout before:  [O, I, 3, 3]
+ * kernel layout after :  [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn]
+ * constrain: output channel % packn = 0
+ *            input channel % packn = 0
+ * packn = vlen / sizeof(__fp16)
+ ******************************************************************************************/
+void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel,
+                                                       struct csinn_tensor *dst_kernel)
+{
+    int32_t outch = src_kernel->dim[0];
+    int32_t inch = src_kernel->dim[1];
+
+    __fp16 *kernel_data = (__fp16 *)src_kernel->data;
+    // for kernel transform buf, 3x3 --> 6x6
+    __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16));
+
+    // kernel transform matrix: G
+    const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f},
+                              {-1.0f / 6, -1.0f / 6, -1.0f / 6}, +
{-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/pack2n, 6*6, I, pack2n] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + // [O/packn, 6*6, I, packn] + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 
= kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(__fp16) + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_packn_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform 
buffer1: [in_c/packn, 36, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile12_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(36 * out_c / 8 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_pack2nx12_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 64, I, pack2n] --> [O/packn, 64, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(__fp16) +
******************************************************************************************/ +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + // kernel transform matrix: G + const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const __fp16 ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + 
tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + __fp16 *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + __fp16 *output0_tm = output + p * area * tiles; // 16 channel dot output + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2, r-th channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, 
(%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flh fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v2\n\t" + "flh fa1, 18(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v2\n\t" + "flh fa2, 20(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v2\n\t" + "flh fa3, 22(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 24(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flh ft1, 26(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flh ft2, 28(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flh ft3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v22, (%[output_ptr])\n\t" + "addi 
%[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v26, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v28, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v30, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + 
"vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, fa0, v2\n\t" + "flh ft0, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, fa1, v2\n\t" + "flh ft1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v30, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr 
-= 16 + + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, fa0, v2\n\t" + "flh ft0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "ft0", "t0"); + } + } + } +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(__fp16) + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp16(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_packn_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 
*)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile12_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [64, out_c/packn, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(64 * out_c / 8 * tiles * 8 * sizeof(__fp16)); + // wg_bxf3s1_batch_gemm_pack2nx12_fp16 + wg_bxf3s1_batch_gemm_pack2nx12_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h6, out_w6, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +// #endif \ No newline at end of file diff --git a/source/c908_opt/convolution_3x3_fp32.c b/source/c908_opt/convolution_3x3_fp32.c new file mode 100644 index 00000000..e71d8dfd --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp32.c @@ -0,0 +1,1690 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + note: VLEN = 128 +*************************************************************/ +// TODO: move pad api to rvv pad operator +/****************************************************************************************** + * padding input for winograd input transform , and change memory layout + * input layout: [n c h w] + * input_padded layout: [n, c/4, h, w, 4] + * constrain: input channel % 4 = 0 + ******************************************************************************************/ +static void winograd_pad_input_pack1to4_fp32(const float *input, float *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, + int pad_top, int pad_left) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + int padded_hw = padded_h * padded_w; + const int in_size = inh * inw; // per-channel size + + float *pad_ptr = input_padded; + float *inp_ptr = (float *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); + + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + inp_ptr = (float *)input + c * in_size; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) 
{ + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(inp_ptr, in_size * sizeof(float), vl); + inp_ptr++; + vse32_v_f32m1(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +} + +/****************************************************************************************** + * cut winograd output transform for output, and change memory layout + * winograd output transform layout: [n, c/8, h, w, 8] + * output layout: [n, c, h, w] + * constrain: output channel % 8 = 0 + ******************************************************************************************/ +static void winograd_crop_output_pack8to1_fp32(const float *output_trans, float *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int pack2n = csrr_vlenb() / sizeof(float) * 2; + const int vl = vsetvl_e32m2(pack2n); + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + float *out_tm_ptr = (float *)output_trans; + float *out_ptr = output; + + int c = 0; + for (; c + pack2n - 1 < out_c; c += pack2n) { + out_tm_ptr = (float *)output_trans + c * crop_size; + out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + float *crop_ptr = out_tm_ptr + h * wino_w * vl; + for (int w = 0; w < out_w; w++) { + vfloat32m2_t _tmp = vle32_v_f32m2(crop_ptr, vl); + crop_ptr += vl; + vsse32_v_f32m2(out_ptr, out_size * sizeof(float), _tmp, vl); + out_ptr++; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_pack4_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 
-1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // after padding - q channel + float *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + float tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // pad_buf 6*6 block start addr + const float *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + + vfloat32m1_t _tmp0m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r04, _r03, vl), -4.f, + vfadd_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r03, vl), 4.f, + vfsub_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), -2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), 2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp5m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + float 
*r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _r0tm0 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat32m1_t _r0tm1 = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm2 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm3 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm4 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm5 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_pack8_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int pack2n = csrr_vlenb() / sizeof(float) * 2; + const int vl = vsetvl_e32m2(pack2n); + int tiles = blk_h * blk_w; + for (int p = 0; p + pack2n - 1 < ch; p += pack2n) { + const float *out0_tm 
= src + p * 36 * tiles; // data before the output transform (after dot), channel p + float *out0 = dst + p * 4 * blk_h * 4 * blk_w; // output after the transform, channel p + + float tmp[4][6][pack2n]; + + vfloat32m2_t _bias = bias ? vle32_v_f32m2(bias + p, vl) : vfmv_v_f_f32m2(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * pack2n; // start address of the 6*6 tile + const float *output0_tm_1 = output0_tm_0 + tiles * pack2n * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * pack2n * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * pack2n * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * pack2n * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * pack2n * 5; + + float *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * pack2n; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat32m2_t _r00 = vle32_v_f32m2(output0_tm_0, vl); + vfloat32m2_t _r01 = vle32_v_f32m2(output0_tm_1, vl); + vfloat32m2_t _r02 = vle32_v_f32m2(output0_tm_2, vl); + vfloat32m2_t _r03 = vle32_v_f32m2(output0_tm_3, vl); + vfloat32m2_t _r04 = vle32_v_f32m2(output0_tm_4, vl); + vfloat32m2_t _r05 = vle32_v_f32m2(output0_tm_5, vl); + + vfloat32m2_t _tmp02a = vfadd_vv_f32m2(_r01, _r02, vl); + vfloat32m2_t _tmp13a = vfsub_vv_f32m2(_r01, _r02, vl); + + vfloat32m2_t _tmp02b = vfadd_vv_f32m2(_r03, _r04, vl); + vfloat32m2_t _tmp13b = vfsub_vv_f32m2(_r03, _r04, vl); + + vfloat32m2_t _tmp0m = + vfadd_vv_f32m2(vfadd_vv_f32m2(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat32m2_t _tmp1m = vfmacc_vf_f32m2(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m2_t _tmp2m = vfmacc_vf_f32m2(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m2_t _tmp3m = + vfmacc_vf_f32m2(vfadd_vv_f32m2(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse32_v_f32m2(tmp[0][m], _tmp0m, vl); + vse32_v_f32m2(tmp[1][m], _tmp1m, vl); + vse32_v_f32m2(tmp[2][m], _tmp2m, vl); + vse32_v_f32m2(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * pack2n * 6; + output0_tm_1 += tiles * pack2n * 6; + output0_tm_2 += tiles * pack2n * 6; + 
output0_tm_3 += tiles * pack2n * 6; + output0_tm_4 += tiles * pack2n * 6; + output0_tm_5 += tiles * pack2n * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat32m2_t _tmp00 = vle32_v_f32m2(tmp[m][0], vl); + vfloat32m2_t _tmp01 = vle32_v_f32m2(tmp[m][1], vl); + vfloat32m2_t _tmp02 = vle32_v_f32m2(tmp[m][2], vl); + vfloat32m2_t _tmp03 = vle32_v_f32m2(tmp[m][3], vl); + vfloat32m2_t _tmp04 = vle32_v_f32m2(tmp[m][4], vl); + vfloat32m2_t _tmp05 = vle32_v_f32m2(tmp[m][5], vl); + + vfloat32m2_t _tmp02a = vfadd_vv_f32m2(_tmp01, _tmp02, vl); + vfloat32m2_t _tmp13a = vfsub_vv_f32m2(_tmp01, _tmp02, vl); + + vfloat32m2_t _tmp02b = vfadd_vv_f32m2(_tmp03, _tmp04, vl); + vfloat32m2_t _tmp13b = vfsub_vv_f32m2(_tmp03, _tmp04, vl); + + vfloat32m2_t _out00 = + vfadd_vv_f32m2(vfadd_vv_f32m2(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat32m2_t _out01 = vfmacc_vf_f32m2(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m2_t _out02 = vfmacc_vf_f32m2(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m2_t _out03 = + vfmacc_vf_f32m2(vfadd_vv_f32m2(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f32m2(_bias, _out00, vl); + _out01 = vfadd_vv_f32m2(_bias, _out01, vl); + _out02 = vfadd_vv_f32m2(_bias, _out02, vl); + _out03 = vfadd_vv_f32m2(_bias, _out03, vl); + + vse32_v_f32m2(output0, _out00, vl); + vse32_v_f32m2(output0 + pack2n * 1, _out01, vl); + vse32_v_f32m2(output0 + pack2n * 2, _out02, vl); + vse32_v_f32m2(output0 + pack2n * 3, _out03, vl); + + output0 += blk_w * 4 * pack2n; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_fp32(const float *src, float *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + for (int r = 0; r < area; r++) { + float *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + 
vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + + vsseg8e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + + vsseg4e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + + vsseg2e32_v_f32m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + + vse32_v_f32m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m8n8_fp32(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 7 < out_ch; p += 8) { + float *output0_tm = output + p * area * tiles; // 8 channel dot output + const float *kernel0_tm = 
kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v16, fa0, v2\n\t" + "flw ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v2\n\t" + "flw ft1, 20(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v2\n\t" + "flw ft2, 24(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v2\n\t" + "flw ft3, 28(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flw fa0, 32(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v2\n\t" + "flw fa1, 36(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v2\n\t" + "flw fa2, 40(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v2\n\t" + "flw fa3, 44(%[input_ptr])\n\t" + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flw ft1, 52(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flw ft2, 56(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flw ft3, 60(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" // input_ptr += 16 + "vfmacc.vf v24, ft0, v4\n\t" + "flw fa0, 
0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 8 + + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi 
%[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v24, fa0, v2\n\t" + "flw ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v26, fa1, v2\n\t" + "flw ft1, 20(%[input_ptr])\n\t" + "vfmacc.vf v28, fa2, v2\n\t" + "flw ft2, 24(%[input_ptr])\n\t" + "vfmacc.vf v30, fa3, v2\n\t" + "flw ft3, 28(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 8 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v24, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 8 + + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, 
(%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v28, fa0, v2\n\t" + "flw ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, fa1, v2\n\t" + "flw ft1, 12(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 4 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v28, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v30, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 8 + + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v30, fa0, v2\n\t" + "flw ft0, 4(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 2 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v30, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 8 + + "vse32.v v30, 
(%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_fp32_v256(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + float *output0_tm = output + p * area * tiles; // 16 channel dot output + const float *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v2\n\t" + "flw ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v2\n\t" + "flw ft1, 20(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v2\n\t" + "flw ft2, 24(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v2\n\t" + "flw ft3, 28(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flw fa0, 32(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v2\n\t" + "flw fa1, 36(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, 
v2\n\t" + "flw fa2, 40(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v2\n\t" + "flw fa3, 44(%[input_ptr])\n\t" + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flw ft1, 52(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flw ft2, 56(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flw ft3, 60(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" // input_ptr += 16 + "vfmacc.vf v24, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 16 + + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * 16; + + asm 
volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, fa0, v2\n\t" + "flw ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v26, fa1, v2\n\t" + "flw ft1, 20(%[input_ptr])\n\t" + "vfmacc.vf v28, fa2, v2\n\t" + "flw ft2, 24(%[input_ptr])\n\t" + "vfmacc.vf v30, fa3, v2\n\t" + "flw ft3, 28(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 8 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 16 + + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", 
"fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, fa0, v2\n\t" + "flw ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, fa1, v2\n\t" + "flw ft1, 12(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 4 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v30, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 16 + + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flw fa0, 
0(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, fa0, v2\n\t" + "flw ft0, 4(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 2 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 16 + + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_b6f3s1_trans_input_pack4_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // feature map after padding - q channel + float *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + float tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *r0 = + img0 + (i * w * 6 + j * 6) * packn; // feature map after padding 8*8 start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; // input_tm1 8*8 block start addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = 
vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); + + vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f32m1(_r04, _r02, vl), vl); + vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f32m1(_r03, _r05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat32m1_t _tmp3m = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _tmp4m = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = + vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[7][m], _tmp7m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vse32_v_f32m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int 
m = 0; m < 8; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + float *r0_tm6 = r0_tm5 + tiles * packn; + float *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); + vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( + _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat32m1_t _r0tm5 = 
vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm7, _r0tm7, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + vse32_v_f32m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_pack8_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int pack2n = csrr_vlenb() / sizeof(float) * 2; + const int vl = vsetvl_e32m2(pack2n); + int tiles = blk_h * blk_w; + for (int p = 0; p + pack2n - 1 < ch; p += pack2n) { + const float *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + float tmp[6][8][pack2n]; + + vfloat32m2_t _bias = bias ? 
vle32_v_f32m2(bias + p, vl) : vfmv_v_f_f32m2(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * pack2n; // 8*8 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * pack2n * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * pack2n * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * pack2n * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * pack2n * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * pack2n * 5; + const float *output0_tm_6 = output0_tm_0 + tiles * pack2n * 6; + const float *output0_tm_7 = output0_tm_0 + tiles * pack2n * 7; + + float *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * pack2n; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat32m2_t _r00 = vle32_v_f32m2(output0_tm_0, vl); + vfloat32m2_t _r01 = vle32_v_f32m2(output0_tm_1, vl); + vfloat32m2_t _r02 = vle32_v_f32m2(output0_tm_2, vl); + vfloat32m2_t _r03 = vle32_v_f32m2(output0_tm_3, vl); + vfloat32m2_t _r04 = vle32_v_f32m2(output0_tm_4, vl); + vfloat32m2_t _r05 = vle32_v_f32m2(output0_tm_5, vl); + vfloat32m2_t _r06 = vle32_v_f32m2(output0_tm_6, vl); + vfloat32m2_t _r07 = vle32_v_f32m2(output0_tm_7, vl); + + vfloat32m2_t _tmp024a = vfadd_vv_f32m2(_r01, _r02, vl); + vfloat32m2_t _tmp135a = vfsub_vv_f32m2(_r01, _r02, vl); + + vfloat32m2_t _tmp024b = vfadd_vv_f32m2(_r03, _r04, vl); + vfloat32m2_t _tmp135b = vfsub_vv_f32m2(_r03, _r04, vl); + + vfloat32m2_t _tmp024c = vfadd_vv_f32m2(_r05, _r06, vl); + vfloat32m2_t _tmp135c = vfsub_vv_f32m2(_r05, _r06, vl); + + vfloat32m2_t _tmp0m = + vfadd_vv_f32m2(vfadd_vv_f32m2(_r00, _tmp024a, vl), + vfmacc_vf_f32m2(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m2_t _tmp2m = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m2_t _tmp4m = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m2_t _tmp1m = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp135a, 
2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m2_t _tmp3m = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m2_t _tmp5m = + vfadd_vv_f32m2(vfadd_vv_f32m2(_r07, _tmp135a, vl), + vfmacc_vf_f32m2(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse32_v_f32m2(tmp[0][m], _tmp0m, vl); + vse32_v_f32m2(tmp[2][m], _tmp2m, vl); + vse32_v_f32m2(tmp[4][m], _tmp4m, vl); + vse32_v_f32m2(tmp[1][m], _tmp1m, vl); + vse32_v_f32m2(tmp[3][m], _tmp3m, vl); + vse32_v_f32m2(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * pack2n * 8; + output0_tm_1 += tiles * pack2n * 8; + output0_tm_2 += tiles * pack2n * 8; + output0_tm_3 += tiles * pack2n * 8; + output0_tm_4 += tiles * pack2n * 8; + output0_tm_5 += tiles * pack2n * 8; + output0_tm_6 += tiles * pack2n * 8; + output0_tm_7 += tiles * pack2n * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat32m2_t _tmp00 = vle32_v_f32m2(tmp[m][0], vl); + vfloat32m2_t _tmp01 = vle32_v_f32m2(tmp[m][1], vl); + vfloat32m2_t _tmp02 = vle32_v_f32m2(tmp[m][2], vl); + vfloat32m2_t _tmp03 = vle32_v_f32m2(tmp[m][3], vl); + vfloat32m2_t _tmp04 = vle32_v_f32m2(tmp[m][4], vl); + vfloat32m2_t _tmp05 = vle32_v_f32m2(tmp[m][5], vl); + vfloat32m2_t _tmp06 = vle32_v_f32m2(tmp[m][6], vl); + vfloat32m2_t _tmp07 = vle32_v_f32m2(tmp[m][7], vl); + + vfloat32m2_t _tmp024a = vfadd_vv_f32m2(_tmp01, _tmp02, vl); + vfloat32m2_t _tmp135a = vfsub_vv_f32m2(_tmp01, _tmp02, vl); + + vfloat32m2_t _tmp024b = vfadd_vv_f32m2(_tmp03, _tmp04, vl); + vfloat32m2_t _tmp135b = vfsub_vv_f32m2(_tmp03, _tmp04, vl); + + vfloat32m2_t _tmp024c = vfadd_vv_f32m2(_tmp05, _tmp06, vl); + vfloat32m2_t _tmp135c = vfsub_vv_f32m2(_tmp05, _tmp06, vl); + + vfloat32m2_t _output00 = + vfadd_vv_f32m2(vfadd_vv_f32m2(_tmp00, _tmp024a, vl), + vfmacc_vf_f32m2(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m2_t _output02 = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m2_t _output04 = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp024a, 
16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m2_t _output01 = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m2_t _output03 = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m2_t _output05 = + vfadd_vv_f32m2(vfadd_vv_f32m2(_tmp07, _tmp135a, vl), + vfmacc_vf_f32m2(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f32m2(_bias, _output00, vl); + _output01 = vfadd_vv_f32m2(_bias, _output01, vl); + _output02 = vfadd_vv_f32m2(_bias, _output02, vl); + _output03 = vfadd_vv_f32m2(_bias, _output03, vl); + _output04 = vfadd_vv_f32m2(_bias, _output04, vl); + _output05 = vfadd_vv_f32m2(_bias, _output05, vl); + + vse32_v_f32m2(output0, _output00, vl); + vse32_v_f32m2(output0 + pack2n * 2, _output02, vl); + vse32_v_f32m2(output0 + pack2n * 4, _output04, vl); + vse32_v_f32m2(output0 + pack2n * 1, _output01, vl); + vse32_v_f32m2(output0 + pack2n * 3, _output03, vl); + vse32_v_f32m2(output0 + pack2n * 5, _output05, vl); + + output0 += blk_w * 6 * pack2n; + } + } + } + } +} + +/****************************************************************************************** + * Winograd b4f3 kernel transform: U = G * g * GT for every (out, in) 3x3 kernel, then repack. + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n], pack2n = csrr_vlenb() / sizeof(float) * 2 + * constrain: output channel % pack2n = 0 + * input channel % 4 = 0 (every input channel is repacked, not only multiples of pack2n) + * src_kernel: original kernel tensor, read only through dim[0], dim[1] and data + * dst_kernel: receives a copy of src_kernel's tensor info; its data pointer is then replaced + * by a newly allocated buffer holding the transformed, repacked kernel + ******************************************************************************************/ +void shl_c908_wg_b4f3s1_trans_kernel_pack8_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // temporary buffer for the transformed kernel, 3x3 --> 6x6, freed before returning + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + + // kernel transform matrix: G + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24,
-1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // the three rows of the 3x3 kernel g + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transposed matrix tmp = (g * GT)T + float tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U : each row of tmp multiplied by GT gives one row of the 6x6 result + for (int j = 0; j < 6; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/8, 6*6, I, 8] + float *kernel_tm_packn = (float *)shl_mem_alloc(outch / 8 * 36 * inch * 8 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; +
+ const int pack2n = csrr_vlenb() / sizeof(float) * 2; + + // Walk every input channel directly: stepping in groups of pack2n input channels would + // leave the trailing inch % pack2n channels out of the packed kernel. + for (int oc = 0; oc < outch / pack2n; oc++) { + float *g0 = kernel_tm_packn + oc * 36 * inch * pack2n; + + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * pack2n; + + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + const float *k00 = kernel_tm + (oc * pack2n + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 8 = 0 + * input channel % 4 = 0 + ******************************************************************************************/ +int shl_c908_wg_b4f3s1_pack8_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for
alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/4 h w 4] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_pack1to4_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/4, 36, tiles, 4] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 36 * tiles * 4 * sizeof(float)); + wg_b4f3s1_trans_input_pack4_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] (per the tile8 reorder helper; TODO confirm) + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile8_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 36, tiles, 8]; NOTE(review): no gemm kernel runs when vlen is neither 128 nor 256 + const int vlen = csrr_vlenb() * 8; + float *output_dot_buf = (float *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(float)); + if (vlen == 128) { + wg_bxf3s1_batch_gemm_m8n8_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m16n8_fp32_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 36); + } + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/8, out_h4, out_w4, 
8] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(float)); + wg_b4f3s1_trans_output_pack8_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack8to1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/8, 64, I, 8] + * constrain: output channel % 8 = 0 + * input channel % 4 = 0 + ******************************************************************************************/ +void shl_c908_wg_b6f3s1_trans_kernel_pack8_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + // kernel transform matrix: G + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const float ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < 
inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + float *kernel_tm_packn = (float *)shl_mem_alloc(64 * outch / 8 * inch * 8 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + // for (int oc = 0; oc + 7 < outch; oc += 8) { + // const float *k0 = kernel_tm + (oc + 0) * inch * 64; + // const float *k1 = kernel_tm + (oc + 1) * inch * 64; + // const float *k2 = kernel_tm + (oc + 2) * inch * 64; + // const float *k3 = kernel_tm + (oc + 3) * inch * 64; + // const float *k4 = kernel_tm + (oc + 4) * inch * 64; + // const float *k5 = kernel_tm + (oc + 5) * inch * 64; + // const float *k6 = kernel_tm + (oc + 6) * inch * 64; + // const float *k7 = kernel_tm + (oc + 7) * inch * 64; + + // float *g0 = kernel_tm_packn + oc * inch * 64; + + // for (int t = 0; t < 64; t++) { + // float *g00 = g0 + t * inch * 8; + + // for (int ic = 0; ic < inch; ic++) { + // const float *k00 = k0 + ic * 64; + // const float *k10 = k1 + ic * 64; + // const float *k20 = k2 + ic * 64; + // const float *k30 = k3 + ic * 64; + // const float *k40 = k4 + ic * 64; + // const float *k50 = k5 + ic * 64; + // const float *k60 = k6 + ic * 64; + // const float *k70 = k7 + ic * 64; + + // g00[0] = k00[t]; + // g00[1] = k10[t]; + 
// g00[2] = k20[t]; + // g00[3] = k30[t]; + // g00[4] = k40[t]; + // g00[5] = k50[t]; + // g00[6] = k60[t]; + // g00[7] = k70[t]; + // g00 += 8; + // } + // } + // } + + const int pack2n = csrr_vlenb() / sizeof(float) * 2; + + for (int oc = 0; oc < outch / pack2n; oc++) { + float *g0 = kernel_tm_packn + oc * 64 * inch * pack2n; + + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * pack2n; + + for (int ic = 0; ic < inch / pack2n; ic++) { + for (int i = 0; i < pack2n; i++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = + kernel_tm + (oc * pack2n + j) * 64 * inch + (ic * pack2n + i) * 64; + *g00++ = k00[k]; + } + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 8 = 0 + * input channel % 4 = 0 + ******************************************************************************************/ +int shl_c908_wg_b6f3s1_pack8_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // 
element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/4 h w 4] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_pack1to4_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/4, 64, tiles, 4] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 64 * tiles * 4 * sizeof(float)); + wg_b6f3s1_trans_input_pack4_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile8_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 64, tiles, 8]; NOTE(review): no gemm kernel runs when vlen is neither 128 nor 256 + const int vlen = csrr_vlenb() * 8; + float *output_dot_buf = (float *)shl_mem_alloc(out_c / 8 * 64 * tiles * 8 * sizeof(float)); + if (vlen == 128) { + wg_bxf3s1_batch_gemm_m8n8_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m16n8_fp32_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 64); + } + + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/8, out_h6, out_w6, 8] (6x6 outputs per tile, see allocation below) + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(float)); + wg_b6f3s1_trans_output_pack8_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, 
block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack8to1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +void shl_c908_conv3x3s1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + /* todo: direct conv2d */ +} + +void shl_c908_conv3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + /* todo: direct conv2d */ +} diff --git a/source/c908_opt/convolution_3x3_fp32_packn.c b/source/c908_opt/convolution_3x3_fp32_packn.c new file mode 100644 index 00000000..8436f853 --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp32_packn.c @@ -0,0 +1,1048 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ +#ifdef NNN +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ + +/************************************************************* + * padding input for winograd input transform, and change memory layout to [n c/packn h w packn] + * input layout: [n c h w] + * input_padded layout: [n c/packn h w packn] + * constraint: input channel % packn = 0 + *************************************************************/ +static void winograd_pad_input_packn_fp32(const float *input, float *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left) +{ + shl_rvv_pad_input_packn_fp32(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left); +} + +static void winograd_crop_output_packn_fp32(const float *output_trans, float *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + const int out_size = out_h * out_w; // per-channel size of the cropped output; crop_size below is the uncropped per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + float *out_tm_ptr = (float *)output_trans + c * crop_size; + float *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + float *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _tmp = vle32_v_f32m1(crop_ptr, vl); + crop_ptr += packn; + vse32_v_f32m1(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl 
= vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // after padding - q channel + float *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + float tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // pad_buf 6*6 block start addr + const float *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + + vfloat32m1_t _tmp0m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r04, _r03, vl), -4.f, + vfadd_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r03, vl), 4.f, + vfsub_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), -2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), 2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp5m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles 
* packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _r0tm0 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat32m1_t _r0tm1 = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm2 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm3 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm4 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm5 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * tiles; // before output transform / after dot: channel p + float *out0 = dst + p * 4 * blk_h * 4 * blk_w; // transformed output: channel p + + float tmp[4][6][packn]; +
+ vfloat32m1_t _bias = bias ? vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 block start addr + const float *output0_tm_1 = output0_tm_0 + tiles * ch * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * ch * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * ch * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * ch * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * ch * 5; + + float *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _tmp3m = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * ch * 6; + output0_tm_1 += tiles * ch * 6; + output0_tm_2 += tiles * ch * 6; + output0_tm_3 += tiles * ch * 6; + output0_tm_4 += tiles * ch * 6; + output0_tm_5 += tiles * ch * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], 
vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _out00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _out01 = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _out02 = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _out03 = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f32m1(_bias, _out00, vl); + _out01 = vfadd_vv_f32m1(_bias, _out01, vl); + _out02 = vfadd_vv_f32m1(_bias, _out02, vl); + _out03 = vfadd_vv_f32m1(_bias, _out03, vl); + + vse32_v_f32m1(output0, _out00, vl); + vse32_v_f32m1(output0 + packn * 1, _out01, vl); + vse32_v_f32m1(output0 + packn * 2, _out02, vl); + vse32_v_f32m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_fp32(const float *src, float *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + for (int r = 0; r < area; r++) { + float *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _a0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _a1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _a2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _a3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _a4 = 
vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _a5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _a6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _a7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vfloat32m1_t _a8 = vle32_v_f32m1(tm1 + packn * 8, vl); + vfloat32m1_t _a9 = vle32_v_f32m1(tm1 + packn * 9, vl); + vfloat32m1_t _a10 = vle32_v_f32m1(tm1 + packn * 10, vl); + vfloat32m1_t _a11 = vle32_v_f32m1(tm1 + packn * 11, vl); + + vsse32_v_f32m1(img_tm2, 12 * sizeof(float), _a0, vl); + vsse32_v_f32m1(img_tm2 + 1, 12 * sizeof(float), _a1, vl); + vsse32_v_f32m1(img_tm2 + 2, 12 * sizeof(float), _a2, vl); + vsse32_v_f32m1(img_tm2 + 3, 12 * sizeof(float), _a3, vl); + vsse32_v_f32m1(img_tm2 + 4, 12 * sizeof(float), _a4, vl); + vsse32_v_f32m1(img_tm2 + 5, 12 * sizeof(float), _a5, vl); + vsse32_v_f32m1(img_tm2 + 6, 12 * sizeof(float), _a6, vl); + vsse32_v_f32m1(img_tm2 + 7, 12 * sizeof(float), _a7, vl); + vsse32_v_f32m1(img_tm2 + 8, 12 * sizeof(float), _a8, vl); + vsse32_v_f32m1(img_tm2 + 9, 12 * sizeof(float), _a9, vl); + vsse32_v_f32m1(img_tm2 + 10, 12 * sizeof(float), _a10, vl); + vsse32_v_f32m1(img_tm2 + 11, 12 * sizeof(float), _a11, vl); + + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + + vsseg8e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < 
tiles; t += 4) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + + vsseg4e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + + vsseg2e32_v_f32m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + + vse32_v_f32m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_pack2nx12_fp32(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int r = 0; r < area; r++) { + const float *kernel_ptr = kernel + r * out_ch * in_ch; + const float *input_ptr = input + r * tiles * in_ch; + float *output_ptr = output + r * tiles * out_ch; + + shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_ptr, kernel_ptr, input_ptr, NULL, out_ch, in_ch, + tiles, false); + } +} + +static inline void wg_b6f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 
0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // feature map after padding - q channel + float *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + float tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *r0 = + img0 + (i * w * 6 + j * 6) * packn; // feature map after padding 8*8 start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; // input_tm1 8*8 block start addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); + + vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f32m1(_r04, _r02, vl), vl); + vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f32m1(_r03, _r05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat32m1_t _tmp3m = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _tmp4m = 
vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = + vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[7][m], _tmp7m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vse32_v_f32m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + float *r0_tm6 = r0_tm5 + tiles * packn; + float *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); + vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat32m1_t 
_r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( + _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat32m1_t _r0tm5 = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm7, _r0tm7, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + vse32_v_f32m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 6 * blk_h * 6 
* blk_w; // 转换后输出 第p个channel + + float tmp[6][8][packn]; + + vfloat32m1_t _bias = bias ? vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * ch * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * ch * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * ch * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * ch * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * ch * 5; + const float *output0_tm_6 = output0_tm_0 + tiles * ch * 6; + const float *output0_tm_7 = output0_tm_0 + tiles * ch * 7; + + float *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(output0_tm_6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(output0_tm_7, vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_r05, _r06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_r05, _r06, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + 
vfloat32m1_t _tmp1m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _tmp5m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * ch * 8; + output0_tm_1 += tiles * ch * 8; + output0_tm_2 += tiles * ch * 8; + output0_tm_3 += tiles * ch * 8; + output0_tm_4 += tiles * ch * 8; + output0_tm_5 += tiles * ch * 8; + output0_tm_6 += tiles * ch * 8; + output0_tm_7 += tiles * ch * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); + + vfloat32m1_t _output00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _output02 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _output04 = 
vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _output01 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _output03 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _output05 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f32m1(_bias, _output00, vl); + _output01 = vfadd_vv_f32m1(_bias, _output01, vl); + _output02 = vfadd_vv_f32m1(_bias, _output02, vl); + _output03 = vfadd_vv_f32m1(_bias, _output03, vl); + _output04 = vfadd_vv_f32m1(_bias, _output04, vl); + _output05 = vfadd_vv_f32m1(_bias, _output05, vl); + + vse32_v_f32m1(output0, _output00, vl); + vse32_v_f32m1(output0 + packn * 2, _output02, vl); + vse32_v_f32m1(output0 + packn * 4, _output04, vl); + vse32_v_f32m1(output0 + packn * 1, _output01, vl); + vse32_v_f32m1(output0 + packn * 3, _output03, vl); + vse32_v_f32m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [36, O/pack2n, I, pack2n] --> [36, O/packn, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + + // kernel transform matrix: G + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f 
/ 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd42 + // [O, I, 6, 6] --> [6*6, O/pack2n, I, pack2n] / [6*6, O/packn, I, packn] + float *kernel_tm_packn = (float *)shl_mem_alloc(36 * outch / 4 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + for (int k = 0; k < 36; k++) { + float *g0 = kernel_tm_packn + k * outch * inch; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + for (; oc + packn - 1 < outch; oc += packn) { + float *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + 
+/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_packn_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 36 * tiles * 4 * sizeof(float)); + 
wg_b4f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile12_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(36 * out_c / 4 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_pack2nx12_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 4 * 4 * 4 * sizeof(float)); + wg_b4f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [64, O/pack2n, I, pack2n] --> [64, O/pack, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct 
csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + // kernel transform matrix: G + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const float ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd64 + // [O, I, 8, 8] --> [8*8, O/pack2n, I, pack2n] / [8*8, O/packn, I, packn] + float *kernel_tm_packn = (float 
*)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + for (int k = 0; k < 64; k++) { + float *g0 = kernel_tm_packn + k * outch * inch; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + for (; oc + packn - 1 < outch; oc += packn) { + float *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus 
input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_packn_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 64 * tiles * 4 * sizeof(float)); + wg_b6f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile12_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [64, out_c/packn, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(64 * out_c / 4 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_pack2nx12_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 6 * 6 * 4 * sizeof(float)); + wg_b6f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + 
shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +#endif diff --git a/source/c908_opt/convolution_3x3_fp32_packn_1.c b/source/c908_opt/convolution_3x3_fp32_packn_1.c new file mode 100644 index 00000000..bf5a7ff8 --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp32_packn_1.c @@ -0,0 +1,2029 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ + +/****************************************************************************************** + * padding input for winograd input transform + * input layout: [n c/packn h w packn] + * input_padded layout: [n c/packn h w packn] + * constrain: input channel % packn = 0 + * packn = vlen / sizeof(float) + ******************************************************************************************/ +static void winograd_pad_input_packn_fp32(const float *input, float *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left) +{ + shl_rvv_pad_input_packn_fp32(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left); +} + +static void winograd_crop_output_packn_fp32(const float *output_trans, float *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + float *out_tm_ptr = (float *)output_trans + c * crop_size; + float *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + float *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _tmp = vle32_v_f32m1(crop_ptr, vl); + crop_ptr += packn; + vse32_v_f32m1(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int 
packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // after padding - q channel + float *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + float tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // pad_buf 6*6 block start addr + const float *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + + vfloat32m1_t _tmp0m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r04, _r03, vl), -4.f, + vfadd_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r03, vl), 4.f, + vfsub_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), -2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), 2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp5m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = 
r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _r0tm0 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat32m1_t _r0tm1 = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm2 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm3 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm4 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm5 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 4 * blk_h 
* 4 * blk_w; // 转换后输出 第p个channel + + float tmp[4][6][packn]; + + vfloat32m1_t _bias = bias ? vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + float *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _tmp3m = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * 
packn * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _out00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _out01 = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _out02 = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _out03 = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f32m1(_bias, _out00, vl); + _out01 = vfadd_vv_f32m1(_bias, _out01, vl); + _out02 = vfadd_vv_f32m1(_bias, _out02, vl); + _out03 = vfadd_vv_f32m1(_bias, _out03, vl); + + vse32_v_f32m1(output0, _out00, vl); + vse32_v_f32m1(output0 + packn * 1, _out01, vl); + vse32_v_f32m1(output0 + packn * 2, _out02, vl); + vse32_v_f32m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_fp32(const float *src, float *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + for (int r = 0; r < area; r++) { + float *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _a0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _a1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _a2 = vle32_v_f32m1(tm1 + packn * 2, 
vl); + vfloat32m1_t _a3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _a4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _a5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _a6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _a7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vfloat32m1_t _a8 = vle32_v_f32m1(tm1 + packn * 8, vl); + vfloat32m1_t _a9 = vle32_v_f32m1(tm1 + packn * 9, vl); + vfloat32m1_t _a10 = vle32_v_f32m1(tm1 + packn * 10, vl); + vfloat32m1_t _a11 = vle32_v_f32m1(tm1 + packn * 11, vl); + + vsse32_v_f32m1(img_tm2, 12 * sizeof(float), _a0, vl); + vsse32_v_f32m1(img_tm2 + 1, 12 * sizeof(float), _a1, vl); + vsse32_v_f32m1(img_tm2 + 2, 12 * sizeof(float), _a2, vl); + vsse32_v_f32m1(img_tm2 + 3, 12 * sizeof(float), _a3, vl); + vsse32_v_f32m1(img_tm2 + 4, 12 * sizeof(float), _a4, vl); + vsse32_v_f32m1(img_tm2 + 5, 12 * sizeof(float), _a5, vl); + vsse32_v_f32m1(img_tm2 + 6, 12 * sizeof(float), _a6, vl); + vsse32_v_f32m1(img_tm2 + 7, 12 * sizeof(float), _a7, vl); + vsse32_v_f32m1(img_tm2 + 8, 12 * sizeof(float), _a8, vl); + vsse32_v_f32m1(img_tm2 + 9, 12 * sizeof(float), _a9, vl); + vsse32_v_f32m1(img_tm2 + 10, 12 * sizeof(float), _a10, vl); + vsse32_v_f32m1(img_tm2 + 11, 12 * sizeof(float), _a11, vl); + + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + + vsseg8e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); 
+ tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + + vsseg4e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + + vsseg2e32_v_f32m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + + vse32_v_f32m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + /* wg_bxf3s1_batch_gemm_pack2nx12_fp32: batched fp32 GEMM written in RVV inline asm. For each of the `area` positions r it accumulates kernel times input over in_ch and stores the result to output. packn = vlenb / sizeof(float). Output channels are processed in groups of pack2n (two accumulator vectors per tile; the second half is stored packn * area * tiles floats after the first), then in groups of packn; a remainder of out_ch smaller than packn is not processed here. Tiles are processed in groups of 12, 8, 4, 2 and 1. input is read sequentially starting at input + r * tiles * in_ch (inside a tile group: one float per tile per input channel). kernel is read sequentially starting at kernel + p * area * in_ch + r * in_ch * group_width (one packn vector per input channel per accumulator half). output is written sequentially, packn floats per tile. Every asm loop consumes two input channels per iteration, runs in_ch >> 1 times and tests its counter only at the bottom. NOTE(review): this looks like it requires in_ch to be even and >= 2 - confirm against callers. NOTE(review): the loops are software-pipelined, so the last iteration still loads the next kernel vectors and input scalars beyond the data it consumed; the buffers presumably tolerate this over-read - confirm. %[step] is packn * 4 bytes and serves both as pointer stride and as the vsetvli AVL (presumably relying on vl being capped at VLMAX - confirm). The local vl is not referenced by the asm blocks. */ +static inline void wg_bxf3s1_batch_gemm_pack2nx12_fp32(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + const int vl = vsetvl_e32m1(packn); + int p = 0; + for (; p + pack2n - 1 < out_ch; p += pack2n) { + float *output0_tm = output + p * area * tiles; // dot output for pack2n output channels + float *output1_tm = output0_tm + packn * area * tiles; + + const float *kernel0_tm = kernel + p * area * in_ch; // kernel for pack2n output channels + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // r-th channel of img_tm2 + int t = 0; + for (; t + 11 < tiles; t += 12) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, 
m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + "flw ft4, 16(%[input_ptr])\n\t" + "flw ft5, 20(%[input_ptr])\n\t" + + "1:\n\t" // pack2n x 12 tiles, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw fa0, 24(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flw fa1, 28(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flw fa2, 32(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flw fa3, 36(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v3\n\t" + "vfmacc.vf v24, ft4, v4\n\t" + "flw fa4, 40(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v3\n\t" + "vfmacc.vf v25, ft5, v4\n\t" + "flw fa5, 44(%[input_ptr])\n\t" + "vfmacc.vf v14, fa0, 
v3\n\t" + "vfmacc.vf v26, fa0, v4\n\t" + "flw ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v3\n\t" + "vfmacc.vf v27, fa1, v4\n\t" + "flw ft1, 52(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v3\n\t" + "vfmacc.vf v28, fa2, v4\n\t" + "flw ft2, 56(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v3\n\t" + "vfmacc.vf v29, fa3, v4\n\t" + "flw ft3, 60(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v3\n\t" + "vfmacc.vf v30, fa4, v4\n\t" + "flw ft4, 64(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v3\n\t" + "vfmacc.vf v31, fa5, v4\n\t" + "flw ft5, 68(%[input_ptr])\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "vfmacc.vf v20, ft0, v6\n\t" + "flw fa0, 72(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "vfmacc.vf v21, ft1, v6\n\t" + "flw fa1, 76(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "vfmacc.vf v22, ft2, v6\n\t" + "flw fa2, 80(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "vfmacc.vf v23, ft3, v6\n\t" + "flw fa3, 84(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v5\n\t" + "vfmacc.vf v24, ft4, v6\n\t" + "flw fa4, 88(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v5\n\t" + "vfmacc.vf v25, ft5, v6\n\t" + "flw fa5, 92(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 96\n\t" + "vfmacc.vf v14, fa0, v5\n\t" + "vfmacc.vf v26, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v5\n\t" + "vfmacc.vf v27, fa1, v6\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v5\n\t" + "vfmacc.vf v28, fa2, v6\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v5\n\t" + "vfmacc.vf v29, fa3, v6\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v5\n\t" + "vfmacc.vf v30, fa4, v6\n\t" + "flw ft4, 16(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v5\n\t" + "vfmacc.vf v31, fa5, v6\n\t" + "flw ft5, 20(%[input_ptr])\n\t" + + "addi t0, t0, 
-1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v16, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v17, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v18, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v19, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v24, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v25, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v26, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v27, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v28, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v29, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], 
%[step]\n\t" + "vse32.v v30, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v31, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "fa0", "fa1", "fa2", + "fa3", "fa4", "fa5", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "1:\n\t" // pack2n x 8 tiles, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw 
fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flw fa1, 20(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flw fa2, 24(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flw fa3, 28(%[input_ptr])\n\t" + "vfmacc.vf v12, fa0, v3\n\t" + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 32(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v3\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 36(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v3\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 40(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v3\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 44(%[input_ptr])\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "vfmacc.vf v20, ft0, v6\n\t" + "flw fa0, 48(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "vfmacc.vf v21, ft1, v6\n\t" + "flw fa1, 52(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "vfmacc.vf v22, ft2, v6\n\t" + "flw fa2, 56(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "vfmacc.vf v23, ft3, v6\n\t" + "flw fa3, 60(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + "vfmacc.vf v12, fa0, v5\n\t" + "vfmacc.vf v24, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v5\n\t" + "vfmacc.vf v25, fa1, v6\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v5\n\t" + "vfmacc.vf v26, fa2, v6\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "vfmacc.vf v27, fa3, v6\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, 
(%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v24, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v25, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v26, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v27, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, 
zero\n\t" + "vmv.v.x v23, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "1:\n\t" // pack2n x 4 tiles, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flw fa1, 20(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flw fa2, 24(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flw fa3, 28(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "vfmacc.vf v21, fa1, v6\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v10, fa2, v5\n\t" + "vfmacc.vf v22, fa2, v6\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v11, fa3, v5\n\t" + "vfmacc.vf v23, fa3, v6\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add 
%[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v20", + "v21", "v22", "v23", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + + "1:\n\t" // pack2n x 2 tiles, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flw fa1, 12(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add 
%[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "vfmacc.vf v21, fa1, v6\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v20", "v21", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v20, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + + "1:\n\t" // pack2n x 1 tile, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw fa0, 4(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" + + "vle32.v 
v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v20", "fa0", "ft0", "t0"); + } + } + } + + for (; p + packn - 1 < out_ch; p += packn) { + float *output0_tm = output + p * area * tiles; // dot output for packn output channels + const float *kernel0_tm = kernel + p * area * in_ch; // kernel for packn output channels + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // r-th channel of img_tm2 + int t = 0; + for (; t + 11 < tiles; t += 12) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + "flw ft4, 16(%[input_ptr])\n\t" + "flw ft5, 20(%[input_ptr])\n\t" + + "1:\n\t" 
// packn x 12 tiles, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 24(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flw fa1, 28(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flw fa2, 32(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flw fa3, 36(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v3\n\t" + "flw fa4, 40(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v3\n\t" + "flw fa5, 44(%[input_ptr])\n\t" + "vfmacc.vf v14, fa0, v3\n\t" + "flw ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v3\n\t" + "flw ft1, 52(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v3\n\t" + "flw ft2, 56(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v3\n\t" + "flw ft3, 60(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v3\n\t" + "flw ft4, 64(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v3\n\t" + "flw ft5, 68(%[input_ptr])\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "flw fa0, 72(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "flw fa1, 76(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "flw fa2, 80(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "flw fa3, 84(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v5\n\t" + "flw fa4, 88(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v5\n\t" + "flw fa5, 92(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 96\n\t" + "vfmacc.vf v14, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v5\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v5\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v5\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v5\n\t" + "flw ft4, 16(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v5\n\t" + "flw ft5, 20(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, 
(%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v16, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v17, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v18, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v19, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", + "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 
12(%[input_ptr])\n\t" + + "1:\n\t" // packn x 8 tiles, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flw fa1, 20(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flw fa2, 24(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flw fa3, 28(%[input_ptr])\n\t" + "vfmacc.vf v12, fa0, v3\n\t" + "flw ft0, 32(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v3\n\t" + "flw ft1, 36(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v3\n\t" + "flw ft2, 40(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v3\n\t" + "flw ft3, 44(%[input_ptr])\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "flw fa0, 48(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "flw fa1, 52(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "flw fa2, 56(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "flw fa3, 60(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + "vfmacc.vf v12, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v5\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v5\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], 
%[output_ptr0], %[step]\n\t" + "vse32.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "1:\n\t" // packn x 4 tiles, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flw fa1, 20(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flw fa2, 24(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flw fa3, 28(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v10, fa2, v5\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v11, fa3, v5\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add 
%[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "fa0", "fa1", "fa2", + "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + + "1:\n\t" // packn x 2 tiles, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flw fa1, 12(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "v9", "fa0", "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const float *k0 = 
kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + + "1:\n\t" // packn x 1 tile, k unrolled by 2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += packn + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 4(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_b6f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // feature map after padding - q channel + float *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + float tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const 
float *r0 = + img0 + (i * w * 6 + j * 6) * packn; // feature map after padding 8*8 start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; // input_tm1 8*8 block start addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); + + vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f32m1(_r04, _r02, vl), vl); + vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f32m1(_r03, _r05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat32m1_t _tmp3m = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _tmp4m = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = + vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[7][m], _tmp7m, 
vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vse32_v_f32m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + float *r0_tm6 = r0_tm5 + tiles * packn; + float *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); + vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, 
_tmp34b, vl); + + vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( + _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat32m1_t _r0tm5 = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm7, _r0tm7, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + vse32_v_f32m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + float tmp[6][8][packn]; + + vfloat32m1_t _bias = bias ? 
vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + const float *output0_tm_6 = output0_tm_0 + tiles * packn * 6; + const float *output0_tm_7 = output0_tm_0 + tiles * packn * 7; + + float *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(output0_tm_6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(output0_tm_7, vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_r05, _r06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_r05, _r06, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, 
_tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _tmp5m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * packn * 8; + output0_tm_1 += tiles * packn * 8; + output0_tm_2 += tiles * packn * 8; + output0_tm_3 += tiles * packn * 8; + output0_tm_4 += tiles * packn * 8; + output0_tm_5 += tiles * packn * 8; + output0_tm_6 += tiles * packn * 8; + output0_tm_7 += tiles * packn * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); + + vfloat32m1_t _output00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _output02 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _output04 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, 
_tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _output01 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _output03 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _output05 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f32m1(_bias, _output00, vl); + _output01 = vfadd_vv_f32m1(_bias, _output01, vl); + _output02 = vfadd_vv_f32m1(_bias, _output02, vl); + _output03 = vfadd_vv_f32m1(_bias, _output03, vl); + _output04 = vfadd_vv_f32m1(_bias, _output04, vl); + _output05 = vfadd_vv_f32m1(_bias, _output05, vl); + + vse32_v_f32m1(output0, _output00, vl); + vse32_v_f32m1(output0 + packn * 2, _output02, vl); + vse32_v_f32m1(output0 + packn * 4, _output04, vl); + vse32_v_f32m1(output0 + packn * 1, _output01, vl); + vse32_v_f32m1(output0 + packn * 3, _output03, vl); + vse32_v_f32m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(float) + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + + // kernel transform matrix: G + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 
6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/pack2n, 6*6, I, pack2n] + float *kernel_tm_packn = (float *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + // [O/packn, 6*6, I, packn] + for (; oc + packn - 1 < outch; oc += packn) { + float *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 
36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(float) + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_packn_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, 
packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 36 * tiles * 4 * sizeof(float)); + wg_b4f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile12_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(36 * out_c / 4 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_pack2nx12_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 4 * 4 * 4 * sizeof(float)); + wg_b4f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 64, I, pack2n] --> [O/pack, 64, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(float) + 
******************************************************************************************/ +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + // kernel transform matrix: G + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const float ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * 
ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd64 + float *kernel_tm_packn = (float *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + + for (; oc + packn - 1 < outch; oc += packn) { + float *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(float) + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = 
output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_packn_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 64 * tiles * 4 * sizeof(float)); + wg_b6f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile12_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [64, out_c/packn, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(64 * out_c / 4 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_pack2nx12_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: 
[out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 6 * 6 * 4 * sizeof(float)); + wg_b6f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + + // shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, out_c, out_h, out_w); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_3x3_int8.c b/source/c908_opt/convolution_3x3_int8.c new file mode 100644 index 00000000..a376f6d2 --- /dev/null +++ b/source/c908_opt/convolution_3x3_int8.c @@ -0,0 +1,2801 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + note: VLEN = 128 +*************************************************************/ + +/****************************************************************************************** + * padding input for winograd input transform , and change memory layout + * input layout: [n c h w] + * input_padded layout: [n, c/8, h, w, 8] + * constrain: input channel % 8 = 0 + ******************************************************************************************/ +static void winograd_pad_input_pack1to8_int8(const int8_t *input, int8_t *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, + int pad_top, int pad_left, int8_t pad_value) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + int padded_hw = padded_h * padded_w; + const int in_size = inh * inw; // per-channel size + + int8_t *pad_ptr = input_padded; + int8_t *inp_ptr = (int8_t *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vint8mf2_t _zero = vmv_v_x_i8mf2(pad_value, vl); + + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + inp_ptr = (int8_t *)input + c * in_size; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(inp_ptr, in_size * sizeof(int8_t), vl); + inp_ptr++; + vse8_v_i8mf2(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * 
padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +} + +/****************************************************************************************** + * cut winograd output transform for output, and change memory layout + * winograd output transform layout: [n, c/8, h, w, 8] + * output layout: [n, c, h, w] + * constrain: output channel % 8 = 0 + ******************************************************************************************/ +static void winograd_crop_output_pack8to1_int8(const int8_t *output_trans, int8_t *output, + int out_c, int out_h, int out_w, int wino_h, + int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int8_t *out_tm_ptr = (int8_t *)output_trans; + int8_t *out_ptr = output; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + out_tm_ptr = (int8_t *)output_trans + c * crop_size; + out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + int8_t *crop_ptr = out_tm_ptr + h * wino_w * vl; + for (int w = 0; w < out_w; w++) { + vint8mf2_t _tmp = vle8_v_i8mf2(crop_ptr, vl); + crop_ptr += vl; + vsse8_v_i8mf2(out_ptr, out_size * sizeof(int8_t), _tmp, vl); + out_ptr++; + } + } + } +} + +/****************************************************************************************** + * winograd int8 postprocess int32 --> int8 + * _src: 8 channels int32 macc + * _mult: 8 channels multi for scale, support channel quantization + * _shift: 8 channels shift for scale, support channel quantization + * out_zp: output zero_point + ******************************************************************************************/ +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _mult, vint32m2_t _shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _mult, vl); + _mulh = vssra_vv_i32m2(_mulh, 
vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +} + +static inline void wg_b4f3s1_trans_input_pack8_int8(const int8_t *src, int16_t *dst, int ch, int h, + int w, int blk_h, int blk_w, int8_t input_zp) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + [0] = 4 * r00 - 5 * r02 + r04 + [1] = -4 * (r01 + r02) + r04 + r03 + [2] = 4 * (r01 - r02) + r04 - r03 + [3] = -2 * (r01 - r03) + r04 - r02 + [4] = 2 * (r01 - r03) + r04 - r02 + [5] = 4 * r01 - 5 * r03 + r05 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const int8_t *img0 = src + q * h * w; // feature map after padding - q channel + int16_t *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + int16_t tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // feature map after padding 6*6 start addr + const int8_t *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + int16_t *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vint8mf2_t _t00 = vle8_v_i8mf2(r0, vl); + vint8mf2_t _t01 = vle8_v_i8mf2(r0 + packn * 1, vl); + vint8mf2_t _t02 = vle8_v_i8mf2(r0 + packn * 2, vl); + vint8mf2_t _t03 = vle8_v_i8mf2(r0 + packn * 3, vl); + vint8mf2_t _t04 = vle8_v_i8mf2(r0 + packn * 4, vl); + vint8mf2_t _t05 = vle8_v_i8mf2(r0 + packn * 5, vl); + + // (q - z) + vint16m1_t _r00 = vwsub_vx_i16m1(_t00, input_zp, vl); + vint16m1_t _r01 = vwsub_vx_i16m1(_t01, input_zp, vl); + vint16m1_t _r02 = vwsub_vx_i16m1(_t02, input_zp, vl); + vint16m1_t _r03 = vwsub_vx_i16m1(_t03, input_zp, vl); + vint16m1_t _r04 = 
vwsub_vx_i16m1(_t04, input_zp, vl); + vint16m1_t _r05 = vwsub_vx_i16m1(_t05, input_zp, vl); + + vint16m1_t _tmp0m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r00, 4, vl), vmul_vx_i16m1(_r02, -5, vl), vl), + _r04, vl); + vint16m1_t _tmp1m = vmacc_vx_i16m1(vadd_vv_i16m1(_r04, _r03, vl), -4, + vadd_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp2m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r03, vl), 4, + vsub_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp3m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), -2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp4m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), 2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp5m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r01, 4, vl), vmul_vx_i16m1(_r03, -5, vl), vl), + _r05, vl); + + // vint16m1_t _tmp0m = vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r00, 4, vl), + // vwmul_vx_i16m1(_r02, -5, vl), vl), _r04, vl); vint16m1_t _tmp1m = + // vmacc_vx_i16m1(vwadd_vv_i16m1(_r04, _r03, vl), -4, vwadd_vv_i16m1(_r01, _r02, + // vl), vl); vint16m1_t _tmp2m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r03, vl), + // 4, vwsub_vv_i16m1(_r01, _r02, vl), vl); vint16m1_t _tmp3m = + // vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), -2, vwsub_vv_i16m1(_r01, _r03, + // vl), vl); vint16m1_t _tmp4m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), + // 2, vwsub_vv_i16m1(_r01, _r03, vl), vl); vint16m1_t _tmp5m = + // vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r01, 4, vl), + // vwmul_vx_i16m1(_r03, -5, vl), vl), _r05, vl); + + vse16_v_i16m1(tmp[0][m], _tmp0m, vl); + vse16_v_i16m1(tmp[1][m], _tmp1m, vl); + vse16_v_i16m1(tmp[2][m], _tmp2m, vl); + vse16_v_i16m1(tmp[3][m], _tmp3m, vl); + vse16_v_i16m1(tmp[4][m], _tmp4m, vl); + vse16_v_i16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + int16_t *r0_tm0 = r0_tm; + int16_t *r0_tm1 = r0_tm0 + tiles * packn; + int16_t *r0_tm2 = r0_tm1 + tiles * packn; + int16_t *r0_tm3 = r0_tm2 + tiles * packn; + int16_t *r0_tm4 = 
r0_tm3 + tiles * packn; + int16_t *r0_tm5 = r0_tm4 + tiles * packn; + + vint16m1_t _tmp00 = vle16_v_i16m1(tmp[m][0], vl); + vint16m1_t _tmp01 = vle16_v_i16m1(tmp[m][1], vl); + vint16m1_t _tmp02 = vle16_v_i16m1(tmp[m][2], vl); + vint16m1_t _tmp03 = vle16_v_i16m1(tmp[m][3], vl); + vint16m1_t _tmp04 = vle16_v_i16m1(tmp[m][4], vl); + vint16m1_t _tmp05 = vle16_v_i16m1(tmp[m][5], vl); + + vint16m1_t _r0tm0 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp04, 4, _tmp00, vl), -5, _tmp02, vl); + vint16m1_t _r0tm1 = vmacc_vx_i16m1(vadd_vv_i16m1(_tmp04, _tmp03, vl), -4, + vadd_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm2 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp03, vl), 4, + vsub_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm3 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), -2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm4 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), 2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm5 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp05, 4, _tmp01, vl), -5, _tmp03, vl); + + vse16_v_i16m1(r0_tm0, _r0tm0, vl); + vse16_v_i16m1(r0_tm1, _r0tm1, vl); + vse16_v_i16m1(r0_tm2, _r0tm2, vl); + vse16_v_i16m1(r0_tm3, _r0tm3, vl); + vse16_v_i16m1(r0_tm4, _r0tm4, vl); + vse16_v_i16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} +/* Output transform of the int8 b4f3s1 path. + * For each full group of packn output channels and each tile it reads the 6x6 block of int32 accumulators from src + * (vectors tiles * packn elements apart), applies the scaled AT matrix documented below along both axes to get a 4x4 block, + * adds bias * 576, requantizes each vector with requantize_m2_s(multi, shift, out_zp) and stores the int8 results to dst. + * A NULL bias is treated as zero. Channels beyond the last full multiple of packn are not processed by this loop. + */ +static inline void wg_b4f3s1_trans_output_pack8_int8(const int32_t *src, const int32_t *bias, + int8_t *dst, int ch, int blk_h, int blk_w, + int32_t *multi, int32_t *shift, int32_t out_zp) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + + scaled form implemented by the code below: + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 4 } // together with the G transform matrix, this scales the accumulated sums by 24 * 24 + }; + + [0] = r00 + (r01 + r02) + (r03 + r04) + [1] = (r01 - r02) + (r03 - r04) * 2 + [2] = (r01 + r02) + (r03 + r04) * 4 + [3] = 4 * r05 + (r01 - r02) + (r03 - r04) * 8 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int
vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + vint32m2_t _mult = vle32_v_i32m2(multi + p, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + p, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); /* reverse subtract: _shift = -1 - shift */ + + const int32_t *out0_tm = src + p * 36 * tiles; // channel p: data before the output transform (after the dot product) + int8_t *out0 = dst + p * 4 * blk_h * 4 * blk_w; // channel p: output after the transform + + int32_t tmp[4][6][packn]; /* first-pass results: tmp[k][m] holds the k-th AT output for line m of the 6x6 block */ + + vint32m2_t _bias = bias ? vle32_v_i32m2(bias + p, vl) : vmv_v_x_i32m2(0, vl); + _bias = vmul_vx_i32m2(_bias, 576, vl); /* 576 = 24 * 24, the scale the accumulators carry (see the AT note above) */ + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const int32_t *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 block start addr + const int32_t *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const int32_t *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const int32_t *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const int32_t *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const int32_t *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + int8_t *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vint32m2_t _r00 = vle32_v_i32m2(output0_tm_0, vl); + vint32m2_t _r01 = vle32_v_i32m2(output0_tm_1, vl); + vint32m2_t _r02 = vle32_v_i32m2(output0_tm_2, vl); + vint32m2_t _r03 = vle32_v_i32m2(output0_tm_3, vl); + vint32m2_t _r04 = vle32_v_i32m2(output0_tm_4, vl); + vint32m2_t _r05 = vle32_v_i32m2(output0_tm_5, vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_r01, _r02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_r01, _r02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_r03, _r04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_r03, _r04, vl); + + vint32m2_t _tmp0m = + vadd_vv_i32m2(vadd_vv_i32m2(_r00, _tmp02a, vl), _tmp02b, vl); + vint32m2_t _tmp1m = vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl); + vint32m2_t _tmp2m = vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl); + vint32m2_t _tmp3m = + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _r05, vl), 8, _tmp13b, vl); +
+ vse32_v_i32m2(tmp[0][m], _tmp0m, vl); + vse32_v_i32m2(tmp[1][m], _tmp1m, vl); + vse32_v_i32m2(tmp[2][m], _tmp2m, vl); + vse32_v_i32m2(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vint32m2_t _tmp00 = vle32_v_i32m2(tmp[m][0], vl); + vint32m2_t _tmp01 = vle32_v_i32m2(tmp[m][1], vl); + vint32m2_t _tmp02 = vle32_v_i32m2(tmp[m][2], vl); + vint32m2_t _tmp03 = vle32_v_i32m2(tmp[m][3], vl); + vint32m2_t _tmp04 = vle32_v_i32m2(tmp[m][4], vl); + vint32m2_t _tmp05 = vle32_v_i32m2(tmp[m][5], vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_tmp01, _tmp02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_tmp01, _tmp02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_tmp03, _tmp04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_tmp03, _tmp04, vl); + + vint32m2_t _out00 = vadd_vv_i32m2( + _bias, vadd_vv_i32m2(vadd_vv_i32m2(_tmp00, _tmp02a, vl), _tmp02b, vl), vl); + vint32m2_t _out01 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl), vl); + vint32m2_t _out02 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl), vl); + vint32m2_t _out03 = vadd_vv_i32m2( + _bias, + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _tmp05, vl), 8, _tmp13b, vl), vl); + + vint8mf2_t _res0 = requantize_m2_s(_out00, _mult, _shift, out_zp, packn); + vint8mf2_t _res1 = requantize_m2_s(_out01, _mult, _shift, out_zp, packn); + vint8mf2_t _res2 = requantize_m2_s(_out02, _mult, _shift, out_zp, packn); + vint8mf2_t _res3 = requantize_m2_s(_out03, _mult, _shift, out_zp, packn); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_int8(const 
int16_t *src, int16_t *dst, int ch, + int tiles, int area) +{ /* Reorders the transformed input from src into dst. + * For each of the area positions r, tiles are taken in groups of 8, 4, 2 and 1. For every channel group q the + * packn-wide int16 vectors of the group's consecutive tiles are loaded and written with a segment store (vssegNe16), + * which interleaves them so that the N tile values belonging to one channel end up adjacent in dst. + * The loop count ch / packn ignores any channels beyond the last full multiple of packn. + */ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + int16_t *img_tm2 = dst + r * tiles * ch; // start of position r in dst (input_tm2); advanced as groups are written + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 =
vle16_v_i16m1(tm1, vl); + + vse16_v_i16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} +/* Batched GEMM over the transformed data. + * For each full group of 8 output channels (p) and each of the area positions (r) it multiplies int16 kernel vectors by + * int16 input scalars with widening multiply-accumulate (vwmacc.vx) into int32 and stores 8 results per tile to output. + * Tiles are handled in groups of 8, 4, 2 and 1; the vector length is fixed to 8 by li/vsetvli. + * NOTE(review): every asm loop runs in_ch >> 1 trips and tests its counter at the bottom, so in_ch is presumably even and >= 2; confirm with callers. + * NOTE(review): the loops pre-load the next kernel vector and input halfwords before the exit test, so the last trip reads slightly past the data it consumes; confirm the buffers tolerate this. + */ +static inline void wg_bxf3s1_batch_gemm_m8n8_int8(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 7 < out_ch; p += 8) { + int32_t *output0_tm = output + p * area * tiles; // 8 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2, r-th channel (r indexes the area positions) + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t5 = in_ch / 2 (loop trip count) + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2: 8 output channels, 8 tiles, 2 input channels per trip + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v2\n\t" + "lh t0, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "lh t1, 10(%[input_ptr])\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lh t2, 12(%[input_ptr])\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "lh t3, 14(%[input_ptr])\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "lh a0, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "lh
a1, 18(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lh a2, 20(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "lh a3, 22(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v4\n\t" + "lh t0, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "lh t1, 26(%[input_ptr])\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lh t2, 28(%[input_ptr])\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "lh t3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t 
*k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, a0, v2\n\t" + "lh t0, 8(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "lh t1, 10(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lh t2, 12(%[input_ptr])\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "lh t3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] 
"+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + 
"vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_int8_v256(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + int32_t *output0_tm = output + p * area * tiles; // 16 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, 
zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v2\n\t" + "lh t0, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "lh t1, 10(%[input_ptr])\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lh t2, 12(%[input_ptr])\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "lh t3, 14(%[input_ptr])\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "lh a0, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "lh a1, 18(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lh a2, 20(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "lh a3, 22(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v4\n\t" + "lh t0, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "lh t1, 26(%[input_ptr])\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lh t2, 28(%[input_ptr])\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "lh t3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi 
%[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, a0, v2\n\t" + "lh t0, 8(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "lh t1, 10(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lh t2, 12(%[input_ptr])\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "lh t3, 
14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], 
%[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] 
"r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +// 如果使能xtheadc, 可用lwd指令 +static inline void wg_bxf3s1_batch_gemm_m8n8_int8_1(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int p = 0; p + 7 < out_ch; p += 8) { + int32_t *output0_tm = output + p * area * tiles; // 8 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + 
"srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + 
"vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, 
e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], 
%[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_int8_1_v256(const int16_t *input, + const int16_t *kernel, int32_t *output, + int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + int32_t *output0_tm = output + p * area * tiles; // 16 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 
32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v 
v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" 
+ "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : 
[inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_int8(const int16_t *src, int16_t *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + int16_t *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + // vint16m1_t _a0, _a1, _a2, _a3; + // vint16m1_t _b0, _b1, _b2, _b3; + // vint16m1_t _c0, _c1, 
_c2, _c3; + vint16m1_t _a0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _a1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _a2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _a3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _a4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _a5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _a6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _a7 = vle16_v_i16m1(tm1 + packn * 7, vl); + vint16m1_t _a8 = vle16_v_i16m1(tm1 + packn * 8, vl); + vint16m1_t _a9 = vle16_v_i16m1(tm1 + packn * 9, vl); + vint16m1_t _a10 = vle16_v_i16m1(tm1 + packn * 10, vl); + vint16m1_t _a11 = vle16_v_i16m1(tm1 + packn * 11, vl); + + vsse16_v_i16m1(img_tm2, 12 * sizeof(int16_t), _a0, vl); + vsse16_v_i16m1(img_tm2 + 1, 12 * sizeof(int16_t), _a1, vl); + vsse16_v_i16m1(img_tm2 + 2, 12 * sizeof(int16_t), _a2, vl); + vsse16_v_i16m1(img_tm2 + 3, 12 * sizeof(int16_t), _a3, vl); + vsse16_v_i16m1(img_tm2 + 4, 12 * sizeof(int16_t), _a4, vl); + vsse16_v_i16m1(img_tm2 + 5, 12 * sizeof(int16_t), _a5, vl); + vsse16_v_i16m1(img_tm2 + 6, 12 * sizeof(int16_t), _a6, vl); + vsse16_v_i16m1(img_tm2 + 7, 12 * sizeof(int16_t), _a7, vl); + vsse16_v_i16m1(img_tm2 + 8, 12 * sizeof(int16_t), _a8, vl); + vsse16_v_i16m1(img_tm2 + 9, 12 * sizeof(int16_t), _a9, vl); + vsse16_v_i16m1(img_tm2 + 10, 12 * sizeof(int16_t), _a10, vl); + vsse16_v_i16m1(img_tm2 + 11, 12 * sizeof(int16_t), _a11, vl); + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, vl); + 
vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + + vse16_v_i16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m8n12_int8(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int p = 0; p + 7 < out_ch; p += 8) { + int32_t *output0_tm = output + p * area * tiles; // 8 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" 
// t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v8, a0, v2\n\t" + "vwmacc.vx v12, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v14, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v16, t0, v2\n\t" + "vwmacc.vx v20, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + + "vwmacc.vx v18, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v22, t3, v2\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v24, a0, v2\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lwd t0, t2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "srli t3, t2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v8, t0, v4\n\t" + "vwmacc.vx v12, t2, v4\n\t" + "lwd a0, a2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v14, t3, v4\n\t" + "srli a3, a2, 
16\n\t" + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v8, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v10, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v12, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v14, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "a0", "a1", "a2", + "a3", "t0", "t1", "t2", "t3", 
"t5"); + } + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + 
"addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + 
"vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, 
a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] 
"+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n12_int8_v256(const int16_t *input, + const int16_t *kernel, int32_t *output, + int in_ch, int out_ch, int tiles, int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + int32_t *output0_tm = output + p * area * tiles; // 16 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v8, a0, v2\n\t" + "vwmacc.vx v12, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, a1, v2\n\t" + "srli t1, 
t0, 16\n\t" + "vwmacc.vx v14, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v16, t0, v2\n\t" + "vwmacc.vx v20, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + + "vwmacc.vx v18, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v22, t3, v2\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v24, a0, v2\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lwd t0, t2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "srli t3, t2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v8, t0, v4\n\t" + "vwmacc.vx v12, t2, v4\n\t" + "lwd a0, a2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v14, t3, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v8, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v10, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v12, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v14, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, 
(%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "a0", "a1", "a2", + "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, 
t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; 
t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", 
"v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel 
matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/8, 36, I, 8] + * constrain: output channel % 8 = 0 + * input channel % 8 = 0 + ******************************************************************************************/ +void shl_c908_wg_b4f3s1_trans_kernel_pack8_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + int8_t *kernel_data = (int8_t *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + int16_t *kernel_tm = (int16_t *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(int16_t)); + + // kernel transform matrix: G + const int16_t ktm[6][3] = {{6, 0, 0}, {-4, -4, -4}, {-4, 4, -4}, + {1, 2, 4}, {1, -2, 4}, {0, 0, 6}}; + + csinn_tensor_copy(dst_kernel, src_kernel); // tensor->dtype ?? 
+ + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const int8_t *kernel0 = kernel_data + p * inch * 9 + q * 9; + int16_t *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const int8_t *k0 = kernel0; + const int8_t *k1 = kernel0 + 3; + const int8_t *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + int16_t tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + int16_t *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/8, 6*6, I, 8] + int16_t *kernel_tm_packn = + (int16_t *)shl_mem_alloc(outch / 8 * 36 * inch * 8 * sizeof(int16_t)); + dst_kernel->data = kernel_tm_packn; + + // for (int oc = 0; oc + 7 < outch; oc += 8) { + // const int16_t *k0 = kernel_tm + (oc + 0) * inch * 36; + // const int16_t *k1 = kernel_tm + (oc + 1) * inch * 36; + // const int16_t *k2 = kernel_tm + (oc + 2) * inch * 36; + // const int16_t *k3 = kernel_tm + (oc + 3) * inch * 36; + // const int16_t *k4 = kernel_tm + (oc + 4) * inch * 36; + // const int16_t *k5 = kernel_tm + (oc + 5) * inch * 36; + // const int16_t *k6 = kernel_tm + (oc + 6) * inch * 36; + // const int16_t *k7 = kernel_tm + (oc + 7) * inch * 36; + + // int16_t *g0 = kernel_tm_packn + oc * inch * 36; + + // for (int t = 0; t < 36; t++) { + // int16_t *g00 = g0 + t * inch * 8; + + // for (int ic = 0; ic < inch; ic++) { + // const int16_t *k00 = k0 + ic * 36; + // const int16_t *k10 = k1 + ic * 36; + // const int16_t *k20 = k2 + ic * 36; + // const int16_t *k30 = k3 + ic * 36; + // const int16_t *k40 = k4 + ic * 36; + // const 
int16_t *k50 = k5 + ic * 36; + // const int16_t *k60 = k6 + ic * 36; + // const int16_t *k70 = k7 + ic * 36; + + // g00[0] = k00[t]; + // g00[1] = k10[t]; + // g00[2] = k20[t]; + // g00[3] = k30[t]; + // g00[4] = k40[t]; + // g00[5] = k50[t]; + // g00[6] = k60[t]; + // g00[7] = k70[t]; + // g00 += 8; + // } + // } + // } + + const int packn = csrr_vlenb() / sizeof(int16_t); + + for (int oc = 0; oc < outch / packn; oc++) { + int16_t *g0 = kernel_tm_packn + oc * 36 * inch * packn; + + for (int k = 0; k < 36; k++) { + int16_t *g00 = g0 + k * inch * packn; + + for (int ic = 0; ic < inch / packn; ic++) { + for (int i = 0; i < packn; i++) { + for (int j = 0; j < packn; j++) { + int16_t *k00 = + kernel_tm + (oc * packn + j) * 36 * inch + (ic * packn + i) * 36; + *g00++ = k00[k]; + } + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 8 = 0 + * input channel % 8 = 0 + ******************************************************************************************/ +int shl_c908_wg_b4f3s1_pack8_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int16_t *kernel_data = (int16_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 
,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/8 h w 8] + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + + // pad input + winograd_pad_input_pack1to8_int8(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left, input->qinfo->zero_point); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 64, tiles, 8] + int16_t *input_tm1_buf = + (int16_t *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(int16_t)); + wg_b4f3s1_trans_input_pack8_int8(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w, input->qinfo->zero_point); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + int16_t *input_tm2_buf = (int16_t *)shl_mem_alloc(36 * tiles * in_c * sizeof(int16_t)); + wg_bxf3s1_reorder_input_tile12_int8(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 36, tiles, 8] + const int vlen = csrr_vlenb() * 8; + int32_t *output_dot_buf = + (int32_t *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(int32_t)); + if (vlen == 128) { + wg_bxf3s1_batch_gemm_m8n12_int8(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m16n12_int8_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 36); + } + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ 
+ // output_tm1_buf: [out_c/8, out_h4, out_w4, 8] + int8_t *output_tm1_buf = + (int8_t *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + wg_b4f3s1_trans_output_pack8_int8(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w, multiplier, shift, output->qinfo->zero_point); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack8to1_int8(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_3x3_int8_packn.c b/source/c908_opt/convolution_3x3_int8_packn.c new file mode 100644 index 00000000..4b324609 --- /dev/null +++ b/source/c908_opt/convolution_3x3_int8_packn.c @@ -0,0 +1,630 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +

/* CSI-NN2 version 2.0.x */
#ifdef NNN
#include "shl_c908.h"

/*************************************************************
 * note: support flexible vlen
 *************************************************************/

/*
 * Requantize int32 accumulators to int8.
 *
 * Steps: signed high-half multiply by _mult, scaling arithmetic right shift by _shift
 * (the shift vector is reinterpreted as unsigned), add the output zero point, then two
 * narrowing clips: int32 -> int16 -> int8.
 *
 * _src:   int32 accumulators
 * _mult:  per-lane multipliers
 * _shift: per-lane right-shift amounts
 * out_zp: output zero point added after scaling
 * vl:     number of active lanes
 */
static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _mult, vint32m2_t _shift,
                                  int32_t out_zp, int vl)
{
    vint32m2_t _mulh = vmulh_vv_i32m2(_src, _mult, vl);
    _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl);
    _mulh = vadd_vx_i32m2(_mulh, out_zp, vl);
    vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl);
    vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl);
    return _tmp2;
}

/*
 * Pad a packn-layout int8 input to padded_h x padded_w with pad_value.
 * Thin wrapper that forwards every argument to shl_rvv_pad_input_packn_int8.
 */
static void winograd_pad_input_packn_int8(const int8_t *input, int8_t *input_padded, int inc,
                                          int inh, int inw, int padded_h, int padded_w, int pad_top,
                                          int pad_left, int8_t pad_value)
{
    shl_rvv_pad_input_packn_int8(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top,
                                 pad_left, pad_value);
}

/*
 * Crop the winograd output (wino_h x wino_w per channel group) to out_h x out_w.
 *
 * For every full group of packn channels, copies the first out_w pixels of the first
 * out_h rows, packn int8 values per pixel. Channels past the last full group of packn
 * are not processed.
 */
static void winograd_crop_output_packn_int8(const int8_t *output_trans, int8_t *output, int out_c,
                                            int out_h, int out_w, int wino_h, int wino_w)
{
    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
    const int vl = vsetvl_e8mf2(packn);

    const int out_size = out_h * out_w;  // per-channel size
    const int crop_size = wino_h * wino_w;

    int c = 0;
    for (; c + packn - 1 < out_c; c += packn) {
        int8_t *out_tm_ptr = (int8_t *)output_trans + c * crop_size;
        int8_t *out_ptr = output + c * out_size;

        for (int h = 0; h < out_h; h++) {
            // start of row h in the uncropped buffer
            int8_t *crop_ptr = out_tm_ptr + h * wino_w * packn;
            for (int w = 0; w < out_w; w++) {
                vint8mf2_t _tmp = vle8_v_i8mf2(crop_ptr, vl);
                crop_ptr += packn;
                vse8_v_i8mf2(out_ptr, _tmp, vl);
                out_ptr += packn;
            }
        }
    }
}

/*
 * Winograd b4f3 input transform (BT * d * B) for packn-layout int8 input.
 *
 * src:      padded input, read as [ch/packn, h, w, packn]
 * dst:      transformed input, written as [ch/packn, 36, tiles, packn] int16
 * h, w:     padded input height / width
 * blk_h/w:  number of 4x4 output tiles along each axis
 * input_zp: input zero point, subtracted from every loaded value before the transform
 */
static inline void wg_b4f3s1_trans_input_packn_int8(const int8_t *src, int16_t *dst, int ch, int h,
                                                    int w, int blk_h, int blk_w, int8_t input_zp)
{
    /* input transform matrix
    BT = {
        { 4  0   -5  0   1  0 };
        { 0  -4  -4  1   1  0 };
        { 0  4   -4  -1  1  0 };
        { 0  -2  -1  2   1  0 };
        { 0  2   -1  -2  1  0 };
        { 0  4   0   -5  0  1 }
    };
    [0] =  4 * r00 - 5 * r02 + r04
    [1] = -4 * (r01 + r02) + r04 + r03
    [2] =  4 * (r01 - r02) + r04 - r03
    [3] = -2 * (r01 - r03) + r04 - r02
    [4] =  2 * (r01 - r03) + r04 - r02
    [5] =  4 * r01 - 5 * r03 + r05
    */
    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
    const int vl = vsetvl_e8mf2(packn);
    int tiles = blk_h * blk_w;
    for (int q = 0; q + packn - 1 < ch; q += packn) {
        const int8_t *img0 = src + q * h * w;     // feature map after padding - q channel
        int16_t *img0_tm = dst + q * 36 * tiles;  // transform and interleave - q channel

        // intermediate result of the first (row) pass for one 6x6 window
        int16_t tmp[6][6][packn];

        for (int i = 0; i < blk_h; i++) {
            for (int j = 0; j < blk_w; j++) {
                // feature map after padding 6*6 start addr
                const int8_t *r0 = img0 + (i * w * 4 + j * 4) * packn;
                // input_tm1 6*6 block start addr
                int16_t *r0_tm = img0_tm + (i * blk_w + j) * packn;

                // first pass: apply BT to each of the 6 rows of the window
                for (int m = 0; m < 6; m++) {
                    vint8mf2_t _t00 = vle8_v_i8mf2(r0, vl);
                    vint8mf2_t _t01 = vle8_v_i8mf2(r0 + packn * 1, vl);
                    vint8mf2_t _t02 = vle8_v_i8mf2(r0 + packn * 2, vl);
                    vint8mf2_t _t03 = vle8_v_i8mf2(r0 + packn * 3, vl);
                    vint8mf2_t _t04 = vle8_v_i8mf2(r0 + packn * 4, vl);
                    vint8mf2_t _t05 = vle8_v_i8mf2(r0 + packn * 5, vl);

                    // (q - z)
                    vint16m1_t _r00 = vwsub_vx_i16m1(_t00, input_zp, vl);
                    vint16m1_t _r01 = vwsub_vx_i16m1(_t01, input_zp, vl);
                    vint16m1_t _r02 = vwsub_vx_i16m1(_t02, input_zp, vl);
                    vint16m1_t _r03 = vwsub_vx_i16m1(_t03, input_zp, vl);
                    vint16m1_t _r04 = vwsub_vx_i16m1(_t04, input_zp, vl);
                    vint16m1_t _r05 = vwsub_vx_i16m1(_t05, input_zp, vl);

                    vint16m1_t _tmp0m = vadd_vv_i16m1(
                        vadd_vv_i16m1(vmul_vx_i16m1(_r00, 4, vl), vmul_vx_i16m1(_r02, -5, vl), vl),
                        _r04, vl);
                    vint16m1_t _tmp1m = vmacc_vx_i16m1(vadd_vv_i16m1(_r04, _r03, vl), -4,
                                                       vadd_vv_i16m1(_r01, _r02, vl), vl);
                    vint16m1_t _tmp2m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r03, vl), 4,
                                                       vsub_vv_i16m1(_r01, _r02, vl), vl);
                    vint16m1_t _tmp3m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), -2,
                                                       vsub_vv_i16m1(_r01, _r03, vl), vl);
                    vint16m1_t _tmp4m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), 2,
                                                       vsub_vv_i16m1(_r01, _r03, vl), vl);
                    vint16m1_t _tmp5m = vadd_vv_i16m1(
                        vadd_vv_i16m1(vmul_vx_i16m1(_r01, 4, vl), vmul_vx_i16m1(_r03, -5, vl), vl),
                        _r05, vl);

                    // stored transposed ([k][m]) so the second pass can read rows of tmp
                    vse16_v_i16m1(tmp[0][m], _tmp0m, vl);
                    vse16_v_i16m1(tmp[1][m], _tmp1m, vl);
                    vse16_v_i16m1(tmp[2][m], _tmp2m, vl);
                    vse16_v_i16m1(tmp[3][m], _tmp3m, vl);
                    vse16_v_i16m1(tmp[4][m], _tmp4m, vl);
                    vse16_v_i16m1(tmp[5][m], _tmp5m, vl);
                    r0 += w * packn;
                }

                // second pass: apply BT along the other axis and scatter the 36 results,
                // each into its own [tiles, packn] plane of dst
                for (int m = 0; m < 6; m++) {
                    int16_t *r0_tm0 = r0_tm;
                    int16_t *r0_tm1 = r0_tm0 + tiles * packn;
                    int16_t *r0_tm2 = r0_tm1 + tiles * packn;
                    int16_t *r0_tm3 = r0_tm2 + tiles * packn;
                    int16_t *r0_tm4 = r0_tm3 + tiles * packn;
                    int16_t *r0_tm5 = r0_tm4 + tiles * packn;

                    vint16m1_t _tmp00 = vle16_v_i16m1(tmp[m][0], vl);
                    vint16m1_t _tmp01 = vle16_v_i16m1(tmp[m][1], vl);
                    vint16m1_t _tmp02 = vle16_v_i16m1(tmp[m][2], vl);
                    vint16m1_t _tmp03 = vle16_v_i16m1(tmp[m][3], vl);
                    vint16m1_t _tmp04 = vle16_v_i16m1(tmp[m][4], vl);
                    vint16m1_t _tmp05 = vle16_v_i16m1(tmp[m][5], vl);

                    vint16m1_t _r0tm0 =
                        vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp04, 4, _tmp00, vl), -5, _tmp02, vl);
                    vint16m1_t _r0tm1 = vmacc_vx_i16m1(vadd_vv_i16m1(_tmp04, _tmp03, vl), -4,
                                                       vadd_vv_i16m1(_tmp01, _tmp02, vl), vl);
                    vint16m1_t _r0tm2 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp03, vl), 4,
                                                       vsub_vv_i16m1(_tmp01, _tmp02, vl), vl);
                    vint16m1_t _r0tm3 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), -2,
                                                       vsub_vv_i16m1(_tmp01, _tmp03, vl), vl);
                    vint16m1_t _r0tm4 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), 2,
                                                       vsub_vv_i16m1(_tmp01, _tmp03, vl), vl);
                    vint16m1_t _r0tm5 =
                        vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp05, 4, _tmp01, vl), -5, _tmp03, vl);

                    vse16_v_i16m1(r0_tm0, _r0tm0, vl);
                    vse16_v_i16m1(r0_tm1, _r0tm1, vl);
                    vse16_v_i16m1(r0_tm2, _r0tm2, vl);
                    vse16_v_i16m1(r0_tm3, _r0tm3, vl);
                    vse16_v_i16m1(r0_tm4, _r0tm4, vl);
                    vse16_v_i16m1(r0_tm5, _r0tm5, vl);
                    r0_tm += tiles * packn * 6;
                }
            }
        }
    }
}

/*
 * Winograd b4f3 output transform (AT * m * A) with bias add and requantization to int8.
 *
 * src:    gemm result, read as 36 planes of tiles * ch int32 values
 * bias:   per-channel int32 bias, may be NULL (treated as zero)
 * dst:    transformed output, written as [ch/packn, blk_h*4, blk_w*4, packn] int8
 * multi:  per-channel requantization multipliers
 * shift:  per-channel shifts; the value passed on to requantize_m2_s is (-1 - shift)
 * out_zp: output zero point
 */
static inline void wg_b4f3s1_trans_output_packn_int8(const int32_t *src, const int32_t *bias,
                                                     int8_t *dst, int ch, int blk_h, int blk_w,
                                                     int32_t *multi, int32_t *shift, int32_t out_zp)
{
    /* output transform matrix
    AT = {
        { 1  1  1   1  1   0 },
        { 0  1  -1  2  -2  0 },
        { 0  1  1   4  4   0 },
        { 0  1  -1  8  -8  1 }
    };

    AT = {
        { 1  1  1   1  1   0 },
        { 0  1  -1  2  -2  0 },
        { 0  1  1   4  4   0 },
        { 0  1  -1  8  -8  4 }  // together with the G transform matrix, this scales the
                                // accumulated sum by 24 * 24
    };

    [0] = r00 + (r01 + r02) + (r03 + r04)
    [1] =       (r01 - r02) + (r03 - r04) * 2
    [2] =       (r01 + r02) + (r03 + r04) * 4
    [3] = 4 * r05 + (r01 - r02) + (r03 - r04) * 8
    */
    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
    const int vl = vsetvl_e8mf2(packn);
    int tiles = blk_h * blk_w;
    for (int p = 0; p + packn - 1 < ch; p += packn) {
        vint32m2_t _mult = vle32_v_i32m2(multi + p, vl);
        vint32m2_t _shift = vle32_v_i32m2(shift + p, vl);
        // reverse subtract: _shift = -1 - _shift
        _shift = vrsub_vx_i32m2(_shift, -1, vl);

        const int32_t *out0_tm = src + p * tiles;        // before output transform / after dot: channel p
        int8_t *out0 = dst + p * 4 * blk_h * 4 * blk_w;  // output after transform: channel p

        // intermediate result of the first pass for one tile
        int32_t tmp[4][6][packn];

        vint32m2_t _bias = bias ? vle32_v_i32m2(bias + p, vl) : vmv_v_x_i32m2(0, vl);
        // 576 = 24 * 24: bring the bias to the same scale as the transformed accumulators
        _bias = vmul_vx_i32m2(_bias, 576, vl);

        for (int i = 0; i < blk_h; i++) {
            for (int j = 0; j < blk_w; j++) {
                const int32_t *output0_tm_0 = out0_tm + (i * blk_w + j) * packn;  // 6*6 start address
                const int32_t *output0_tm_1 = output0_tm_0 + tiles * ch * 1;
                const int32_t *output0_tm_2 = output0_tm_0 + tiles * ch * 2;
                const int32_t *output0_tm_3 = output0_tm_0 + tiles * ch * 3;
                const int32_t *output0_tm_4 = output0_tm_0 + tiles * ch * 4;
                const int32_t *output0_tm_5 = output0_tm_0 + tiles * ch * 5;

                int8_t *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn;  // out 4*4 addr

                // first pass: apply AT to each of the 6 rows (6 values -> 4 values)
                for (int m = 0; m < 6; m++) {
                    vint32m2_t _r00 = vle32_v_i32m2(output0_tm_0, vl);
                    vint32m2_t _r01 = vle32_v_i32m2(output0_tm_1, vl);
                    vint32m2_t _r02 = vle32_v_i32m2(output0_tm_2, vl);
                    vint32m2_t _r03 = vle32_v_i32m2(output0_tm_3, vl);
                    vint32m2_t _r04 = vle32_v_i32m2(output0_tm_4, vl);
                    vint32m2_t _r05 = vle32_v_i32m2(output0_tm_5, vl);

                    vint32m2_t _tmp02a = vadd_vv_i32m2(_r01, _r02, vl);
                    vint32m2_t _tmp13a = vsub_vv_i32m2(_r01, _r02, vl);

                    vint32m2_t _tmp02b = vadd_vv_i32m2(_r03, _r04, vl);
                    vint32m2_t _tmp13b = vsub_vv_i32m2(_r03, _r04, vl);

                    vint32m2_t _tmp0m =
                        vadd_vv_i32m2(vadd_vv_i32m2(_r00, _tmp02a, vl), _tmp02b, vl);
                    vint32m2_t _tmp1m = vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl);
                    vint32m2_t _tmp2m = vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl);
                    vint32m2_t _tmp3m =
                        vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _r05, vl), 8, _tmp13b, vl);

                    vse32_v_i32m2(tmp[0][m], _tmp0m, vl);
                    vse32_v_i32m2(tmp[1][m], _tmp1m, vl);
                    vse32_v_i32m2(tmp[2][m], _tmp2m, vl);
                    vse32_v_i32m2(tmp[3][m], _tmp3m, vl);

                    // advance all six pointers to the next row of the 6x6 block
                    output0_tm_0 += tiles * ch * 6;
                    output0_tm_1 += tiles * ch * 6;
                    output0_tm_2 += tiles * ch * 6;
                    output0_tm_3 += tiles * ch * 6;
                    output0_tm_4 += tiles * ch * 6;
                    output0_tm_5 += tiles * ch * 6;
                }

                // second pass: apply AT along the other axis, add bias, requantize and
                // store one row of 4 output pixels per iteration
                for (int m = 0; m < 4; m++) {
                    vint32m2_t _tmp00 = vle32_v_i32m2(tmp[m][0], vl);
                    vint32m2_t _tmp01 = vle32_v_i32m2(tmp[m][1], vl);
                    vint32m2_t _tmp02 = vle32_v_i32m2(tmp[m][2], vl);
                    vint32m2_t _tmp03 = vle32_v_i32m2(tmp[m][3], vl);
                    vint32m2_t _tmp04 = vle32_v_i32m2(tmp[m][4], vl);
                    vint32m2_t _tmp05 = vle32_v_i32m2(tmp[m][5], vl);

                    vint32m2_t _tmp02a = vadd_vv_i32m2(_tmp01, _tmp02, vl);
                    vint32m2_t _tmp13a = vsub_vv_i32m2(_tmp01, _tmp02, vl);

                    vint32m2_t _tmp02b = vadd_vv_i32m2(_tmp03, _tmp04, vl);
                    vint32m2_t _tmp13b = vsub_vv_i32m2(_tmp03, _tmp04, vl);

                    vint32m2_t _out00 = vadd_vv_i32m2(
                        _bias, vadd_vv_i32m2(vadd_vv_i32m2(_tmp00, _tmp02a, vl), _tmp02b, vl), vl);
                    vint32m2_t _out01 =
                        vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl), vl);
                    vint32m2_t _out02 =
                        vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl), vl);
                    vint32m2_t _out03 = vadd_vv_i32m2(
                        _bias,
                        vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _tmp05, vl), 8, _tmp13b, vl), vl);

                    vint8mf2_t _res0 = requantize_m2_s(_out00, _mult, _shift, out_zp, packn);
                    vint8mf2_t _res1 = requantize_m2_s(_out01, _mult, _shift, out_zp, packn);
                    vint8mf2_t _res2 = requantize_m2_s(_out02, _mult, _shift, out_zp, packn);
                    vint8mf2_t _res3 = requantize_m2_s(_out03, _mult, _shift, out_zp, packn);

                    vse8_v_i8mf2(output0, _res0, vl);
                    vse8_v_i8mf2(output0 + packn * 1, _res1, vl);
                    vse8_v_i8mf2(output0 + packn * 2, _res2, vl);
                    vse8_v_i8mf2(output0 + packn * 3, _res3, vl);

                    output0 += blk_w * 4 * packn;
                }
            }
        }
    }
}

/*
 * Reorder the transformed input for the batched gemm.
 *
 * src:   read with position r, tile t and channel group q at
 *        ((q * area + r) * tiles + t) * packn
 * dst:   written position by position (area planes of tiles * ch values); within a
 *        plane, tiles are taken in groups of 12, then 8, 4, 2 and 1, and inside a group
 *        the values of one channel for all tiles of the group are stored contiguously
 * area:  number of transform positions (36 for b4f3)
 *
 * Here packn = vlenb / sizeof(int16_t).
 */
static inline void wg_bxf3s1_reorder_input_tile12_int8(const int16_t *src, int16_t *dst, int ch,
                                                       int tiles, int area)
{
    const int packn = csrr_vlenb() / sizeof(int16_t);
    const int vl = vsetvl_e16m1(packn);
    for (int r = 0; r < area; r++) {
        int16_t *img_tm2 = dst + r * tiles * ch;  // input_tm2 r channel data
        int t = 0;
        for (; t + 11 < tiles; t += 12) {
            const int16_t *tm1 = src;
            tm1 += (r * tiles + t) * packn;
            for (int q = 0; q < ch / packn; q++) {
                vint16m1_t _a0 = vle16_v_i16m1(tm1, vl);
                vint16m1_t _a1 = vle16_v_i16m1(tm1 + packn * 1, vl);
                vint16m1_t _a2 = vle16_v_i16m1(tm1 + packn * 2, vl);
                vint16m1_t _a3 = vle16_v_i16m1(tm1 + packn * 3, vl);
                vint16m1_t _a4 = vle16_v_i16m1(tm1 + packn * 4, vl);
                vint16m1_t _a5 = vle16_v_i16m1(tm1 + packn * 5, vl);
                vint16m1_t _a6 = vle16_v_i16m1(tm1 + packn * 6, vl);
                vint16m1_t _a7 = vle16_v_i16m1(tm1 + packn * 7, vl);
                vint16m1_t _a8 = vle16_v_i16m1(tm1 + packn * 8, vl);
                vint16m1_t _a9 = vle16_v_i16m1(tm1 + packn * 9, vl);
                vint16m1_t _a10 = vle16_v_i16m1(tm1 + packn * 10, vl);
                vint16m1_t _a11 = vle16_v_i16m1(tm1 + packn * 11, vl);

                // strided stores (stride = 12 elements) interleave the 12 tiles
                vsse16_v_i16m1(img_tm2, 12 * sizeof(int16_t), _a0, vl);
                vsse16_v_i16m1(img_tm2 + 1, 12 * sizeof(int16_t), _a1, vl);
                vsse16_v_i16m1(img_tm2 + 2, 12 * sizeof(int16_t), _a2, vl);
                vsse16_v_i16m1(img_tm2 + 3, 12 * sizeof(int16_t), _a3, vl);
                vsse16_v_i16m1(img_tm2 + 4, 12 * sizeof(int16_t), _a4, vl);
                vsse16_v_i16m1(img_tm2 + 5, 12 * sizeof(int16_t), _a5, vl);
                vsse16_v_i16m1(img_tm2 + 6, 12 * sizeof(int16_t), _a6, vl);
                vsse16_v_i16m1(img_tm2 + 7, 12 * sizeof(int16_t), _a7, vl);
                vsse16_v_i16m1(img_tm2 + 8, 12 * sizeof(int16_t), _a8, vl);
                vsse16_v_i16m1(img_tm2 + 9, 12 * sizeof(int16_t), _a9, vl);
                vsse16_v_i16m1(img_tm2 + 10, 12 * sizeof(int16_t), _a10, vl);
                vsse16_v_i16m1(img_tm2 + 11, 12 * sizeof(int16_t), _a11, vl);
                tm1 += area * tiles * packn;
                img_tm2 += 12 * packn;
            }
        }
        for (; t + 7 < tiles; t += 8) {
            const int16_t *tm1 = src;
            tm1 += (r * tiles + t) * packn;
            for (int q = 0; q < ch / packn; q++) {
                vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl);
                vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl);
                vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl);
                vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl);
                vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl);
                vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl);
                vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, vl);
                vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl);

                // segment store interleaves the 8 tiles
                vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7,
                                  vl);
                tm1 += area * tiles * packn;
                img_tm2 += 8 * packn;
            }
        }
        for (; t + 3 < tiles; t += 4) {
            const int16_t *tm1 = src;
            tm1 += (r * tiles + t) * packn;
            for (int q = 0; q < ch / packn; q++) {
                vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl);
                vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl);
                vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl);
                vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl);

                vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl);
                tm1 += area * tiles * packn;
                img_tm2 += 4 * packn;
            }
        }
        for (; t + 1 < tiles; t += 2) {
            const int16_t *tm1 = src;
            tm1 += (r * tiles + t) * packn;
            for (int q = 0; q < ch / packn; q++) {
                vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl);
                vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl);

                vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl);
                tm1 += area * tiles * packn;
                img_tm2 += 2 * packn;
            }
        }
        for (; t < tiles; t++) {
            const int16_t *tm1 = src;
            tm1 += (r * tiles + t) * packn;
            for (int q = 0; q < ch / packn; q++) {
                vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl);

                vse16_v_i16m1(img_tm2, _tmp0, vl);
                tm1 += area * tiles * packn;
                img_tm2 += 1 * packn;
            }
        }
    }
}

/*
 * Batched gemm over the `area` transform positions: for each position r, calls
 * shl_c908_ncxhwx_gemm_12xpackn_int16 on the r-th slice of kernel (out_ch * in_ch
 * values), input (tiles * in_ch values) and output (tiles * out_ch values).
 */
static inline void wg_bxf3s1_batch_gemm_packnx12_int8(const int16_t *input, const int16_t *kernel,
                                                      int32_t *output, int in_ch, int out_ch,
                                                      int tiles, int area)
{
    for (int r = 0; r < area; r++) {
        const int16_t *kernel_ptr = kernel + r * out_ch * in_ch;
        const int16_t *input_ptr = input + r * tiles * in_ch;
        int32_t *output_ptr = output + r * tiles * out_ch;

        shl_c908_ncxhwx_gemm_12xpackn_int16(output_ptr, kernel_ptr, input_ptr, out_ch, in_ch,
                                            tiles);
    }
}

+/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [36, O/packn, I, packn] 
+ * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(int8_t) / 2 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + int8_t *kernel_data = (int8_t *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + int16_t *kernel_tm = (int16_t *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(int16_t)); + + // kernel transform matrix: G + const int16_t ktm[6][3] = {{6, 0, 0}, {-4, -4, -4}, {-4, 4, -4}, + {1, 2, 4}, {1, -2, 4}, {0, 0, 6}}; + + csinn_tensor_copy(dst_kernel, src_kernel); // tensor->dtype ?? + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const int8_t *kernel0 = kernel_data + p * inch * 9 + q * 9; + int16_t *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const int8_t *k0 = kernel0; + const int8_t *k1 = kernel0 + 3; + const int8_t *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + int16_t tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + int16_t *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + const int packn = csrr_vlenb() / sizeof(int16_t); + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [6*6, O/8, I, 8] + int16_t *kernel_tm_packn = + (int16_t *)shl_mem_alloc(36 * outch / packn * inch * packn * sizeof(int16_t)); + dst_kernel->data = kernel_tm_packn; + + for (int k = 0; k < 36; k++) { + int16_t *g0 = 
kernel_tm_packn + k * outch * inch; + for (int oc = 0; oc + packn - 1 < outch; oc += packn) { + int16_t *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + int16_t *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * Winograd b4f3s1 (4x4 output block, 3x3 kernel, stride 1) int8 convolution. + * Per batch item: reorder the input with shl_rvv_reorder_input_pack1ton_int8, pad it, + * run the input transform, reorder the tiles, multiply with the pre-transformed kernel + * taken from params->conv_extra.kernel_tm, then apply the output transform, crop the + * result and reorder it into output. Always returns CSINN_TRUE. + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int16_t *kernel_data = (int16_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + // staging buffers reused for every batch item; released after the batch loop + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(in_c * in_h * in_w * sizeof(int8_t)); + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(out_c * out_h * out_w * sizeof(int8_t)); + + for (int n = 0; n < batch; n++) { +
shl_rvv_reorder_input_pack1ton_int8(input_data, input_ncxhwx, in_c, in_h, in_w); + + // pad buffer: [in_c/packn h w packn] + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + + // pad input + winograd_pad_input_packn_int8(input_ncxhwx, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left, input->qinfo->zero_point); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 36, tiles, packn] + int16_t *input_tm1_buf = + (int16_t *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(int16_t)); + wg_b4f3s1_trans_input_packn_int8(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w, input->qinfo->zero_point); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/12, in_c, 12] + int16_t *input_tm2_buf = (int16_t *)shl_mem_alloc(36 * tiles * in_c * sizeof(int16_t)); + wg_bxf3s1_reorder_input_tile12_int8(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + int32_t *output_dot_buf = + (int32_t *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(int32_t)); + + wg_bxf3s1_batch_gemm_packnx12_int8(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + int8_t *output_tm1_buf = + (int8_t *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + if
(kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + wg_b4f3s1_trans_output_packn_int8(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w, multiplier, shift, output->qinfo->zero_point); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_int8(output_tm1_buf, output_ncxhwx, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, out_c, out_h, out_w); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + } + // release the staging buffers allocated before the batch loop (they were leaked before) + shl_mem_free(input_ncxhwx); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} +#endif diff --git a/source/c908_opt/convolution_3x3_int8_packn_1.c b/source/c908_opt/convolution_3x3_int8_packn_1.c new file mode 100644 index 00000000..ddff2e2e --- /dev/null +++ b/source/c908_opt/convolution_3x3_int8_packn_1.c @@ -0,0 +1,1060 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +/* CSI-NN2 version 2.0.x */ + +// #ifdef NNN +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +/* Requantize vl int32 lanes to int8: take the high half of _src * _mult (vmulh), apply the + * vssra scaling right shift by _shift, add out_zp, then narrow with saturation + * int32 -> int16 -> int8 (two vnclip steps with shift amount 0). */ +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _mult, vint32m2_t _shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _mult, vl); + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +} + +/****************************************************************************************** + * padding input for winograd input transform + * input layout: [n c/packn h w packn] + * input_padded layout: [n c/packn h w packn] + * constrain: input channel % packn = 0 + * packn = vlen / sizeof(int8) / 2 + ******************************************************************************************/ +static void winograd_pad_input_packn_int8(const int8_t *input, int8_t *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, int pad_top, + int pad_left, int8_t pad_value) +{ + shl_rvv_pad_input_packn_int8(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left, pad_value); +} + +/* Crop the transformed output: for every packn-channel group, copy the leading + * out_h x out_w positions (packn int8 values each) of the wino_h x wino_w plane in + * output_trans into output. Channels beyond the last full packn group are not copied. */ +static void winograd_crop_output_packn_int8(const int8_t *output_trans, int8_t *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + int8_t *out_tm_ptr = (int8_t *)output_trans + c * crop_size; + int8_t *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + int8_t *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w <
out_w; w++) { + vint8mf2_t _tmp = vle8_v_i8mf2(crop_ptr, vl); + crop_ptr += packn; + vse8_v_i8mf2(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_int8(const int8_t *src, int16_t *dst, int ch, int h, + int w, int blk_h, int blk_w, int8_t input_zp) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + [0] = 4 * r00 - 5 * r02 + r04 + [1] = -4 * (r01 + r02) + r04 + r03 + [2] = 4 * (r01 - r02) + r04 - r03 + [3] = -2 * (r01 - r03) + r04 - r02 + [4] = 2 * (r01 - r03) + r04 - r02 + [5] = 4 * r01 - 5 * r03 + r05 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const int8_t *img0 = src + q * h * w; // feature map after padding - q channel + int16_t *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + int16_t tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // feature map after padding 6*6 start addr + const int8_t *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + int16_t *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vint8mf2_t _t00 = vle8_v_i8mf2(r0, vl); + vint8mf2_t _t01 = vle8_v_i8mf2(r0 + packn * 1, vl); + vint8mf2_t _t02 = vle8_v_i8mf2(r0 + packn * 2, vl); + vint8mf2_t _t03 = vle8_v_i8mf2(r0 + packn * 3, vl); + vint8mf2_t _t04 = vle8_v_i8mf2(r0 + packn * 4, vl); + vint8mf2_t _t05 = vle8_v_i8mf2(r0 + packn * 5, vl); + + // (q - z) + vint16m1_t _r00 = vwsub_vx_i16m1(_t00, input_zp, vl); + vint16m1_t _r01 = vwsub_vx_i16m1(_t01, input_zp, vl); + vint16m1_t _r02 = vwsub_vx_i16m1(_t02, input_zp, vl); + vint16m1_t _r03 = vwsub_vx_i16m1(_t03, input_zp, vl); + vint16m1_t _r04 = vwsub_vx_i16m1(_t04, input_zp, vl); + vint16m1_t _r05 = 
vwsub_vx_i16m1(_t05, input_zp, vl); + + vint16m1_t _tmp0m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r00, 4, vl), vmul_vx_i16m1(_r02, -5, vl), vl), + _r04, vl); + vint16m1_t _tmp1m = vmacc_vx_i16m1(vadd_vv_i16m1(_r04, _r03, vl), -4, + vadd_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp2m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r03, vl), 4, + vsub_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp3m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), -2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp4m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), 2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp5m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r01, 4, vl), vmul_vx_i16m1(_r03, -5, vl), vl), + _r05, vl); + + // vint16m1_t _tmp0m = vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r00, 4, vl), + // vwmul_vx_i16m1(_r02, -5, vl), vl), _r04, vl); vint16m1_t _tmp1m = + // vmacc_vx_i16m1(vwadd_vv_i16m1(_r04, _r03, vl), -4, vwadd_vv_i16m1(_r01, _r02, + // vl), vl); vint16m1_t _tmp2m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r03, vl), + // 4, vwsub_vv_i16m1(_r01, _r02, vl), vl); vint16m1_t _tmp3m = + // vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), -2, vwsub_vv_i16m1(_r01, _r03, + // vl), vl); vint16m1_t _tmp4m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), + // 2, vwsub_vv_i16m1(_r01, _r03, vl), vl); vint16m1_t _tmp5m = + // vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r01, 4, vl), + // vwmul_vx_i16m1(_r03, -5, vl), vl), _r05, vl); + + vse16_v_i16m1(tmp[0][m], _tmp0m, vl); + vse16_v_i16m1(tmp[1][m], _tmp1m, vl); + vse16_v_i16m1(tmp[2][m], _tmp2m, vl); + vse16_v_i16m1(tmp[3][m], _tmp3m, vl); + vse16_v_i16m1(tmp[4][m], _tmp4m, vl); + vse16_v_i16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + int16_t *r0_tm0 = r0_tm; + int16_t *r0_tm1 = r0_tm0 + tiles * packn; + int16_t *r0_tm2 = r0_tm1 + tiles * packn; + int16_t *r0_tm3 = r0_tm2 + tiles * packn; + int16_t *r0_tm4 = r0_tm3 + tiles * packn; + int16_t *r0_tm5 = r0_tm4 + tiles 
* packn; + + vint16m1_t _tmp00 = vle16_v_i16m1(tmp[m][0], vl); + vint16m1_t _tmp01 = vle16_v_i16m1(tmp[m][1], vl); + vint16m1_t _tmp02 = vle16_v_i16m1(tmp[m][2], vl); + vint16m1_t _tmp03 = vle16_v_i16m1(tmp[m][3], vl); + vint16m1_t _tmp04 = vle16_v_i16m1(tmp[m][4], vl); + vint16m1_t _tmp05 = vle16_v_i16m1(tmp[m][5], vl); + + vint16m1_t _r0tm0 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp04, 4, _tmp00, vl), -5, _tmp02, vl); + vint16m1_t _r0tm1 = vmacc_vx_i16m1(vadd_vv_i16m1(_tmp04, _tmp03, vl), -4, + vadd_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm2 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp03, vl), 4, + vsub_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm3 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), -2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm4 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), 2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm5 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp05, 4, _tmp01, vl), -5, _tmp03, vl); + + vse16_v_i16m1(r0_tm0, _r0tm0, vl); + vse16_v_i16m1(r0_tm1, _r0tm1, vl); + vse16_v_i16m1(r0_tm2, _r0tm2, vl); + vse16_v_i16m1(r0_tm3, _r0tm3, vl); + vse16_v_i16m1(r0_tm4, _r0tm4, vl); + vse16_v_i16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +/* Output transform of the winograd b4f3 int8 path. For every packn-channel group and every + * tile, the 6x6 int32 gemm result read from src is reduced with AT along both axes to a 4x4 + * block; the bias (scaled by 576) is added, each value is requantized with multi/shift/out_zp + * and the int8 result is stored into dst. A NULL bias is treated as zero. */ +static inline void wg_b4f3s1_trans_output_packn_int8(const int32_t *src, const int32_t *bias, + int8_t *dst, int ch, int blk_h, int blk_w, + int32_t *multi, int32_t *shift, int32_t out_zp) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 4 } // together with the G transform matrix this scales the accumulated sum by 24 * 24 + }; + + [0] = r00 + (r01 + r02) + (r03 + r04) + [1] = (r01 - r02) + (r03 - r04) * 2 + [2] = (r01 + r02) + (r03 + r04) * 4 + [3] = 4 * r05 + (r01 - r02) + (r03 - r04) * 8 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for
(int p = 0; p + packn - 1 < ch; p += packn) { + vint32m2_t _mult = vle32_v_i32m2(multi + p, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + p, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); // _shift = -1 - shift; passed to requantize_m2_s as the shift amount + + const int32_t *out0_tm = src + p * 36 * tiles; // data before the output transform (after dot), channel p + int8_t *out0 = dst + p * 4 * blk_h * 4 * blk_w; // output after the transform, channel p + + int32_t tmp[4][6][packn]; + + vint32m2_t _bias = bias ? vle32_v_i32m2(bias + p, vl) : vmv_v_x_i32m2(0, vl); + _bias = vmul_vx_i32m2(_bias, 576, vl); // 576 = 24 * 24, the scale noted for AT above + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const int32_t *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // start address of the 6*6 block + const int32_t *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const int32_t *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const int32_t *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const int32_t *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const int32_t *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + int8_t *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vint32m2_t _r00 = vle32_v_i32m2(output0_tm_0, vl); + vint32m2_t _r01 = vle32_v_i32m2(output0_tm_1, vl); + vint32m2_t _r02 = vle32_v_i32m2(output0_tm_2, vl); + vint32m2_t _r03 = vle32_v_i32m2(output0_tm_3, vl); + vint32m2_t _r04 = vle32_v_i32m2(output0_tm_4, vl); + vint32m2_t _r05 = vle32_v_i32m2(output0_tm_5, vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_r01, _r02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_r01, _r02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_r03, _r04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_r03, _r04, vl); + + vint32m2_t _tmp0m = + vadd_vv_i32m2(vadd_vv_i32m2(_r00, _tmp02a, vl), _tmp02b, vl); + vint32m2_t _tmp1m = vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl); + vint32m2_t _tmp2m = vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl); + vint32m2_t _tmp3m = + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _r05, vl), 8, _tmp13b, vl); + + vse32_v_i32m2(tmp[0][m], _tmp0m, vl); +
vse32_v_i32m2(tmp[1][m], _tmp1m, vl); + vse32_v_i32m2(tmp[2][m], _tmp2m, vl); + vse32_v_i32m2(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vint32m2_t _tmp00 = vle32_v_i32m2(tmp[m][0], vl); + vint32m2_t _tmp01 = vle32_v_i32m2(tmp[m][1], vl); + vint32m2_t _tmp02 = vle32_v_i32m2(tmp[m][2], vl); + vint32m2_t _tmp03 = vle32_v_i32m2(tmp[m][3], vl); + vint32m2_t _tmp04 = vle32_v_i32m2(tmp[m][4], vl); + vint32m2_t _tmp05 = vle32_v_i32m2(tmp[m][5], vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_tmp01, _tmp02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_tmp01, _tmp02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_tmp03, _tmp04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_tmp03, _tmp04, vl); + + vint32m2_t _out00 = vadd_vv_i32m2( + _bias, vadd_vv_i32m2(vadd_vv_i32m2(_tmp00, _tmp02a, vl), _tmp02b, vl), vl); + vint32m2_t _out01 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl), vl); + vint32m2_t _out02 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl), vl); + vint32m2_t _out03 = vadd_vv_i32m2( + _bias, + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _tmp05, vl), 8, _tmp13b, vl), vl); + + vint8mf2_t _res0 = requantize_m2_s(_out00, _mult, _shift, out_zp, packn); + vint8mf2_t _res1 = requantize_m2_s(_out01, _mult, _shift, out_zp, packn); + vint8mf2_t _res2 = requantize_m2_s(_out02, _mult, _shift, out_zp, packn); + vint8mf2_t _res3 = requantize_m2_s(_out03, _mult, _shift, out_zp, packn); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_int8(const int16_t *src, int16_t *dst, int ch, + int 
tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + int16_t *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _a0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _a1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _a2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _a3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _a4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _a5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _a6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _a7 = vle16_v_i16m1(tm1 + packn * 7, vl); + vint16m1_t _a8 = vle16_v_i16m1(tm1 + packn * 8, vl); + vint16m1_t _a9 = vle16_v_i16m1(tm1 + packn * 9, vl); + vint16m1_t _a10 = vle16_v_i16m1(tm1 + packn * 10, vl); + vint16m1_t _a11 = vle16_v_i16m1(tm1 + packn * 11, vl); + + vsse16_v_i16m1(img_tm2, 12 * sizeof(int16_t), _a0, vl); + vsse16_v_i16m1(img_tm2 + 1, 12 * sizeof(int16_t), _a1, vl); + vsse16_v_i16m1(img_tm2 + 2, 12 * sizeof(int16_t), _a2, vl); + vsse16_v_i16m1(img_tm2 + 3, 12 * sizeof(int16_t), _a3, vl); + vsse16_v_i16m1(img_tm2 + 4, 12 * sizeof(int16_t), _a4, vl); + vsse16_v_i16m1(img_tm2 + 5, 12 * sizeof(int16_t), _a5, vl); + vsse16_v_i16m1(img_tm2 + 6, 12 * sizeof(int16_t), _a6, vl); + vsse16_v_i16m1(img_tm2 + 7, 12 * sizeof(int16_t), _a7, vl); + vsse16_v_i16m1(img_tm2 + 8, 12 * sizeof(int16_t), _a8, vl); + vsse16_v_i16m1(img_tm2 + 9, 12 * sizeof(int16_t), _a9, vl); + vsse16_v_i16m1(img_tm2 + 10, 12 * sizeof(int16_t), _a10, vl); + vsse16_v_i16m1(img_tm2 + 11, 12 * sizeof(int16_t), _a11, vl); + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = 
vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + + vse16_v_i16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_packnx12_int8(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int p = 0; p + packn - 1 < out_ch; p 
+= packn) { + int32_t *output0_tm = output + p * area * tiles; // dot output of this packn-channel group + const int16_t *kernel0_tm = kernel + p * area * in_ch; // kernel of this packn-channel group + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2, r-th channel + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t5 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v8, a0, v2\n\t" + "vwmacc.vx v12, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v14, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v16, t0, v2\n\t" + "vwmacc.vx v20, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + + "vwmacc.vx v18, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v22, t3, v2\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx
v24, a0, v2\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lwd t0, t2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "srli t3, t2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v8, t0, v4\n\t" + "vwmacc.vx v12, t2, v4\n\t" + "lwd a0, a2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v14, t3, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v8, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v10, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v12, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v14, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t"
+ "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "a0", "a1", "a2", + "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t5 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" //
kernel_ptr += + // packn + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t5 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" +
"vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n4k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t5 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" +
"vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n2k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t5 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n1k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0,
2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(int8_t) / 2 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + int8_t *kernel_data = (int8_t *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + int16_t *kernel_tm = (int16_t *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(int16_t)); + + // kernel transform matrix: G + const int16_t ktm[6][3] = {{6, 0, 0}, {-4, -4, -4}, {-4, 4, -4}, + {1, 2, 4}, {1, -2, 4}, {0, 0, 6}}; + + csinn_tensor_copy(dst_kernel, src_kernel); // tensor->dtype ?? 
+ + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const int8_t *kernel0 = kernel_data + p * inch * 9 + q * 9; + int16_t *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const int8_t *k0 = kernel0; + const int8_t *k1 = kernel0 + 3; + const int8_t *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + int16_t tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + int16_t *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + const int packn = csrr_vlenb() / sizeof(int16_t); + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/8, 6*6, I, 8] + int16_t *kernel_tm_packn = + (int16_t *)shl_mem_alloc(outch / packn * 36 * inch * packn * sizeof(int16_t)); + dst_kernel->data = kernel_tm_packn; + + for (int oc = 0; oc + packn - 1 < outch; oc += packn) { + int16_t *g0 = kernel_tm_packn + oc * 36 * inch; + + for (int k = 0; k < 36; k++) { + int16_t *g00 = g0 + k * inch * packn; + + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + int16_t *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(int8_t) / 2 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + 
struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int16_t *kernel_data = (int16_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + // int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(in_c * in_h * in_w * sizeof(int8_t)); + // int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(out_c * out_h * out_w * sizeof(int8_t)); + + for (int n = 0; n < batch; n++) { + // shl_rvv_reorder_input_pack1ton_int8(input_data, input_ncxhwx, in_c, in_h, in_w); + + // pad buffer: [in_c/packn h w packn] + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + + // pad input + winograd_pad_input_packn_int8(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left, input->qinfo->zero_point); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + int16_t *input_tm1_buf = + (int16_t *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(int16_t)); + wg_b4f3s1_trans_input_packn_int8(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w, 
input->qinfo->zero_point); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/12, in_c, 12] + int16_t *input_tm2_buf = (int16_t *)shl_mem_alloc(36 * tiles * in_c * sizeof(int16_t)); + wg_bxf3s1_reorder_input_tile12_int8(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 36, tiles, packn] + const int vlen = csrr_vlenb() * 8; + int32_t *output_dot_buf = + (int32_t *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(int32_t)); + + wg_bxf3s1_batch_gemm_packnx12_int8(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + int8_t *output_tm1_buf = + (int8_t *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + wg_b4f3s1_trans_output_packn_int8(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w, multiplier, shift, output->qinfo->zero_point); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_int8(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + // shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, out_c, 
out_h, out_w); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + } + return CSINN_TRUE; +} +// #endif diff --git a/source/c908_opt/convolution_gemm_fp16.c b/source/c908_opt/convolution_gemm_fp16.c new file mode 100644 index 00000000..e34f7327 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp16.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory.
+ **************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // m = out_ch / group + int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + shl_c908_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_c908_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t in_height = input->dim[2]; + int32_t in_width = input->dim[3]; + int32_t out_ch = kernel->dim[0]; + int32_t out_height = output->dim[2]; + int32_t out_width = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_top = params->pad_top; + + // im2col matrix_col = out_height * out_width + // im2col matrix_row = channel_col + int channel_col = in_ch / group * ksize_h * ksize_w; + + int32_t m = out_ch / group; + int32_t k = channel_col; + int32_t n = out_height * out_width; + + __fp16 *im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * 
sizeof(__fp16)); + + const int vlen = csrr_vlenb() * 8; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // im2col + for (int c = 0; c < channel_col; ++c) { + int w_offset = c % ksize_w; + int h_offset = c / ksize_w % ksize_h; + int c_im = c / ksize_h / ksize_w; + for (int h = 0; h < out_height; ++h) { + for (int w = 0; w < out_width; ++w) { + int im_row = h_offset + h * stride_h; + int im_col = w_offset + w * stride_w; + int col_index = + (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] + im_row = im_row - params->pad_top; + im_col = im_col - params->pad_left; + if (im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { + im2col_data[col_index] = 0.0f; + } else { + im2col_data[col_index] = + input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + + im_col]; + } + } + } + } + + __fp16 *pa = kernel_data + g * m * k; + __fp16 *pb = pb_reorder; + __fp16 *pc = output_data; + if (vlen == 128) { + // pack + shl_c908_reorder_input_z24_fp16(im2col_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x24_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z32_fp16_v256(im2col_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x32_fp16_v256(pc, pa, pb, bias_data + g * m, m, k, n, n); + } + input_data += in_ch / group * in_height * in_width; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp16_pack1ton.c b/source/c908_opt/convolution_gemm_fp16_pack1ton.c new file mode 100644 index 00000000..00828422 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp16_pack1ton.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); +} + +int shl_c908_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch;
i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_pack1ton_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const __fp16 *img0 = input_pad_buf; + __fp16 *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e16m1(loop_c); + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * vl; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += vl; + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + shl_rvv_reorder_input_z12_pack1ton_fp16(im2col_buf, reorder_buf, in_cp, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + // shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + // in_cp * maxk, n, n); + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, false); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp16_packn.c b/source/c908_opt/convolution_gemm_fp16_packn.c new file mode 100644 index 00000000..fe44c696 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp16_packn.c @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constraint: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + * pack kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory.
+ ************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); +} + +int shl_c908_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(__fp16)); + const int tailstep = (padded_in_w * stride_h - out_w * 
stride_w) * packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const __fp16 *img0 = input_pad_buf + c * padded_in_hw; + __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = img0 + a * padded_in_w * packn + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * packn; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = + (__fp16 *)shl_mem_alloc(in_cp * maxk * out_h * out_w * sizeof(__fp16)); + shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, false); + + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp16_packnto1.c b/source/c908_opt/convolution_gemm_fp16_packnto1.c new file mode 100644 index 00000000..309cb95f --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp16_packnto1.c @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); +} + +int shl_c908_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + __fp16 *output_ncxhwx =
(__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const __fp16 *img0 = input_pad_buf + c * padded_in_hw; + __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * packn; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, false); + shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp32.c b/source/c908_opt/convolution_gemm_fp32.c new file mode 100644 index 00000000..d4de5038 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp32.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory.
+ **************************************************************************************/
+/*
+ * Reorders the fp32 kernel of every group with shl_c908_reorder_kernel_n8_fp32 and copies
+ * the result back over kernel->data.
+ *   kernel: weights; dim[0] is the total output-channel count and
+ *           dim[1] * dim[2] * dim[3] is the reduction length k of one output channel.
+ *   params: only params->group is read.
+ * A scratch buffer of group * m * k floats is allocated for the reorder and released
+ * before returning.
+ */
+void shl_c908_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel,
+                                                   struct csinn_conv2d_params *params)
+{
+    float *kernel_data = (float *)kernel->data;
+    int group = params->group;
+
+    int m = kernel->dim[0] / group;  // m = out_ch / group
+    int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3];
+
+    float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float));
+    for (int g = 0; g < group; g++) {
+        shl_c908_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k);
+    }
+    memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float));
+    shl_mem_free(pa_reorder);
+}
+
+/*
+ * fp32 convolution computed as im2col followed by GEMM, one (batch, group) pair at a time.
+ *   input : read as [batch, in_ch, in_height, in_width]
+ *   output: receives one m x n matrix per (batch, group), with m = out_ch / group and
+ *           n = out_height * out_width
+ *   kernel: weights; presumably already reordered by
+ *           shl_c908_conv_im2col_gemm_reorder_kernel_fp32 - verify against the caller
+ *   bias  : when bias->data is NULL, NULL is handed to the GEMM kernel for every group
+ *   params: group, stride_height / stride_width and pad_top / pad_left are read
+ * Returns CSINN_TRUE.
+ */
+int shl_c908_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                                   struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                   struct csinn_conv2d_params *params)
+{
+    float *input_data = (float *)input->data;
+    float *output_data = (float *)output->data;
+    float *kernel_data = (float *)kernel->data;
+    float *bias_data = (float *)bias->data;
+
+    int32_t group = params->group;
+    int32_t batch = input->dim[0];
+    int32_t in_ch = input->dim[1];
+    int32_t in_height = input->dim[2];
+    int32_t in_width = input->dim[3];
+    int32_t out_ch = kernel->dim[0];
+    int32_t out_height = output->dim[2];
+    int32_t out_width = output->dim[3];
+    int32_t ksize_h = kernel->dim[2];
+    int32_t ksize_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+    int32_t pad_left = params->pad_left;
+    int32_t pad_top = params->pad_top;
+
+    // im2col matrix_col = out_height * out_width
+    // im2col matrix_row = channel_col
+    int channel_col = in_ch / group * ksize_h * ksize_w;
+
+    int32_t m = out_ch / group;
+    int32_t k = channel_col;
+    int32_t n = out_height * out_width;
+
+    // Scratch buffers are sized for one group and reused across all (batch, group) pairs.
+    float *im2col_data = (float *)shl_mem_alloc(k * n * sizeof(float));
+    float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float));
+
+    const int vlen = csrr_vlenb() * 8;
+
+    for (int i = 0; i < batch; i++) {
+        for (int g = 0; g < group; g++) {
+            // im2col
+            for (int c = 0; c < channel_col; ++c) {
+                int w_offset = c % ksize_w;
+                int h_offset = c / ksize_w % ksize_h;
+                int c_im = c / ksize_h / ksize_w;
+                for (int h = 0; h < out_height; ++h) {
+                    for (int w = 0; w < out_width; ++w) {
+                        // source coordinates in the unpadded input
+                        int im_row = h_offset + h * stride_h - pad_top;
+                        int im_col = w_offset + w * stride_w - pad_left;
+                        int col_index =
+                            (c * out_height + h) * out_width + w;  // [channel_col, out_h, out_w]
+                        if (im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) {
+                            // outside the image: padding contributes zero
+                            im2col_data[col_index] = 0.0f;
+                        } else {
+                            im2col_data[col_index] =
+                                input_data[(c_im * in_height + im_row) * in_width + im_col];
+                        }
+                    }
+                }
+            }
+
+            float *pa = kernel_data + g * m * k;
+            float *pb = pb_reorder;
+            float *pc = output_data;
+            // Only offset the bias when there is one: NULL + g * m would be a bogus non-NULL
+            // pointer for g > 0. Same guard as the pack1ton / packn / packnto1 variants.
+            // NOTE(review): confirm the c908 GEMM kernels accept a NULL bias.
+            float *bias_ptr = bias_data ? (bias_data + g * m) : NULL;
+            // NOTE(review): no GEMM is run when vlen is neither 128 nor 256, so the output
+            // is left untouched while CSINN_TRUE is still returned.
+            if (vlen == 128) {
+                // pack
+                shl_c908_reorder_input_z12_fp32(im2col_data, pb, k, n, n);
+                // GEMM
+                shl_c908_gemm_8x12_fp32(pc, pa, pb, bias_ptr, m, k, n, n);
+            } else if (vlen == 256) {
+                // pack
+                shl_c908_reorder_input_z16_fp32_v256(im2col_data, pb, k, n, n);
+                // GEMM
+                shl_c908_gemm_8x16_fp32_v256(pc, pa, pb, bias_ptr, m, k, n, n);
+            }
+            input_data += in_ch / group * in_height * in_width;
+            output_data += m * n;
+        }
+    }
+    shl_mem_free(pb_reorder);
+    shl_mem_free(im2col_data);
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_gemm_fp32_pack1ton.c b/source/c908_opt/convolution_gemm_fp32_pack1ton.c
new file mode 100644
index 00000000..f1eb366b
--- /dev/null
+++ b/source/c908_opt/convolution_gemm_fp32_pack1ton.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/*************************************************************************************
+ * Reorder kernel_data in place, which means the original kernel_data is destroyed.
+ * The reason to do this is that the packaging process must not consume more memory.
+ * The c908 entry point simply forwards to the generic RVV implementation.
+ **************************************************************************************/
+void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel,
+                                                            struct csinn_conv2d_params *params)
+{
+    shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params);
+}
+
+/*
+ * fp32 im2col + GEMM convolution, "pack1ton" variant. For every (batch, group) pair the
+ * input is padded, expanded by im2col in chunks of vl channels, reordered, and multiplied
+ * with the kernel directly into output->data.
+ *   bias  : when bias->data is NULL, NULL is passed to the GEMM kernel
+ *   params: group, strides and all four pad_* fields are read
+ * All scratch buffers are allocated and released inside the group loop.
+ * Returns CSINN_TRUE.
+ */
+int shl_c908_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                                            struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                            struct csinn_conv2d_params *params)
+{
+    float *input_data = (float *)input->data;
+    float *output_data = (float *)output->data;
+    float *kernel_data = (float *)kernel->data;
+    float *bias_data = (float *)bias->data;
+
+    const int32_t group = params->group;
+    const int32_t batch = input->dim[0];
+    const int32_t in_h = input->dim[2];
+    const int32_t in_w = input->dim[3];
+    const int32_t out_h = output->dim[2];
+    const int32_t out_w = output->dim[3];
+    const int32_t ksize_h = kernel->dim[2];
+    const int32_t ksize_w = kernel->dim[3];
+    const int32_t stride_h = params->stride_height;
+    const int32_t stride_w = params->stride_width;
+
+    const int32_t m = kernel->dim[0] / group;     // output channels per group
+    const int32_t in_cp = input->dim[1] / group;  // input channels per group
+    const int32_t maxk = ksize_h * ksize_w;
+    const int32_t n = out_h * out_w;
+
+    // Extent of the padded input; identical for every batch and group.
+    const int padded_h = in_h + params->pad_top + params->pad_down;
+    const int padded_w = in_w + params->pad_left + params->pad_right;
+    const int padded_hw = padded_h * padded_w;
+    // Elements (per channel lane) to skip from the end of one output row to the next.
+    const int tailstep = padded_w * stride_h - out_w * stride_w;
+
+    for (int bi = 0; bi < batch; bi++) {
+        for (int g = 0; g < group; g++) {
+            // padding
+            float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_hw * sizeof(float));
+            shl_rvv_pad_input_pack1ton_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, padded_h,
+                                            padded_w, params->pad_top, params->pad_left);
+
+            // im2col
+            int vl = vsetvl_e32m1(csrr_vlenb() / sizeof(float));
+
+            // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn]
+            float *im2col_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float));
+
+            const float *chan_base = input_pad_buf;
+            float *col_ptr = im2col_buf;
+
+            // Consume the channels in chunks of vl; the last chunk may be shorter.
+            for (int remaining_c = in_cp; remaining_c > 0; remaining_c -= vl) {
+                vl = vsetvl_e32m1(remaining_c);
+
+                for (int kh = 0; kh < ksize_h; kh++) {
+                    for (int kw = 0; kw < ksize_w; kw++) {
+                        const float *src = chan_base + kh * padded_w * vl + kw * vl;
+
+                        for (int oh = 0; oh < out_h; oh++) {
+                            for (int ow = 0; ow < out_w; ow++) {
+                                vfloat32m1_t _lane = vle32_v_f32m1(src, vl);
+                                src += stride_w * vl;
+                                vse32_v_f32m1(col_ptr, _lane, vl);
+                                col_ptr += vl;
+                            }
+                            src += tailstep * vl;
+                        }
+                    }
+                }
+                chan_base += padded_hw * vl;
+            }
+            shl_mem_free(input_pad_buf);
+
+            // reorder(pack)
+            float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float));
+            shl_rvv_reorder_input_z12_pack1ton_fp32(im2col_buf, reorder_buf, in_cp, maxk, n, n);
+            shl_mem_free(im2col_buf);
+
+            // gemm
+            float *ker_ptr = kernel_data + g * m * maxk * in_cp;
+            float *bias_ptr = bias_data ? (bias_data + g * m) : NULL;
+            shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m,
+                                                in_cp * maxk, n, false);
+            shl_mem_free(reorder_buf);
+
+            input_data += in_cp * in_h * in_w;
+            output_data += m * n;
+        }
+    }
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_gemm_fp32_packn.c b/source/c908_opt/convolution_gemm_fp32_packn.c
new file mode 100644
index 00000000..15a82870
--- /dev/null
+++ b/source/c908_opt/convolution_gemm_fp32_packn.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/*************************************************************************************
+ * packn = vlenb / sizeof(float)
+ * maxk = ksize_h * ksize_w
+ * constraint: out_c % packn = 0 and in_ch % packn = 0
+ * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n]
+ *         [out_c/packna, in_c/packnb, maxk, packnb, packna]
+ * Pack kernel_data in place, which means the original kernel_data is destroyed.
+ * The reason to do this is that the packaging process must not consume more memory.
+ ************************************************************************************/
+void shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel,
+                                                         struct csinn_conv2d_params *params)
+{
+    // The c908 entry point forwards to the generic RVV implementation.
+    shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params);
+}
+
+/*
+ * fp32 im2col + GEMM convolution, "packn" variant. For every (batch, group) pair the
+ * input is padded, expanded by im2col in blocks of packn channels, reordered, and
+ * multiplied with the kernel directly into output->data.
+ *   bias  : when bias->data is NULL, NULL is passed to the GEMM kernel
+ *   params: group, strides and all four pad_* fields are read
+ * Only whole blocks of packn input channels are processed by the im2col loop.
+ * All scratch buffers are allocated and released inside the group loop.
+ * Returns CSINN_TRUE.
+ */
+int shl_c908_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                                         struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                         struct csinn_conv2d_params *params)
+{
+    float *input_data = (float *)input->data;
+    float *output_data = (float *)output->data;
+    float *kernel_data = (float *)kernel->data;
+    float *bias_data = (float *)bias->data;
+
+    int32_t group = params->group;
+    int32_t batch = input->dim[0];
+    int32_t in_c = input->dim[1];
+    int32_t in_h = input->dim[2];
+    int32_t in_w = input->dim[3];
+    int32_t out_c = kernel->dim[0];
+    int32_t out_h = output->dim[2];
+    int32_t out_w = output->dim[3];
+    int32_t ksize_h = kernel->dim[2];
+    int32_t ksize_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+
+    int32_t m = out_c / group;
+    int32_t in_cp = in_c / group;
+    int32_t maxk = ksize_h * ksize_w;
+    int32_t n = out_h * out_w;
+
+    // The GEMM writes straight into output_data, so no intermediate output buffer is
+    // needed. The previous m * n float scratch allocation was never used or freed.
+
+    for (int i = 0; i < batch; i++) {
+        for (int g = 0; g < group; g++) {
+            // padding
+            int padded_in_h = in_h + params->pad_top + params->pad_down;
+            int padded_in_w = in_w + params->pad_left + params->pad_right;
+            int padded_in_hw = padded_in_w * padded_in_h;
+            float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float));
+            shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h,
+                                         padded_in_w, params->pad_top, params->pad_left);
+
+            // im2col
+            const int packn = csrr_vlenb() / sizeof(float);
+            const int vl = vsetvl_e32m1(packn);
+
+            // [in_c/packn, maxk, out_h, out_w, packn]
+            float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w *
+                                                       packn * sizeof(float));
+            // elements to skip from the end of one output row to the start of the next
+            const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn;
+
+            for (int c = 0; c + packn - 1 < in_cp; c += packn) {
+                const float *img0 = input_pad_buf + c * padded_in_hw;
+                float *dst_ptr = im2col_buf + c * maxk * out_h * out_w;
+
+                for (int a = 0; a < ksize_h; a++) {
+                    for (int b = 0; b < ksize_w; b++) {
+                        const float *img1 = img0 + a * padded_in_w * packn + b * packn;
+
+                        for (int p = 0; p < out_h; p++) {
+                            for (int q = 0; q < out_w; q++) {
+                                vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl);
+                                img1 += stride_w * packn;
+                                vse32_v_f32m1(dst_ptr, _tmp, vl);
+                                dst_ptr += packn;
+                            }
+                            img1 += tailstep;
+                        }
+                    }
+                }
+            }
+            shl_mem_free(input_pad_buf);
+
+            // reorder(pack)
+            float *reorder_buf =
+                (float *)shl_mem_alloc(in_cp * maxk * out_h * out_w * sizeof(float));
+            shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n);
+            shl_mem_free(im2col_buf);
+
+            // gemm
+            float *ker_ptr = kernel_data + g * m * maxk * in_cp;
+            float *bias_ptr = bias_data ? (bias_data + g * m) : NULL;
+            shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m,
+                                                in_cp * maxk, n, false);
+
+            shl_mem_free(reorder_buf);
+
+            input_data += in_cp * in_h * in_w;
+            output_data += m * n;
+        }
+    }
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_gemm_fp32_packnto1.c b/source/c908_opt/convolution_gemm_fp32_packnto1.c
new file mode 100644
index 00000000..96748c76
--- /dev/null
+++ b/source/c908_opt/convolution_gemm_fp32_packnto1.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/*************************************************************************************
+ * Reorder kernel_data in place, which means the original kernel_data is destroyed.
+ * The reason to do this is that the packaging process must not consume more memory.
+ * The c908 entry point simply forwards to the generic RVV implementation.
+ **************************************************************************************/
+void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel,
+                                                            struct csinn_conv2d_params *params)
+{
+    shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params);
+}
+
+/*
+ * fp32 im2col + GEMM convolution, "packnto1" variant. For every (batch, group) pair the
+ * input is padded, expanded by im2col in blocks of packn channels, reordered, and
+ * multiplied with the kernel into the scratch buffer output_ncxhwx, which
+ * shl_rvv_reorder_input_packnto1_fp32 then converts into output->data.
+ *   bias  : when bias->data is NULL, NULL is passed to the GEMM kernel
+ *   params: group, strides and all four pad_* fields are read
+ * Returns CSINN_TRUE.
+ */
+int shl_c908_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                                            struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                            struct csinn_conv2d_params *params)
+{
+    float *input_data = (float *)input->data;
+    float *output_data = (float *)output->data;
+    float *kernel_data = (float *)kernel->data;
+    float *bias_data = (float *)bias->data;
+
+    int32_t group = params->group;
+    int32_t batch = input->dim[0];
+    int32_t in_c = input->dim[1];
+    int32_t in_h = input->dim[2];
+    int32_t in_w = input->dim[3];
+    int32_t out_c = kernel->dim[0];
+    int32_t out_h = output->dim[2];
+    int32_t out_w = output->dim[3];
+    int32_t ksize_h = kernel->dim[2];
+    int32_t ksize_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+
+    int32_t m = out_c / group;
+    int32_t in_cp = in_c / group;
+    int32_t maxk = ksize_h * ksize_w;
+    int32_t n = out_h * out_w;
+
+    // GEMM result for one group; reused for every (batch, group) pair.
+    float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float));
+
+    for (int i = 0; i < batch; i++) {
+        for (int g = 0; g < group; g++) {
+            // padding
+            int padded_in_h = in_h + params->pad_top + params->pad_down;
+            int padded_in_w = in_w + params->pad_left + params->pad_right;
+            int padded_in_hw = padded_in_h * padded_in_w;
+            float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float));
+            shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h,
+                                         padded_in_w, params->pad_top, params->pad_left);
+
+            // im2col
+            const int packn = csrr_vlenb() / sizeof(float);
+            const int vl = vsetvl_e32m1(packn);
+
+            // [in_c/packn, maxk, out_h, out_w, packn]
+            float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w *
+                                                       packn * sizeof(float));
+            // elements to skip from the end of one output row to the start of the next
+            const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn;
+
+            for (int c = 0; c + packn - 1 < in_cp; c += packn) {
+                const float *img0 = input_pad_buf + c * padded_in_hw;
+                float *dst_ptr = im2col_buf + c * maxk * out_h * out_w;
+
+                for (int a = 0; a < ksize_h; a++) {
+                    for (int b = 0; b < ksize_w; b++) {
+                        const float *img1 = img0 + a * padded_in_w * packn + b * packn;
+
+                        for (int p = 0; p < out_h; p++) {
+                            for (int q = 0; q < out_w; q++) {
+                                vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl);
+                                img1 += stride_w * packn;
+                                vse32_v_f32m1(dst_ptr, _tmp, vl);
+                                dst_ptr += packn;
+                            }
+                            img1 += tailstep;
+                        }
+                    }
+                }
+            }
+            shl_mem_free(input_pad_buf);
+
+            // reorder(pack)
+            float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float));
+            shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n);
+            shl_mem_free(im2col_buf);
+
+            // gemm
+            float *ker_ptr = kernel_data + g * m * maxk * in_cp;
+            float *bias_ptr = bias_data ? (bias_data + g * m) : NULL;
+            shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m,
+                                                in_cp * maxk, n, false);
+            shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w);
+            shl_mem_free(reorder_buf);
+
+            input_data += in_cp * in_h * in_w;
+            output_data += m * n;
+        }
+    }
+    shl_mem_free(output_ncxhwx);
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_gemm_int8.c b/source/c908_opt/convolution_gemm_int8.c
new file mode 100644
index 00000000..f0c094a1
--- /dev/null
+++ b/source/c908_opt/convolution_gemm_int8.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/*
+ * Reorders the int8 kernel of every group with shl_c908_reorder_kernel_n8_int8 into a
+ * newly allocated buffer stored in params->conv_extra.kernel_tm->data. Unlike the fp32
+ * variant, kernel->data itself is left untouched.
+ *   kernel: weights; dim[0] is the total output-channel count and
+ *           dim[1] * dim[2] * dim[3] is the reduction length k of one output channel.
+ *   params: params->group is read; params->conv_extra.kernel_tm->data is written.
+ * Each group occupies m * k4 bytes in the destination, where k4 is k rounded up to a
+ * multiple of 4.
+ */
+void shl_c908_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel,
+                                                   struct csinn_conv2d_params *params)
+{
+    int8_t *kernel_data = (int8_t *)kernel->data;
+    int group = params->group;
+
+    int m = kernel->dim[0] / group;  // m = out_ch / group
+    int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3];
+    int k4 = (k % 4 != 0) ? ((k / 4 + 1) * 4) : k;
+
+    params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t));
+    int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data;
+
+    for (int g = 0; g < group; g++) {
+        shl_c908_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, k);
+    }
+    // FIXME: params->conv_extra.kernel_tm->data is not freed here; it must be released
+    // by whoever owns kernel_tm once the convolution is no longer needed.
+}
+
+/*
+ * int8 convolution computed as im2col followed by GEMM, one (batch, group) pair at a time.
+ *   input : read as [batch, in_ch, in_height, in_width]; padding positions are filled
+ *           with input->qinfo->zero_point
+ *   output: receives one m x n matrix per (batch, group), with m = out_ch / group and
+ *           n = out_height * out_width; output->qinfo->zero_point is passed to the GEMM
+ *   kernel: only dims and quantization info are read here; the weights come from
+ *           params->conv_extra.kernel_tm->data (see the reorder function above)
+ *   bias  : bias->data is read as int32_t and offset by g * m for every group
+ *   params: group, strides and pad_top / pad_left are read
+ * Returns CSINN_TRUE.
+ */
+int shl_c908_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                   struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                   struct csinn_conv2d_params *params)
+{
+    int8_t *input_data = (int8_t *)input->data;
+    int8_t *output_data = (int8_t *)output->data;
+    int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data;
+    int32_t *bias_data = (int32_t *)bias->data;
+
+    int32_t group = params->group;
+    int32_t batch = input->dim[0];
+    int32_t in_ch = input->dim[1];
+    int32_t in_height = input->dim[2];
+    int32_t in_width = input->dim[3];
+    int32_t out_ch = kernel->dim[0];
+    int32_t out_height = output->dim[2];
+    int32_t out_width = output->dim[3];
+    int32_t ksize_h = kernel->dim[2];
+    int32_t ksize_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+    int32_t pad_left = params->pad_left;
+    int32_t pad_top = params->pad_top;
+
+    // im2col matrix_col = out_height * out_width
+    // im2col matrix_row = channel_col
+    int channel_col = in_ch / group * ksize_h * ksize_w;
+
+    int32_t m = out_ch / group;
+    int32_t k = channel_col;
+    int32_t n = out_height * out_width;
+    int32_t k4 = (k % 4 != 0) ? ((k / 4 + 1) * 4) : k;  // k rounded up to a multiple of 4
+
+    // Scratch buffers are sized for one group and reused across all (batch, group) pairs.
+    int8_t *im2col_data = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t));
+    int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t));
+
+    int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+    int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+
+    const int vlen = csrr_vlenb() * 8;
+
+    for (int i = 0; i < batch; i++) {
+        for (int g = 0; g < group; g++) {
+            // im2col
+            for (int c = 0; c < channel_col; ++c) {
+                int w_offset = c % ksize_w;
+                int h_offset = c / ksize_w % ksize_h;
+                int c_im = c / ksize_h / ksize_w;
+                for (int h = 0; h < out_height; ++h) {
+                    for (int w = 0; w < out_width; ++w) {
+                        // source coordinates in the unpadded input
+                        int im_row = h_offset + h * stride_h - pad_top;
+                        int im_col = w_offset + w * stride_w - pad_left;
+                        int col_index =
+                            (c * out_height + h) * out_width + w;  // [channel_col, out_h, out_w]
+                        if (im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) {
+                            // outside the image: padding takes the input zero point
+                            im2col_data[col_index] = input->qinfo->zero_point;
+                        } else {
+                            im2col_data[col_index] =
+                                input_data[(c_im * in_height + im_row) * in_width + im_col];
+                        }
+                    }
+                }
+            }
+
+            int8_t *pa = kernel_data + g * m * k4;
+            int8_t *pb = pb_reorder;
+            int8_t *pc = output_data;
+
+            if (kernel->quant_channel > 1) {
+                // Per-channel quantization: this group's channels start at qinfo[g * m].
+                // Indexing from g keeps the lookup inside the out_ch entries for every
+                // batch; a counter running across batches would walk past the end.
+                for (int c = 0; c < m; c++) {
+                    multiplier[c] = kernel->qinfo[g * m + c].multiplier;
+                    shift[c] = kernel->qinfo[g * m + c].shift;
+                }
+            } else if (kernel->quant_channel == 1) {
+                // Per-tensor quantization: every channel shares qinfo[0].
+                for (int c = 0; c < m; c++) {
+                    multiplier[c] = kernel->qinfo[0].multiplier;
+                    shift[c] = kernel->qinfo[0].shift;
+                }
+            }
+
+            // NOTE(review): no GEMM is run when vlen is neither 128 nor 256, so the output
+            // is left untouched while CSINN_TRUE is still returned.
+            if (vlen == 128) {
+                // pack
+                shl_c908_reorder_input_z8_int8(im2col_data, pb, k, n, n);
+                // GEMM
+                shl_c908_gemm_8x8_int8(pc, pa, pb, bias_data + g * m, m, k4, n, n,
+                                       output->qinfo->zero_point, multiplier, shift);
+            } else if (vlen == 256) {
+                // pack
+                shl_c908_reorder_input_z16_int8_v256(im2col_data, pb, k, n, n);
+                // GEMM
+                shl_c908_gemm_8x16_int8_v256(pc, pa, pb, bias_data + g * m, m, k4, n, n,
+                                             output->qinfo->zero_point, multiplier, shift);
+            }
+            input_data += in_ch / group * in_height * in_width;
+            output_data += m * n;
+        }
+    }
+    shl_mem_free(pb_reorder);
+    shl_mem_free(im2col_data);
+    shl_mem_free(multiplier);
+    shl_mem_free(shift);
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_gemm_int8_pack1ton.c b/source/c908_opt/convolution_gemm_int8_pack1ton.c
new file mode 100644
index 00000000..4926085b
--- /dev/null
+++ b/source/c908_opt/convolution_gemm_int8_pack1ton.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/*************************************************************
+ * packn = vlenb / sizeof(int8_t) / 2
+ * maxk = ksize_h * ksize_w
+ * constrain: out_c % packn = 0 and in_ch % packn can != 0
+ * layout: [out_c/packna, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packna]
+ *
+ * Packs the kernel of one group from src into dst. Output channels are handled in
+ * blocks of packn; a trailing partial block of output channels is not written.
+ * Each destination block spans in_c4 * maxk bytes per output channel, where in_c4
+ * is in_c rounded up to a multiple of 4.
+ ************************************************************/
+static void im2col_gemm_reorder_kernel_pack1ton_per_group_int8(int8_t *src, int8_t *dst, int out_c,
+                                                               int in_c, int maxk)
+{
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+    const int vl = vsetvl_e8mf2(packn);
+    int in_c4 = ((in_c - 1) & -4) + 4;  // in_c rounded up to a multiple of 4
+    for (int oc = 0; oc + packn - 1 < out_c; oc += packn) {
+        int8_t *k0 = src + oc * in_c * maxk;
+        int8_t *g0 = dst + oc * in_c4 * maxk;
+
+        int ic = 0;
+        // whole blocks of packn input channels
+        for (; ic + packn - 1 < in_c; ic += packn) {
+            for (int k = 0; k < maxk; k++) {
+                int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * packn;
+
+                for (int p = 0; p < packn / 4; p++) {
+                    int8_t *g2 = g1 + p * 4 * packn;
+                    for (int i = 0; i < 4; i++) {
+                        // Strided load: the weight at (input channel ic + p * 4 + i, tap k)
+                        // from vl consecutive output channels (stride = one output channel).
+                        vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k,
+                                                        in_c * maxk * sizeof(int8_t), vl);
+                        // Strided store with a 4-byte stride; g2++ moves to the next byte
+                        // of each 4-byte cell, so 4 input channels are interleaved per
+                        // output channel.
+                        vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl);
+                        g2++;
+                    }
+                }
+            }
+        }
+        // remaining input channels (fewer than packn)
+        if (ic < in_c) {
+            int tail_c = in_c & (packn - 1);
+            int tail_c4 = in_c & 3;
+            for (int k = 0; k < maxk; k++) {
+                int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * (in_c4 - ic);
+
+                int p = 0;
+                // full groups of 4 input channels inside the tail
+                for (; p + 3 < tail_c; p += 4) {
+                    int8_t *g2 = g1 + p * packn;
+                    for (int i = 0; i < 4; i++) {
+                        vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p + i) * maxk + k,
+                                                        in_c * maxk * sizeof(int8_t), vl);
+                        vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl);
+                        g2++;
+                    }
+                }
+                // Last 1..3 input channels: only tail_c4 bytes of each 4-byte cell are
+                // written. NOTE(review): the remaining bytes keep whatever the
+                // destination buffer already held - confirm shl_mem_alloc zero-fills.
+                if (p < tail_c) {
+                    int8_t *g2 = g1 + p * packn;
+                    for (int i = 0; i < tail_c4; i++) {
+                        vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p + i) * maxk + k,
+                                                        in_c * maxk * sizeof(int8_t), vl);
+                        vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl);
+                        g2++;
+                    }
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Packs the int8 kernel of every group into a newly allocated buffer stored in
+ * params->conv_extra.kernel_tm->data; kernel->data is only read.
+ *   kernel: dim[0] is the total output-channel count, dim[1] the per-group input-channel
+ *           count and dim[2] * dim[3] the number of kernel taps.
+ *   params: params->group is read; params->conv_extra.kernel_tm->data is written.
+ * The buffer is not freed here.
+ */
+void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel,
+                                                            struct csinn_conv2d_params *params)
+{
+    int8_t *kernel_data = (int8_t *)kernel->data;
+    int group = params->group;
+
+    int out_c = kernel->dim[0];
+    int out_cp = out_c / group;  // per-group out channel
+    int in_c = kernel->dim[1];
+    int maxk = kernel->dim[2] * kernel->dim[3];
+    int in_c4 = ((in_c - 1) & -4) + 4;  // align 4 for input_channel
+
+    params->conv_extra.kernel_tm->data =
+        (int8_t *)shl_mem_alloc(out_c * in_c4 * maxk * sizeof(int8_t));
+    int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data;
+
+    for (int g = 0; g < group; g++) {
+        int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk;
+        int8_t *ker_tm_ptr = pa_reorder + g * out_cp * in_c4 * maxk;
+        im2col_gemm_reorder_kernel_pack1ton_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk);
+    }
+}
+
+/*
+ * int8 im2col + GEMM convolution, "pack1ton" variant. For every (batch, group) pair the
+ * input is padded (using input->qinfo->zero_point), expanded by im2col in chunks of vl
+ * channels, reordered, and multiplied with the packed kernel directly into output->data.
+ *   kernel: only dims and quantization info are read here; the weights come from
+ *           params->conv_extra.kernel_tm->data
+ *   bias  : bias->data is read as int32_t and offset by g * m for every group
+ *   params: group, strides and all four pad_* fields are read
+ * Returns CSINN_TRUE.
+ */
+int shl_c908_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                            struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                            struct csinn_conv2d_params *params)
+{
+    int8_t *input_data = (int8_t *)input->data;
+    int8_t *output_data = (int8_t *)output->data;
+    int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data;
+    int32_t *bias_data = (int32_t *)bias->data;
+
+    int32_t group = params->group;
+    int32_t batch = input->dim[0];
+    int32_t in_c = input->dim[1];
+    int32_t in_h = input->dim[2];
+    int32_t in_w = input->dim[3];
+    int32_t out_c = kernel->dim[0];
+    int32_t out_h = output->dim[2];
+    int32_t out_w = output->dim[3];
+    int32_t ksize_h = kernel->dim[2];
+    int32_t ksize_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+
+    int32_t m = out_c / group;
+    int32_t in_cp = in_c / group;
+    int32_t maxk = ksize_h * ksize_w;
+    int32_t n = out_h * out_w;
+
+    // per-output-channel requantization parameters for the current group
+    int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+    int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+
+    for (int i = 0; i < batch; i++) {
+        // j walks kernel->qinfo across the groups and restarts at 0 for every batch
+        for (int g = 0, j = 0; g < group; g++) {
+            // padding
+            int padded_in_hw = (in_h + params->pad_top + params->pad_down) *
+                               (in_w + params->pad_left + params->pad_right);
+            int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t));
+            shl_rvv_pad_input_pack1ton_int8(input_data, input_pad_buf, in_cp, in_h, in_w,
+                                            (in_h + params->pad_top + params->pad_down),
+                                            (in_w + params->pad_left + params->pad_right),
+                                            params->pad_top, params->pad_left,
+                                            input->qinfo->zero_point);
+
+            // im2col
+            const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+            int vl = vsetvl_e8mf2(packn);
+            int in_cp4 = ((in_cp - 1) & -4) + 4;  // in_cp rounded up to a multiple of 4
+
+            // [in_cp4/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_cp4%packn]
+            int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t));
+            // elements (per channel lane) to skip from the end of one output row to the next
+            const int tailstep =
+                ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w);
+
+            const int8_t *img0 = input_pad_buf;
+            int8_t *dst_ptr = im2col_buf;
+
+            // consume the channels in chunks of vl; the last chunk may be shorter
+            int loop_c = in_cp;
+            while (loop_c > 0) {
+                vl = vsetvl_e8mf2(loop_c);
+                int vl4 = ((vl - 1) & -4) + 4;  // vl rounded up to a multiple of 4
+                for (int a = 0; a < ksize_h; a++) {
+                    for (int b = 0; b < ksize_w; b++) {
+                        const int8_t *img1 =
+                            img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl;
+
+                        for (int p = 0; p < out_h; p++) {
+                            for (int q = 0; q < out_w; q++) {
+                                vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl);
+                                img1 += stride_w * vl;
+                                vse8_v_i8mf2(dst_ptr, _tmp, vl);
+                                // vl bytes were written but the pointer advances by vl4.
+                                // NOTE(review): the gap bytes are never written here -
+                                // confirm shl_mem_alloc zero-fills.
+                                dst_ptr += vl4;  // XXX: dst align 4
+                            }
+                            img1 += tailstep * vl;
+                        }
+                    }
+                }
+                img0 += padded_in_hw * vl;
+                // dst_ptr += maxk * out_h * out_w * vl;
+                loop_c -= vl;
+            }
+            shl_mem_free(input_pad_buf);
+
+            if (kernel->quant_channel > 1) {
+                // per-channel quantization: one qinfo entry per output channel
+                for (int c = 0; c < m; c++, j++) {
+                    multiplier[c] = kernel->qinfo[j].multiplier;
+                    shift[c] = kernel->qinfo[j].shift;
+                }
+            } else if (kernel->quant_channel == 1) {
+                // per-tensor quantization: every channel shares qinfo[0]
+                for (int c = 0; c < m; c++) {
+                    multiplier[c] = kernel->qinfo[0].multiplier;
+                    shift[c] = kernel->qinfo[0].shift;
+                }
+            }
+
+            // reorder(pack)
+            int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t));
+            shl_rvv_reorder_input_z12_pack1ton_int8(im2col_buf, reorder_buf, in_cp4, maxk, n, n);
+            shl_mem_free(im2col_buf);
+
+            // gemm
+            int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp4;
+            int32_t *bias_ptr = bias_data + g * m;
+            // shl_rvv_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m,
+            //                                   in_cp4 * maxk, n, n, output->qinfo->zero_point,
+            //                                   multiplier, shift);
+            shl_c908_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m,
+                                               in_cp4 * maxk, n, output->qinfo->zero_point,
+                                               multiplier, shift);
+            shl_mem_free(reorder_buf);
+
+            input_data += in_cp * in_h * in_w;
+            output_data += m * n;
+        }
+    }
+
+    shl_mem_free(multiplier);
+    shl_mem_free(shift);
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_gemm_int8_packn.c b/source/c908_opt/convolution_gemm_int8_packn.c
new file mode 100644
index 00000000..8c274a07
--- /dev/null
+++ b/source/c908_opt/convolution_gemm_int8_packn.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/*************************************************************
+ * packn = vlenb / sizeof(int8_t) / 2
+ * maxk = ksize_h * ksize_w
+ * constraint: out_c % packn = 0 and in_ch % packn = 0
+ * layout: [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4]
+ * The dot-product variant is assumed by default; without dot support the data layout differs
+ ************************************************************/
+static void im2col_gemm_reorder_kernel_packn_per_group_int8(int8_t *src, int8_t *dst, int out_c,
+                                                            int in_c, int maxk)
+{
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+    const int vl = vsetvl_e8mf2(packn);
+
+    // [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4b]
+    for (int oc = 0; oc + packn - 1 < out_c; oc += packn) {
+        int8_t *k0 = src + oc * in_c * maxk;
+        int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4;
+
+        for (int ic = 0; ic + packn - 1 < in_c; ic += packn) {
+            for (int k = 0; k < maxk; k++) {
+                int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * packn;
+
+                for (int p = 0; p < packn / 4; p++) {
+                    int8_t *g2 = g1 + p * 4 * packn;
+                    for (int i = 0; i < 4; i++) {
+                        vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k,
+                                                        in_c * maxk * sizeof(int8_t), vl);
+                        vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl);
+                        g2++;
+                    }
+                }
+            }
+        }
+    }
+}
+
+/* Reorder every group of `kernel` with the per-group routine above into a newly allocated
+ * buffer stored in params->conv_extra.kernel_tm->data; kernel->data is only read. */
+void shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel,
+                                                         struct csinn_conv2d_params *params)
+{
+    int8_t *kernel_data = (int8_t *)kernel->data;
+    int group = params->group;
+
+    int out_c = kernel->dim[0];
+    int out_cp = out_c / group;  // per-group out channel
+    int in_c = kernel->dim[1];
+    int maxk = kernel->dim[2] * kernel->dim[3];
+
+    params->conv_extra.kernel_tm->data =
+        (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t));
+
+    for (int g = 0; g < group; g++) {
+        int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk;
+        int8_t *ker_tm_ptr = params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk;
+        im2col_gemm_reorder_kernel_packn_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk);
+    }
+
+    // FIXME: params->conv_extra.kernel_tm->data is never freed in this file
+}
+
+/* int8 conv2d, per batch and per group: pad input -> im2col -> z12 reorder -> 12xpackn GEMM.
+ * Reads the reordered kernel from params->conv_extra.kernel_tm->data; the GEMM stores directly
+ * into output->data, so no intermediate output buffer is allocated. Always returns CSINN_TRUE. */
+int shl_c908_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                         struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                         struct csinn_conv2d_params *params)
+{
+    int8_t *input_data = (int8_t *)input->data;
+    int8_t *output_data = (int8_t *)output->data;
+    int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data;
+    int32_t *bias_data = (int32_t *)bias->data;
+
+    int32_t group = params->group;
+    int32_t batch = input->dim[0];
+    int32_t in_c = input->dim[1];
+    int32_t in_h = input->dim[2];
+    int32_t in_w = input->dim[3];
+    int32_t out_c = kernel->dim[0];
+    int32_t out_h = output->dim[2];
+    int32_t out_w = output->dim[3];
+    int32_t ksize_h = kernel->dim[2];
+    int32_t ksize_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+
+    int32_t m = out_c / group;
+    int32_t in_cp = in_c / group;
+    int32_t maxk = ksize_h * ksize_w;
+    int32_t n = out_h * out_w;
+
+    int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+    int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+
+    for (int i = 0; i < batch; i++) {
+        for (int g = 0, j = 0; g < group; g++) {
+            // padding
+            int padded_in_h = in_h + params->pad_top + params->pad_down;
+            int padded_in_w = in_w + params->pad_left + params->pad_right;
+            int padded_in_hw = padded_in_w * padded_in_h;
+            int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t));
+            shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h,
+                                         padded_in_w, params->pad_top, params->pad_left,
+                                         input->qinfo->zero_point);
+
+            // im2col
+            const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+            const int vl = vsetvl_e8mf2(packn);
+
+            // [in_c/packn, maxk, out_h, out_w, packn]
+            int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w *
+                                                         packn * sizeof(int8_t));
+            const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn;
+
+            for (int c = 0; c + packn - 1 < in_cp; c += packn) {
+                const int8_t *img0 = input_pad_buf + c * padded_in_hw;
+                int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w;
+
+                for (int a = 0; a < ksize_h; a++) {
+                    for (int b = 0; b < ksize_w; b++) {
+                        const int8_t *img1 = img0 + a * padded_in_w * packn + b * packn;
+
+                        for (int p = 0; p < out_h; p++) {
+                            for (int q = 0; q < out_w; q++) {
+                                vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl);
+                                img1 += stride_w * packn;
+                                vse8_v_i8mf2(dst_ptr, _tmp, vl);
+                                dst_ptr += packn;
+                            }
+                            img1 += tailstep;
+                        }
+                    }
+                }
+            }
+            shl_mem_free(input_pad_buf);
+
+            if (kernel->quant_channel > 1) {
+                for (int c = 0; c < m; c++, j++) {
+                    multiplier[c] = kernel->qinfo[j].multiplier;
+                    shift[c] = kernel->qinfo[j].shift;
+                }
+            } else if (kernel->quant_channel == 1) {
+                for (int c = 0; c < m; c++) {
+                    multiplier[c] = kernel->qinfo[0].multiplier;
+                    shift[c] = kernel->qinfo[0].shift;
+                }
+            }
+
+            // reorder(pack)
+            int8_t *reorder_buf =
+                (int8_t *)shl_mem_alloc(in_cp * maxk * out_h * out_w * sizeof(int8_t));
+            shl_rvv_reorder_input_z12_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n);
+            shl_mem_free(im2col_buf);
+
+            // gemm
+            int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp;
+            int32_t *bias_ptr = bias_data + g * m;  // bias_data != NULL with fusing zp to bias
+            shl_c908_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m,
+                                               in_cp * maxk, n, output->qinfo->zero_point,
+                                               multiplier, shift);
+
+            shl_mem_free(reorder_buf);
+
+            input_data += in_cp * in_h * in_w;
+            output_data += m * n;
+        }
+    }
+    shl_mem_free(multiplier);
+    shl_mem_free(shift);
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/convolution_gemm_int8_packnto1.c b/source/c908_opt/convolution_gemm_int8_packnto1.c
new file mode 100644
index 00000000..36be05cc
--- /dev/null
+++ b/source/c908_opt/convolution_gemm_int8_packnto1.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/*************************************************************
+ * packn = vlenb / sizeof(int8_t) / 2
+ * maxk = ksize_h * ksize_w
+ * constraint: out_c % packn != 0 and in_ch % packn = 0
+ * layout: [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4]
+ *         [out_c/tail, in_c/packnb, maxk, packnb/4, tail, 4]
+ * The dot-product variant is assumed by default; without dot support the data layout differs
+ ************************************************************/
+static void im2col_gemm_reorder_kernel_packnto1_per_group_int8(int8_t *src, int8_t *dst, int out_c,
+                                                               int in_c, int maxk)
+{
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+    int vl = vsetvl_e8mf2(packn);
+
+    // [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4b]
+    int oc = 0;
+    for (; oc + packn - 1 < out_c; oc += packn) {
+        int8_t *k0 = src + oc * in_c * maxk;
+        int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4;
+
+        for (int ic = 0; ic + packn - 1 < in_c; ic += packn) {
+            for (int k = 0; k < maxk; k++) {
+                int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * packn;
+
+                for (int p = 0; p < packn / 4; p++) {
+                    int8_t *g2 = g1 + p * 4 * packn;
+                    for (int i = 0; i < 4; i++) {
+                        vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k,
+                                                        in_c * maxk * sizeof(int8_t), vl);
+                        vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl);
+                        g2++;
+                    }
+                }
+            }
+        }
+    }
+    // [out_c/tail, in_c/packnb, maxk, packnb/4, tail, 4]
+    if (oc < out_c) {
+        vl = vsetvl_e8mf2(out_c - oc);
+        int8_t *k0 = src + oc * in_c * maxk;
+        int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4;
+
+        for (int ic = 0; ic + packn - 1 < in_c; ic += packn) {
+            for (int k = 0; k < maxk; k++) {
+                int8_t *g1 = g0 + (ic * maxk) * vl + k * packn * vl;
+
+                for (int p = 0; p < packn / 4; p++) {
+                    int8_t *g2 = g1 + p * 4 * vl;
+                    for (int i = 0; i < 4; i++) {
+                        vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k,
+                                                        in_c * maxk * sizeof(int8_t), vl);
+                        vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl);
+                        g2++;
+                    }
+                }
+            }
+        }
+    }
+}
+
+/* Reorder every group of `kernel` with the per-group routine above into a newly allocated
+ * buffer stored in params->conv_extra.kernel_tm->data; kernel->data is only read. */
+void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel,
+                                                            struct csinn_conv2d_params *params)
+{
+    int8_t *kernel_data = (int8_t *)kernel->data;
+    int group = params->group;
+
+    int out_c = kernel->dim[0];
+    int out_cp = out_c / group;  // per-group out channel
+    int in_c = kernel->dim[1];
+    int maxk = kernel->dim[2] * kernel->dim[3];
+
+    params->conv_extra.kernel_tm->data =
+        (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t));
+
+    for (int g = 0; g < group; g++) {
+        int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk;
+        int8_t *ker_tm_ptr = params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk;
+        im2col_gemm_reorder_kernel_packnto1_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk);
+    }
+
+    // FIXME: params->conv_extra.kernel_tm->data is never freed in this file
+}
+
+/* int8 conv2d, per batch and per group: pad input -> im2col -> z12 reorder -> 12xpackn GEMM into
+ * the output_ncxhwx scratch buffer, then shl_rvv_reorder_input_packnto1_int8 writes the
+ * reordered result to output->data. Always returns CSINN_TRUE. */
+int shl_c908_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                            struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                            struct csinn_conv2d_params *params)
+{
+    int8_t *input_data = (int8_t *)input->data;
+    int8_t *output_data = (int8_t *)output->data;
+    int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data;
+    int32_t *bias_data = (int32_t *)bias->data;
+
+    int32_t group = params->group;
+    int32_t batch = input->dim[0];
+    int32_t in_c = input->dim[1];
+    int32_t in_h = input->dim[2];
+    int32_t in_w = input->dim[3];
+    int32_t out_c = kernel->dim[0];
+    int32_t out_h = output->dim[2];
+    int32_t out_w = output->dim[3];
+    int32_t ksize_h = kernel->dim[2];
+    int32_t ksize_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+
+    int32_t m = out_c / group;
+    int32_t in_cp = in_c / group;
+    int32_t maxk = ksize_h * ksize_w;
+    int32_t n = out_h * out_w;
+
+    int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t));
+
+    int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+    int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t));
+
+    for (int i = 0; i < batch; i++) {
+        for (int g = 0, j = 0; g < group; g++) {
+            // padding
+            int padded_in_hw = (in_h + params->pad_top + params->pad_down) *
+                               (in_w + params->pad_left + params->pad_right);
+            int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t));
+            shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, in_w,
+                                         (in_h + params->pad_top + params->pad_down),
+                                         (in_w + params->pad_left + params->pad_right),
+                                         params->pad_top, params->pad_left,
+                                         input->qinfo->zero_point);
+
+            // im2col
+            const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+            const int vl = vsetvl_e8mf2(packn);
+
+            // [in_c/packn, maxk, out_h, out_w, packn]
+            int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w *
+                                                         packn * sizeof(int8_t));
+            const int tailstep =
+                ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) *
+                packn;
+
+            for (int c = 0; c + packn - 1 < in_cp; c += packn) {
+                const int8_t *img0 = input_pad_buf + c * padded_in_hw;
+                int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w;
+
+                for (int a = 0; a < ksize_h; a++) {
+                    for (int b = 0; b < ksize_w; b++) {
+                        const int8_t *img1 =
+                            img0 + a * (in_w + params->pad_left + params->pad_right) * packn +
+                            b * packn;
+
+                        for (int p = 0; p < out_h; p++) {
+                            for (int q = 0; q < out_w; q++) {
+                                vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl);
+                                img1 += stride_w * packn;
+                                vse8_v_i8mf2(dst_ptr, _tmp, vl);
+                                dst_ptr += packn;
+                            }
+                            img1 += tailstep;
+                        }
+                    }
+                }
+            }
+            shl_mem_free(input_pad_buf);
+
+            if (kernel->quant_channel > 1) {
+                for (int c = 0; c < m; c++, j++) {
+                    multiplier[c] = kernel->qinfo[j].multiplier;
+                    shift[c] = kernel->qinfo[j].shift;
+                }
+            } else if (kernel->quant_channel == 1) {
+                for (int c = 0; c < m; c++) {
+                    multiplier[c] = kernel->qinfo[0].multiplier;
+                    shift[c] = kernel->qinfo[0].shift;
+                }
+            }
+
+            // reorder(pack)
+            int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t));
+            shl_rvv_reorder_input_z12_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n);
+            shl_mem_free(im2col_buf);
+
+            // gemm
+            int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp;
+            int32_t *bias_ptr = bias_data + g * m;  // bias_data != NULL with fusing zp to bias
+            shl_c908_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m,
+                                               in_cp * maxk, n, output->qinfo->zero_point,
+                                               multiplier, shift);
+
+            shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w);
+            shl_mem_free(reorder_buf);
+
+            input_data += in_cp * in_h * in_w;
+            output_data += m * n;
+        }
+    }
+    shl_mem_free(multiplier);
+    shl_mem_free(shift);
+    shl_mem_free(output_ncxhwx);
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/depthwise_convolution.c b/source/c908_opt/depthwise_convolution.c
new file mode 100644
index 00000000..0ed523fb
--- /dev/null
+++ b/source/c908_opt/depthwise_convolution.c
@@ -0,0 +1,209
@@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/* Select the fp32 depthwise conv2d exec callback from channel divisibility by packn and the
+ * kernel size / stride; packn 3x3 paths also reorder the kernel. Always returns CSINN_TRUE. */
+int shl_c908_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                                        struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                        struct csinn_conv2d_params *params)
+{
+    int32_t in_c = input->dim[1];
+    int32_t out_c = output->dim[1];
+    int32_t kernel_h = kernel->dim[2];
+    int32_t kernel_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+    struct csinn_callback *cb = params->base.cb;
+
+    const int packn = csrr_vlenb() / sizeof(float);
+
+    if (in_c % packn == 0 && out_c % packn == 0) {
+        if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) {
+            shl_rvv_dwconv_reorder_kernel_packn_fp32(kernel, params);
+            cb->exec = shl_rvv_dwconv3x3s1_packn_fp32;
+        } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) {
+            shl_rvv_dwconv_reorder_kernel_packn_fp32(kernel, params);
+            cb->exec = shl_rvv_dwconv3x3s2_packn_fp32;
+        } else {
+            cb->exec = shl_ref_depthwise_conv2d_f32;
+        }
+    } else if (in_c % packn != 0 && out_c % packn != 0) {
+        if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) {
+            cb->exec = shl_rvv_dwconv3x3s1_fp32;
+        } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) {
+            cb->exec = shl_rvv_dwconv3x3s2_fp32;
+        } else {
+            cb->exec = shl_ref_depthwise_conv2d_f32;
+        }
+    } else {
+        cb->exec = shl_ref_depthwise_conv2d_f32;  // only one of in_c/out_c is a packn multiple
+    }
+    return CSINN_TRUE;
+}
+
+/* Select the fp16 depthwise conv2d exec callback from channel divisibility by packn and the
+ * kernel size / stride; packn 3x3 paths also reorder the kernel. Always returns CSINN_TRUE. */
+int shl_c908_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
+                                        struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                        struct csinn_conv2d_params *params)
+{
+    int32_t in_c = input->dim[1];
+    int32_t out_c = output->dim[1];
+    int32_t kernel_h = kernel->dim[2];
+    int32_t kernel_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+    struct csinn_callback *cb = params->base.cb;
+
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+
+    if (in_c % packn == 0 && out_c % packn == 0) {
+        if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) {
+            shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params);
+            cb->exec = shl_rvv_dwconv3x3s1_packn_fp16;
+        } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) {
+            shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params);
+            cb->exec = shl_rvv_dwconv3x3s2_packn_fp16;
+        } else {
+            cb->exec = shl_ref_depthwise_conv2d_quant;
+        }
+    } else if (in_c % packn != 0 && out_c % packn != 0) {
+        if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) {
+            cb->exec = shl_rvv_dwconv3x3s1_fp16;
+        } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) {
+            cb->exec = shl_rvv_dwconv3x3s2_fp16;
+        } else {
+            cb->exec = shl_ref_depthwise_conv2d_quant;
+        }
+    } else {
+        cb->exec = shl_ref_depthwise_conv2d_quant;  // only one of in_c/out_c is a packn multiple
+    }
+    return CSINN_TRUE;
+}
+
+/* int8 init: fold input zero-point into bias, pick exec callback, fill qinfo multiplier/shift. */
+int shl_c908_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                        struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                        struct csinn_conv2d_params *params)
+{
+    int32_t in_c = input->dim[1];
+    int32_t out_c = output->dim[1];
+    int32_t kernel_h = kernel->dim[2];
+    int32_t kernel_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+    struct csinn_callback *cb = params->base.cb;
+
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+
+    // enable fuse zeropoint to bias
+    if (!params->conv_extra.fuse_zp2bias) {
+        int32_t *bias_data = (int32_t *)bias->data;
+        int8_t *kernel_data = (int8_t *)kernel->data;
+        int32_t input_zp = input->qinfo->zero_point;
+
+        if (bias_data == NULL) {
+            // XXX: never freed; relies on shl_mem_alloc zero-filling (TODO confirm) for -= below
+            bias_data = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t));
+            bias->data = bias_data;
+        }
+        int kernel_inner = 1 * kernel_h * kernel_w;
+        for (int oc = 0; oc < out_c; oc++) {
+            int32_t tmp = 0;
+            for (int j = 0; j < kernel_inner; j++) {
+                tmp += kernel_data[oc * kernel_inner + j] * input_zp;
+            }
+            bias_data[oc] -= tmp;
+        }
+    }
+
+    if (in_c % packn == 0 && out_c % packn == 0) {
+        if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) {
+            shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params);
+            cb->exec = shl_rvv_dwconv3x3s1_packn_int8;
+        } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) {
+            shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params);
+            cb->exec = shl_rvv_dwconv3x3s2_packn_int8;
+        } else {
+            cb->exec = shl_ref_depthwise_conv2d_quant;
+        }
+    } else if (in_c % packn != 0 && out_c % packn != 0) {
+        if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) {
+            cb->exec = shl_rvv_dwconv3x3s1_int8;
+        } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) {
+            cb->exec = shl_rvv_dwconv3x3s2_int8;
+        } else {
+            cb->exec = shl_ref_depthwise_conv2d_quant;
+        }
+    } else {
+        cb->exec = shl_ref_depthwise_conv2d_quant;  // only one of in_c/out_c is a packn multiple
+    }
+
+    // support channel quantization
+    for (int i = 0; i < kernel->quant_channel; i++) {
+        float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale;
+        shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier),
+                                &(kernel->qinfo[i].shift));
+    }
+    return CSINN_TRUE;
+}
+
+/*
+ * int4 depthwise conv2d init. Only CSINN_LAYOUT_NHWC input with a 3x3 kernel and stride 1 or 2
+ * is handled: the matching shl_rvv_dwconv3x3s{1,2}_int4 callback is installed and every
+ * kernel->qinfo[i] gets its multiplier/shift from the input/kernel/output scales.
+ *
+ * Returns CSINN_TRUE on success, CSINN_FALSE for any other layout, kernel size or stride
+ * (previously an unsupported kernel/stride returned CSINN_TRUE with cb->exec left unset).
+ */
+int shl_c908_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output,
+                                        struct csinn_tensor *kernel, struct csinn_tensor *bias,
+                                        struct csinn_conv2d_params *params)
+{
+    int32_t kernel_h = kernel->dim[2];
+    int32_t kernel_w = kernel->dim[3];
+    int32_t stride_h = params->stride_height;
+    int32_t stride_w = params->stride_width;
+    struct csinn_callback *cb = params->base.cb;
+
+    // xxx: only int4 support nhwc layout now
+    if (input->layout == CSINN_LAYOUT_NHWC) {
+        kernel_h = kernel->dim[1];
+        kernel_w = kernel->dim[2];
+        if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) {
+            cb->exec = shl_rvv_dwconv3x3s1_int4;
+        } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) {
+            cb->exec = shl_rvv_dwconv3x3s2_int4;
+        } else {
+            // no int4 kernel for this size/stride: report it instead of leaving cb->exec unset
+            return CSINN_FALSE;
+        }
+        // support channel quantization
+        for (int i = 0; i < kernel->quant_channel; i++) {
+            float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale;
+            shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier),
+                                    &(kernel->qinfo[i].shift));
+        }
+        return CSINN_TRUE;
+    }
+    return CSINN_FALSE;
+}
diff --git a/source/c908_opt/fullyconnected.c b/source/c908_opt/fullyconnected.c
new file mode 100644
index 00000000..e663aea1
--- /dev/null
+++ b/source/c908_opt/fullyconnected.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/* Pick the fully-connected exec callback from input->dtype; optimized paths transform weights.
+ * INT8 additionally folds the input zero-point into bias (allocating bias->data when it is NULL)
+ * and INT8/INT4 fill weights->qinfo[i].multiplier/shift. Always returns CSINN_TRUE, including
+ * for dtypes that match none of the branches (cb->exec is then left untouched). */
+int shl_c908_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output,
+                                 struct csinn_tensor *weights, struct csinn_tensor *bias,
+                                 struct csinn_fc_params *params)
+{
+    const int weights_dims_count = weights->dim_count;
+    const int out_nodes = weights->dim[weights_dims_count - 2];
+    const int in_nodes = weights->dim[weights_dims_count - 1];
+    struct csinn_callback *cb = params->base.cb;
+    if (input->dtype == CSINN_DTYPE_FLOAT32) {
+        shl_rvv_fc_gemv_transform_weight_fp32(weights);
+        cb->exec = shl_rvv_fullyconnected_packn_fp32;
+    } else if (input->dtype == CSINN_DTYPE_FLOAT16) {
+        shl_rvv_fc_gemv_transform_weight_fp16(weights);
+        cb->exec = shl_rvv_fullyconnected_packn_fp16;
+    } else if (input->dtype == CSINN_DTYPE_INT8) {
+        // enable fuse zeropoint to bias
+        if (!params->fc_extra.fuse_zp2bias) {
+            int32_t *bias_data = (int32_t *)bias->data;
+            int8_t *weights_data = (int8_t *)weights->data;
+            int32_t input_zp = input->qinfo->zero_point;
+
+            if (bias_data == NULL) {
+                // XXX: never freed; assumes shl_mem_alloc zero-fills (TODO confirm)
+                bias_data = (int32_t *)shl_mem_alloc(out_nodes * sizeof(int32_t));
+                bias->data = bias_data;
+            }
+            for (int oc = 0; oc < out_nodes; oc++) {
+                int32_t tmp = 0;
+                for (int j = 0; j < in_nodes; j++) {
+                    tmp += weights_data[oc * in_nodes + j] * input_zp;
+                }
+                bias_data[oc] -= tmp;
+            }
+        }
+
+        // support channel quantization
+        for (int i = 0; i < weights->quant_channel; i++) {
+            float real_scale = input->qinfo->scale * weights->qinfo[i].scale / output->qinfo->scale;
+            shl_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier),
+                                    &(weights->qinfo[i].shift));
+        }
+        if (in_nodes % 4 == 0) {
+            shl_rvv_fc_gemv_transform_weight_int8_dot(weights);
+            cb->exec = shl_rvv_fullyconnected_packn_int8_dot;
+        } else {
+            shl_rvv_fc_gemv_transform_weight_int8(weights);
+            cb->exec = shl_rvv_fullyconnected_packn_int8;
+        }
+    } else if (input->dtype == CSINN_DTYPE_INT4) {
+        // support channel quantization
+        for (int i = 0; i < weights->quant_channel; i++) {
+            float real_scale = input->qinfo->scale * weights->qinfo[i].scale / output->qinfo->scale;
+            shl_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier),
+                                    &(weights->qinfo[i].shift));
+        }
+        if (in_nodes % 8 == 0) {
+            shl_rvv_fc_gemv_transform_weight_int4_dot(weights);
+            cb->exec = shl_rvv_fullyconnected_packn_int4_dot;
+        } else {
+            shl_debug_warning("fc is not optimized for int4, call reference func replaced.\n");
+            cb->exec = shl_ref_fullyconnected_quant;
+        }
+    }
+    return CSINN_TRUE;
+}
diff --git a/source/c908_opt/gemm_fp16.c b/source/c908_opt/gemm_fp16.c
new file mode 100644
index 00000000..a6c2a4d4
--- /dev/null
+++ b/source/c908_opt/gemm_fp16.c
@@ -0,0 +1,3679 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 128 + * VS kernel 12 x 16 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +static inline void kernel_m8n24_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n24 + "remw t2, %[n], a0\n\t" // t2 = n % 24 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + // m8 + "1:\n\t" + "li s1, 8\n\t" + "vsetvli zero, s1, e16, m1\n\t" // set vl = 8 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n24 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n24==0, jump to m8n16 + // m8n24 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + "vfmv.v.f v20, fs4\n\t" + "vfmv.v.f v21, fs4\n\t" + "vfmv.v.f v22, fs4\n\t" + "vfmv.v.f v23, fs5\n\t" + "vfmv.v.f v24, fs5\n\t" + "vfmv.v.f v25, fs5\n\t" + 
"vfmv.v.f v26, fs6\n\t" + "vfmv.v.f v27, fs6\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs7\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v3, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n24k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n24k2 + "3:\n\t" + + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v6, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v 
v3, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v20, fa4, v4\n\t" + "vfmacc.vf v21, fa4, v5\n\t" + "vfmacc.vf v22, fa4, v6\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v23, fa5, v4\n\t" + "vfmacc.vf v24, fa5, v5\n\t" + "vfmacc.vf v25, fa5, v6\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v26, fa6, v4\n\t" + "vfmacc.vf v27, fa6, v5\n\t" + "vfmacc.vf v28, fa6, v6\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v29, fa7, v4\n\t" + "vfmacc.vf v30, fa7, v5\n\t" + "vfmacc.vf v31, fa7, v6\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n24k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n24 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + + "addi s3, s3, 48\n\t" // ******************** + + // end kernel_m8n24 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset 
pre-load + "addi s3, s3, -48\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v14, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v17, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v20, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v23, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v26, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v29, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v15, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v18, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v21, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v24, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v27, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v16, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v22, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v25, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n16 + "6:\n\t" + "andi s1, t2, 16\n\t" // s1 = bool_n16 + "beqz s1, 10f\n\t" // if n16==0, jump to m8n8 + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 
16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, 
fa7, v5\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 16 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v25, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v27, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v29, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n8 + "10:\n\t" + "andi s1, t2, 8\n\t" // s1 = bool_n8 + "beqz s1, 14f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + 
"vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n4k2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n8k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // 
end kernel_m8n8 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 8 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "14:\n\t" + "andi s1, t2, 7\n\t" // s1 = bool_n_tail + "beqz s1, 18f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e16, m1\n\t" // set vl = n_tail + "slli t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "15:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 
elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m8n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. 
+ [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +// Computes a 4-row strip of the fp16 GEMM result: each output row r (0..3) starts from bias[r] and accumulates sa * sb over k. +// Columns are handled in tiles of 24 (n / 24 times), then 16 and/or 8 according to the bits of n % 24, then an n % 8 tail with vl = n_tail. +// sa is read as 4 consecutive halfs per k step; sb is read sequentially, tile after tile; output rows are ldc elements apart. +// m is not referenced by the asm. NOTE(review): every k2 iteration pre-loads the next sa/sb values before testing the loop counter, +// so for even k the last pre-load reads past the data this tile uses - confirm the packed buffers tolerate that. +static inline void kernel_m4n24_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n / 24 (count of 24-column tiles) + "remw t2, %[n], a0\n\t" // t2 = n % 24 (remaining columns) + "srai t3, %[k], 1\n\t" // t3 = k2 (k >> 1) + "andi t4, %[k], 1\n\t" // t4 = k1 (k & 1) + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n24==0, jump to m4n16 + // m4n24 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" +
"vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n24k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n24k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n24k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n24 +
+ "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m4n24 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v14, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v17, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v15, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v18, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v16, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n16 + "6:\n\t" + "andi t1, t2, 16\n\t" // t1 = bool_n16 + "beqz t1, 10f\n\t" // if n16==0, jump to m4n8 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs2\n\t" + "vfmv.v.f v13, fs2\n\t" + "vfmv.v.f v14, fs3\n\t" + "vfmv.v.f v15, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3,
8f\n\t" // if k2 == 0, jump to m4n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n16k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v12, fa2, v4\n\t" + "vfmacc.vf v13, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v14, fa3, v4\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n16 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 16 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v10, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v12, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v14, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse16.v v9,
(a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v13, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v15, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n8 + "10:\n\t" + "andi t1, t2, 8\n\t" // t1 = bool_n8 + "beqz t1, 14f\n\t" // if n8==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n8k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n8 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 8 + +
"vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v9, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v10, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v11, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // t1 = n_tail (low 3 bits of n % 24) + "beqz t1, 18f\n\t" // if n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail (bytes per tail step) + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n_tail + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "17:\n\t" +
// ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v10, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v11, (a3)\n\t" + "add a3, a3, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +// Computes a 2-row strip of the fp16 GEMM result: each output row r (0..1) starts from bias[r] and accumulates sa * sb over k. +// Columns are handled in tiles of 24 (n / 24 times), then 16 and/or 8 according to the bits of n % 24, then an n % 8 tail with vl = n_tail. +// sa is read as 2 consecutive halfs per k step; sb is read sequentially, tile after tile; the two output rows are ldc elements apart. +// m is not referenced by the asm. NOTE(review): every k2 iteration pre-loads the next sa/sb values before testing the loop counter, +// so for even k the last pre-load reads past the data this tile uses - confirm the packed buffers tolerate that. +static inline void kernel_m2n24_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n / 24 (count of 24-column tiles) + "remw t2, %[n], a0\n\t" // t2 = n % 24 (remaining columns) + "srai t3, %[k], 1\n\t" // t3 = k2 (k >> 1) + "andi t4, %[k], 1\n\t" // t4 = k1 (k & 1) + + // m2 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 2 bias_data for 2 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n24==0, jump to m2n16 + // m2n24 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr],
16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n24k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n24k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n24k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n24 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m2n24 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi
a1, a1, 16\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n16 + "6:\n\t" + "andi t1, t2, 16\n\t" // t1 = bool_n16 + "beqz t1, 10f\n\t" // if n16==0, jump to m2n8 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n16k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n16 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** +
+ // end kernel_m2n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 16 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v10, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m2n8 + "10:\n\t" + "andi t1, t2, 8\n\t" // t1 = bool_n8 + "beqz t1, 14f\n\t" // if n8==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n8k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n8k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m2n8 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v9, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m2n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // t1 = n_tail (low 3 bits of n % 24) + "beqz t1, 18f\n\t" // if
bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail (bytes per tail step) + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m2n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m2n_tail + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m2n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", + // We use these general-purpose registers.
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n24_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + "beqz t1, 6f\n\t" // if n12==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, 
v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 
9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f 
v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + **************************************************************/ +void shl_c908_gemm_8x24_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc) +{ + __fp16 *kernel_ptr = (__fp16 *)sa; + __fp16 *input_ptr = (__fp16 *)sb; + __fp16 *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * 2); + } + __fp16 *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n24_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n24_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n24_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n24_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +static inline void kernel_m8n16_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n16 + "andi t2, %[n], 15\n\t" // t2 = n & 15u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "li s1, 8\n\t" + "vsetvli zero, s1, e16, m1\n\t" // set vl = 8 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 
2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n16 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n16==0, jump to m8n8 + // m8n16 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "3:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf 
v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n16k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + 
// ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v25, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v27, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v29, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n8 + "6:\n\t" + "andi s1, t2, 8\n\t" // s1 = n8 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + 
"vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 8 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 7\n\t" // s1 = bool_n_tail + "beqz s1, 14f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e16, m1\n\t" // set vl = n_tail + "slli 
t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, 
ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m8n16_fp16_1(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n16 + "andi t2, %[n], 15\n\t" // t2 = n & 15u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n16 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n16==0, jump to m8n8 + // m8n16 + "2:\n\t" + "li s2, 16\n\t" + "vsetvli zero, s2, e16, m2\n\t" // set vl = 8 + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh 
ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "3:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v2\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v22, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n16k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v22, 
(a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n8 + "6:\n\t" + "li s1, 8\n\t" + "vsetvli zero, s1, e16, m1\n\t" // set vl = 8 + "andi s1, t2, 8\n\t" // s1 = n8 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + 
"vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 8 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 7\n\t" // s1 = bool_n_tail + "beqz a1, 14f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e16, m1\n\t" // set vl = n_tail + "slli t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh 
ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, 
(a4)\n\t" + "add a4, a4, t6\n\t" + "vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n16_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n8 + "andi t2, %[n], 15\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + 
"beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + 
"vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi 
%[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v17, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v18, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flh fa3, 
14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v18, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v19, (a3)\n\t" + "add a3, a3, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n16_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n8 + "andi t2, %[n], 15\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, 
fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump 
to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v17, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + 
"add a0, a0, t6\n\t" + "vse16.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n16_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n8 + "andi t2, %[n], 15\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + 
"addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], 
-16\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", + // We use these general-purpose registers. 
+        "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0");
+}
+
+/**************************************************************
+ * fp16 GEMM: dst[m, n] = sa[m, k] * sb[k, n] + bias[m] (one bias value per output row)
+ * dst  - output: [m, n], consecutive rows are ldc elements apart
+ * sa   - kernel: [m, k];  sb - input: [k, n]
+ * bias - per-row bias; if NULL a temporary buffer is allocated and freed here
+ * Rows are dispatched to the m8 / m4 / m2 / m1 kernels in that order.
+ **************************************************************/
+void shl_c908_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m,
+                             int k, int n, int ldc)
+{
+    __fp16 *kernel_ptr = (__fp16 *)sa;
+    __fp16 *input_ptr = (__fp16 *)sb;
+    __fp16 *output_ptr = dst;
+
+    bool flag_bias = true;  // false when the bias buffer is owned by this call
+    if (bias == NULL) {
+        flag_bias = false;
+        // NOTE(review): assumes shl_mem_alloc returns zero-filled memory - confirm
+        bias = (__fp16 *)shl_mem_alloc(m * sizeof(__fp16));
+    }
+    __fp16 *bias_ptr = bias;
+
+    int tail = m % 8;
+    if (m >= 8) {  // '>=' so that m == 8 is not skipped (tail is 0 in that case)
+        kernel_m8n16_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr);
+        output_ptr += (m - tail) * ldc;  // rows are ldc apart, as inside the kernels
+        kernel_ptr += (m - tail) * k;
+        bias_ptr += (m - tail);
+    }
+    if (tail & 4) {
+        kernel_m4n16_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr);
+        output_ptr += 4 * ldc;
+        kernel_ptr += 4 * k;
+        bias_ptr += 4;
+    }
+    if (tail & 2) {
+        kernel_m2n16_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr);
+        output_ptr += 2 * ldc;
+        kernel_ptr += 2 * k;
+        bias_ptr += 2;
+    }
+    if (tail & 1) {
+        kernel_m1n16_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr);
+    }
+    if (!flag_bias) {
+        shl_mem_free(bias);
+        bias = NULL;
+    }
+}
diff --git a/source/c908_opt/gemm_fp16_packn.c b/source/c908_opt/gemm_fp16_packn.c
new file mode 100644
index 00000000..855bcfc3
--- /dev/null
+++ b/source/c908_opt/gemm_fp16_packn.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+void gemm_fp16_ncxhwx_12xpack2n(__fp16 *output, const __fp16 *kernel, const __fp16 *input,
+                                const __fp16 *bias, int m, int k, int n, bool fuse_relu);
+void gemm_fp16_ncxhwx_12xpackn(__fp16 *output, const __fp16 *kernel, const __fp16 *input,
+                               const __fp16 *bias, int m, int k, int n, bool fuse_relu);
+
+/* Walks the m output channels in groups of 2 * lanes, then lanes, then one partial group.
+ * NOTE(review): the pack2n kernel is given lanes (not 2 * lanes) as its m argument - confirm. */
+void shl_c908_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb,
+                                         const __fp16 *bias, int m, int k, int n, bool fuse_relu)
+{
+    const int lanes = csrr_vlenb() / sizeof(__fp16);  // presumably fp16 lanes per vector register
+    const int lanes2 = lanes * 2;
+    int done = 0;  // output channels already dispatched
+
+    while (done + lanes2 <= m) {  // full groups of 2 * lanes channels
+        gemm_fp16_ncxhwx_12xpack2n(dst, sa, sb, bias, lanes, k, n, fuse_relu);
+        sa += lanes2 * k;
+        dst += lanes2 * n;
+        bias = bias ? bias + lanes2 : NULL;
+        done += lanes2;
+    }
+    while (done + lanes <= m) {  // full groups of lanes channels
+        gemm_fp16_ncxhwx_12xpackn(dst, sa, sb, bias, lanes, k, n, fuse_relu);
+        sa += lanes * k;
+        dst += lanes * n;
+        bias = bias ? bias + lanes : NULL;
+        done += lanes;
+    }
+    if (done < m) {  // leftover channels, fewer than lanes
+        gemm_fp16_ncxhwx_12xpackn(dst, sa, sb, bias, m - done, k, n, fuse_relu);
+    }
+}
\ No newline at end of file
diff --git a/source/c908_opt/gemm_fp16_v256.c b/source/c908_opt/gemm_fp16_v256.c
new file mode 100644
index 00000000..87e772e3
--- /dev/null
+++ b/source/c908_opt/gemm_fp16_v256.c
@@ -0,0 +1,3247 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 256 + * VS kernel 12 x 16 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +static inline void kernel_m8n48_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 48\n\t" + "divw t1, %[n], a0\n\t" // t1 = n24 + "remw t2, %[n], a0\n\t" // t2 = n % 24 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + // m8 + "1:\n\t" + "li s1, 16\n\t" + "vsetvli zero, s1, e16, m1\n\t" // set vl = 8 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n24 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, 
%[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n24==0, jump to m8n16 + // m8n24 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + "vfmv.v.f v20, fs4\n\t" + "vfmv.v.f v21, fs4\n\t" + "vfmv.v.f v22, fs4\n\t" + "vfmv.v.f v23, fs5\n\t" + "vfmv.v.f v24, fs5\n\t" + "vfmv.v.f v25, fs5\n\t" + "vfmv.v.f v26, fs6\n\t" + "vfmv.v.f v27, fs6\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs7\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v3, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n24k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n24k2 + "3:\n\t" + + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v6, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + 
"vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v3, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v20, fa4, v4\n\t" + "vfmacc.vf v21, fa4, v5\n\t" + "vfmacc.vf v22, fa4, v6\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v23, fa5, v4\n\t" + "vfmacc.vf v24, fa5, v5\n\t" + "vfmacc.vf v25, fa5, v6\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v26, fa6, v4\n\t" + "vfmacc.vf v27, fa6, v5\n\t" + "vfmacc.vf v28, fa6, v6\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v29, fa7, v4\n\t" + "vfmacc.vf v30, fa7, v5\n\t" + "vfmacc.vf v31, fa7, v6\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n24k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n24 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, 
v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + + "addi s3, s3, 96\n\t" // ******************** + + // end kernel_m8n24 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -96\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v14, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v17, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v20, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v23, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v26, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v29, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v15, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v18, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v21, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v24, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v27, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v16, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v22, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v25, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n16 + "6:\n\t" + "andi s1, t2, 32\n\t" // s1 = bool_n16 + "beqz s1, 
10f\n\t" // if n16==0, jump to m8n8 + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, fa0, 
v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 64\n\t" // ******************** + + // end kernel_m8n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -64\n\t" // pb -= 16 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + 
"vse16.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v25, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v27, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v29, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n8 + "10:\n\t" + "andi s1, t2, 16\n\t" // s1 = bool_n8 + "beqz s1, 14f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n4k2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh 
ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n8k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n_tail + "14:\n\t" + "andi s1, t2, 15\n\t" // s1 = bool_n_tail + "beqz a1, 18f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e16, m1\n\t" // set vl = n_tail + "slli t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 
16f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "15:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m8n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + 
"vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} +/* + * kernel_m4n48_fp16_v256: fp16 GEMM micro-kernel for one tile of 4 output rows. + * Each accumulator starts from its row's bias (bias[0..3], always read) and adds sa * sb over k; + * row r is stored at dst + r * ldc. sa is read as 4 consecutive values per k-step. sb is read + * block by block (48 columns, then 32, 16 and the last (n % 48) & 15), each block holding k rows. + * Pointer bumps of 32 bytes per vector expect vl == 16 at e16/m1. + * NOTE(review): [m] and the "t0" clobber are never referenced by the asm template. + * NOTE(review): the k2 loops pre-load one k-step ahead, so for even k one extra k-step of sa + * and sb is read but not used; confirm the packed buffers tolerate that over-read. + */ +static inline void kernel_m4n48_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 48\n\t" + "divw t1, %[n], a0\n\t" // t1 = n / 48 (number of 48-column blocks) + "remw t2, %[n], a0\n\t" // t2 = n % 48 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 (k >> 1) + "andi t4, %[k], 1\n\t" // t4 = k1 (k & 1) + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 16 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 (output row stride in bytes) + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n48==0, jump to m4n32 + // m4n48 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n48k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n48k2, legacy label:
m4n12k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements (4 rows x 2 k-steps), bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n48k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n48 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t"
+ + "addi %[input_ptr], %[input_ptr], 96\n\t" // pre-loaded row was consumed: cancels the -96 below + + // end kernel_m4n48 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 48 elements (96 bytes) + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v14, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v17, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v15, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v18, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v16, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n32 + "6:\n\t" + "andi t1, t2, 32\n\t" // t1 = bool_n32 (bit 5 of n_tail) + "beqz t1, 10f\n\t" // if n32==0, jump to m4n16 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs2\n\t" + "vfmv.v.f v13, fs2\n\t" + "vfmv.v.f v14, fs3\n\t" + "vfmv.v.f v15, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n32k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n32k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" +
"vfmacc.vf v11, ft1, v2\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements (4 rows x 2 k-steps), bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v12, fa2, v4\n\t" + "vfmacc.vf v13, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v14, fa3, v4\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n32k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n32 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // pre-loaded row was consumed: cancels the -64 below + + // end kernel_m4n32 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 32 elements (64 bytes) + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v10, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v12, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v14, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v13, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v15, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n16 + "10:\n\t" + "andi t1, t2, 16\n\t" // t1 = bool_n16 (bit 4 of n_tail) + "beqz t1, 14f\n\t" // if n16==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t"
+ "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n16k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements (4 rows x 2 k-steps), bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n16k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n16 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // pre-loaded row was consumed: cancels the -32 below + + // end kernel_m4n16 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 16 elements (32 bytes) + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v9, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v10, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v11, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 15\n\t" // t1 = n_tail & 15 (columns still to do) + "beqz t1, 18f\n\t" // if no columns are left, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl =
n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail (bytes per tail row) + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements (4 rows x 2 k-steps), bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n_tail + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // pre-loaded row was consumed: cancels the sub below + + // end kernel_m4n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v10, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v11, (a3)\n\t" + "add a3, a3, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs.
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs ([m] is not referenced by the template). + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + // We use these general-purpose and floating-point registers ("t0" is listed but not referenced). + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n48_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 48\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 + "3:\n\t" + "vle16.v v4,
(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 32\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump 
to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v10, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + + 
// m4n4 + "10:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v9, (a1)\n\t" + "addi a1, a1, 32\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 15\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], 
t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n48_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 48\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + "beqz t1, 6f\n\t" // if n12==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, 
fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 32\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 
9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 15\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f 
v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", + // We use these general-purpose registers. 
+        "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0");
+}
+
+/**************************************************************
+ * fp16 GEMM with per-row bias, dispatched to the 8-, 4-, 2- and 1-row
+ * n48 kernels of this file.
+ *
+ * dst  - output: [m, n]
+ * sa   - kernel: [m, k]
+ * sb   - input:  [k, n]
+ * bias - one value per output row, [m]; may be NULL, in which case a
+ *        temporary m-element buffer is allocated here and released
+ *        before returning
+ * m, k, n - matrix dimensions as listed above
+ * ldc  - forwarded unchanged to every kernel
+ *
+ * Complete groups of 8 rows are handled first; the remaining m % 8 rows
+ * are covered by at most one call each of the 4-, 2- and 1-row kernels.
+ **************************************************************/
+void shl_c908_gemm_8x48_fp16_v256(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias,
+                                  int m, int k, int n, int ldc)
+{
+    __fp16 *kernel_ptr = (__fp16 *)sa;
+    __fp16 *input_ptr = (__fp16 *)sb;
+    __fp16 *output_ptr = dst;
+
+    bool flag_bias = 1;  // default: conv2d layer include bias
+    if (bias == NULL) {
+        flag_bias = 0;
+        // NOTE(review): the buffer is never written in this function, so the
+        // bias-free path relies on shl_mem_alloc returning zeroed memory - confirm.
+        bias = (__fp16 *)shl_mem_alloc(m * sizeof(__fp16));
+    }
+    __fp16 *bias_ptr = bias;
+
+    // NOTE(review): output_ptr is advanced by n per row below, while the kernels
+    // are handed ldc; the two only agree when ldc == n - confirm with callers.
+    int tail = m % 8;
+    // m == 8 must take this path as well: tail is 0 in that case, so none of
+    // the tail kernels below would run and dst would be left unwritten.
+    if (m >= 8) {
+        // The kernel receives the full m; it is expected to consume every
+        // complete group of 8 rows, which is what the pointer bumps assume.
+        kernel_m8n48_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr);
+        output_ptr += (m - tail) * n;
+        kernel_ptr += (m - tail) * k;
+        bias_ptr += (m - tail);
+    }
+    // The bits of tail (0..7) select which of the narrower kernels still run.
+    if (tail & 4) {
+        kernel_m4n48_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr);
+        output_ptr += 4 * n;
+        kernel_ptr += 4 * k;
+        bias_ptr += 4;
+    }
+    if (tail & 2) {
+        kernel_m2n48_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr);
+        output_ptr += 2 * n;
+        kernel_ptr += 2 * k;
+        bias_ptr += 2;
+    }
+    if (tail & 1) {
+        kernel_m1n48_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr);
+        output_ptr += 1 * n;
+        kernel_ptr += 1 * k;
+        bias_ptr += 1;
+    }
+    // Release the temporary bias buffer only when it was allocated above.
+    if (!flag_bias) {
+        shl_mem_free(bias);
+        bias = NULL;
+    }
+}
+
+static inline void kernel_m8n32_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n,
+                                          int ldc, __fp16 *bias)
+{
+    asm volatile(
+        "srai t1, %[n], 5\n\t"   // t1 = n32
+        "andi t2, %[n], 31\n\t"  // t2 = n & 31u (n_tail)
+        "srai t3, %[k], 1\n\t"   // t3 = k2
+        "andi t4, %[k], 1\n\t"   // t4 = k1
+
+        "srai t0, %[m], 3\n\t"  // t0 = m8
+        "beqz t0, 15f\n\t"
+
+        // m8
+        "1:\n\t"
+        "li s1, 16\n\t"
+        "vsetvli zero, s1, e16, m1\n\t"  // set vl = 16
+        // load 8 bias_data for 8 out_channels
+        "flh fs0,
0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n32 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n32==0, jump to m8n16 + // m8n32 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n32k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n32k2 + "3:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh 
fa2, 20(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n32k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 64\n\t" // ******************** + + // 
end kernel_m8n32 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -64\n\t" // pb -= 32 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v25, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v27, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v29, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n16 + "6:\n\t" + "andi s1, t2, 16\n\t" // s1 = n16 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" 
+ "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 16 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 15\n\t" // s1 = bool_n_tail + "beqz s1, 14f\n\t" // if n_tail==0, jump to end m8 + "vsetvli zero, s1, e16, 
m1\n\t" // set vl = n_tail + "slli t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, 
ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n32_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 5\n\t" // t1 = n8 + "andi t2, %[n], 31\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" 
+ "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 
= n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v17, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v18, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 
15\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" +
"vse16.v v18, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v19, (a3)\n\t" + "add a3, a3, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n32_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ // Computes a 2-row output slice: row r (r = 0, 1; rows ldc elements apart) = bias[r] + sum over k of sa * sb; columns are tiled 32 / 16 / tail. %[m] is not read by the asm. + asm volatile( + "srai t1, %[n], 5\n\t" // t1 = n32 (number of full 32-column blocks) + "andi t2, %[n], 31\n\t" // t2 = n & 31 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m2 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 16 + // load 2 bias_data for 2 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n32==0, jump to m2n16 + // m2n32 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n32k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n32k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr],
%[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n32k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n32 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m2n32 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 32 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n16 + "6:\n\t" + "andi t1, t2, 16\n\t" // t1 = bool_n16 + "beqz t1, 10f\n\t" // if n16==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n16k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" +
"addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m2n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 16 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v17, (a1)\n\t" + "addi a1, a1, 32\n\t" + + // m2n_tail + "10:\n\t" + "andi t1, t2, 15\n\t" // t1 = n_tail (n & 15) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m2 + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail (bytes) + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t"
+ "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n_tail + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m2n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + + // end kernel_m2 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n32_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ // Computes one output row: dst[0..n) = bias[0] + sum over k of sa * sb; columns are tiled 32 / 16 / tail. %[m] and %[ldc] are not read by the asm. + asm volatile( + "srai t1, %[n], 5\n\t" // t1 = n32 (number of full 32-column blocks) + "andi t2, %[n], 31\n\t" // t2 = n & 31 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m1 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 16 + // load 1 bias_data for 1 out_channel + "flh fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n32==0, jump to m1n16 + // m1n32 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 line start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr],
%[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m1n32k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n32k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m1n32k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m1n32 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m1n32 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 32 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n16 + "6:\n\t" + "andi t1, t2, 16\n\t" // t1 = bool_n16 + "beqz t1, 10f\n\t" // if n16==0, jump to m1n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 line start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m1n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n16k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 2 elements,
bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m1n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m1n16 + + "vfmacc.vf v16, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m1n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 16 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m1n_tail + "10:\n\t" + "andi t1, t2, 15\n\t" // t1 = n_tail (n & 15) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m1 + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail (bytes) + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 line start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m1n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m1n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n_tail + + "vfmacc.vf v16, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m1n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16,
(a0)\n\t" + "add a0, a0, t6\n\t" + + // end kernel_m1 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", + // We use these general-purpose registers. + "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * fp16 GEMM driver: dispatches the rows of dst to the m8 / m4 / m2 / m1 kernels (VLEN = 256, 16 fp16 lanes per vector register). dst - output:[m, n]; output rows are advanced by n here while the kernels step rows by ldc - NOTE(review): this assumes ldc == n, confirm against callers + * sa - kernel: [m, k] (presumably already reordered for these kernels - TODO confirm); sb - input: [k, n] + * bias - one fp16 per output row; when NULL a temporary buffer of m elements is allocated and freed here (presumably zero-filled by shl_mem_alloc - TODO confirm) + **************************************************************/ +void shl_c908_gemm_8x32_fp16_v256(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int m, int k, int n, int ldc) +{ + __fp16 *kernel_ptr = (__fp16 *)sa; + __fp16 *input_ptr = (__fp16 *)sb; + __fp16 *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * 2); + } + __fp16 *bias_ptr = bias; + + int tail = m % 8; + if (m >= 8) { // ">=" so that m == 8 (tail == 0) still runs the m8 kernel; with "m > 8" no kernel ran and dst was left unwritten + kernel_m8n32_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n32_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n32_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n32_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} \ No newline at end of file diff --git a/source/c908_opt/gemm_fp32.c b/source/c908_opt/gemm_fp32.c new file
mode 100644 index 00000000..6bba3687 --- /dev/null +++ b/source/c908_opt/gemm_fp32.c @@ -0,0 +1,3247 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 128 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +static inline void kernel_m8n12_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ // For each of the m / 8 full 8-row blocks: 8 output rows (ldc elements apart) = bias + sum over k of sa * sb, columns tiled 12 / 8 / 4 / tail; the m % 8 leftover rows are not handled here. + asm volatile( + "li a0, 12\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + // m8 + "1:\n\t" + "li s1, 4\n\t" + "vsetvli zero, s1, e32, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + "flw fs4, 16(%[bias_ptr])\n\t" + "flw fs5, 20(%[bias_ptr])\n\t" + "flw fs6, 24(%[bias_ptr])\n\t" + "flw fs7, 28(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n12 + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" +
"add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* (author note, translated: move outside of m8) + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n12==0, jump to m8n8 + // m8n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + "vfmv.v.f v20, fs4\n\t" + "vfmv.v.f v21, fs4\n\t" + "vfmv.v.f v22, fs4\n\t" + "vfmv.v.f v23, fs5\n\t" + "vfmv.v.f v24, fs5\n\t" + "vfmv.v.f v25, fs5\n\t" + "vfmv.v.f v26, fs6\n\t" + "vfmv.v.f v27, fs6\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs7\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v3, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n12k2 + "3:\n\t" + + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v6, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flw fa2, 40(s2)\n\t" +
"vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v3, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v20, fa4, v4\n\t" + "vfmacc.vf v21, fa4, v5\n\t" + "vfmacc.vf v22, fa4, v6\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v23, fa5, v4\n\t" + "vfmacc.vf v24, fa5, v5\n\t" + "vfmacc.vf v25, fa5, v6\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v26, fa6, v4\n\t" + "vfmacc.vf v27, fa6, v5\n\t" + "vfmacc.vf v28, fa6, v6\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v29, fa7, v4\n\t" + "vfmacc.vf v30, fa7, v5\n\t" + "vfmacc.vf v31, fa7, v6\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1,
v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + + "addi s3, s3, 48\n\t" // ******************** + + // end kernel_m8n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -48\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v14, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v17, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v20, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v23, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v26, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v29, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v15, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v18, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v21, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v24, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v27, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v16, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v22, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v25, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31,
(a7)\n\t" + "addi a7, a7, 16\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n8 + "6:\n\t" + "andi s1, t2, 8\n\t" // s1 = bool_n8 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n4 + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "7:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump
kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v24, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v26, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v
v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v25, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v27, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v29, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n4 + "10:\n\t" + "andi s1, t2, 4\n\t" // s1 = bool_n4 + "beqz s1, 14f\n\t" // if n4==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n4k2 + "11:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" +
"vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // end kernel_m8n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 4 + + "vse32.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "14:\n\t" + "andi s1, t2, 3\n\t" // s1 = bool_n_tail + "beqz s1, 18f\n\t" // if n_tail==0, jump to end kernel_m8 + "vsetvli zero, s1, e32, m1\n\t" // set vl = n_tail + "slli t6, s1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2,
8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "15:\n\t" + "vle32.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m8n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse32.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" +
"vse32.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse32.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse32.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse32.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse32.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "slli t6, %[k], 5\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 5\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers.
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n12_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "li a0, 12\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 +
"3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + + 
"addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v14, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v17, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v15, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v18, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v16, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs2\n\t" + "vfmv.v.f v13, fs2\n\t" + "vfmv.v.f v14, fs3\n\t" + "vfmv.v.f v15, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf 
v11, ft1, v2\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v12, fa2, v4\n\t" + "vfmacc.vf v13, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v14, fa3, v4\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v10, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v12, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v14, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v13, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v15, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 4\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f 
v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v9, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v10, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v11, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 3\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli 
t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse32.v v10, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v11, (a3)\n\t" + "add a3, a3, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. 
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n12_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "li a0, 12\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m2 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m2n8 + // m2n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n12k2 + "3:\n\t" + "vle32.v v4, 
(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m2n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n8 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump 
to m2n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m2n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v10, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + + 
// m2n4 + "10:\n\t" + "andi t1, t2, 4\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m2n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v9, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m2n_tail + "14:\n\t" + "andi t1, t2, 3\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], 
t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m2n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m2ntial + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m2n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n12_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "li a0, 12\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m1 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m1n8 + // m1n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m1n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n12k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, 
v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m1n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m1n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m1n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n8 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m1n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m1n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m1n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" 
// if k1 == 0, jump to end kernel_m1n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m1n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m1n4 + "10:\n\t" + "andi t1, t2, 4\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m1n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m1n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m1n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m1n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m1n_tail + "14:\n\t" + "andi t1, t2, 3\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, 
fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m1n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m1n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m1n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * fp32 GEMM driver: computes dst as sa times sb with the per-row bias added. + * The m output rows are split into blocks of 8, 4, 2 and 1 rows and each + * block is handed to the matching kernel_mXn12_fp32 routine. + * + * dst - output:[m, n], consecutive rows are ldc floats apart + * sa - kernel: [m, k], consumed block by block (8/4/2/1 rows x k) + * sb - input: [k, n] + * bias - one value per output row; NULL means no bias + * ldc - row stride of dst, in elements + * + * NOTE(review): sa and sb are presumably pre-packed by the matching reorder + * routines; confirm against the callers. + **************************************************************/ +void shl_c908_gemm_8x12_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc) +{ + float *kernel_ptr = (float *)sa; + float *input_ptr = (float *)sb; + float *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + // No bias supplied: the kernels always seed their accumulators from bias, + // so hand them a scratch vector instead. + // NOTE(review): assumes shl_mem_alloc returns zero-filled memory - confirm. + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); + } + float *bias_ptr = bias; + + int tail = m % 8; + // Use >= so that m == 8 (tail == 0) still runs the 8-row kernel; with > no + // kernel would run at all and dst would be left unwritten. + if (m >= 8) { + kernel_m8n12_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + // Rows of dst are ldc apart (the kernels step by ldc), so skip by ldc, not n. + output_ptr += (m - tail) * ldc; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n12_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * ldc; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n12_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * ldc; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + // Last block: nothing follows, so the pointers need no further advance. + kernel_m1n12_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + } + if (!flag_bias) { + // Release the scratch bias allocated above. + shl_mem_free(bias); + bias = NULL; + } +} + +static inline void kernel_m8n8_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "srai t1, %[n], 3\n\t" // t1 = n8 + "andi t2, %[n], 7\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "li s1, 4\n\t" + "vsetvli zero, s1, e32, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 
4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + "flw fs4, 16(%[bias_ptr])\n\t" + "flw fs5, 20(%[bias_ptr])\n\t" + "flw fs6, 24(%[bias_ptr])\n\t" + "flw fs7, 28(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n8 + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m8n4 + // m8n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "3:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf 
v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + 
// ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v24, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v26, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v25, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v27, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v29, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n4 + "6:\n\t" + "andi s1, t2, 4\n\t" // s1 = n4 + "beqz s1, 10f\n\t" // if n4==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + 
"vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 4 + + "vse32.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 3\n\t" // s1 = bool_n_tail + "beqz s1, 14f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e32, m1\n\t" // set vl = n_tail + 
"slli t6, s1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + 
"vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse32.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse32.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse32.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse32.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse32.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse32.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "slli t6, %[k], 5\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 5\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +/* + * kernel_m4n8_fp32: produces 4 output rows. Every accumulator starts at the row's bias and + * gathers kernel * input products over k (k unrolled by 2, next operands pre-loaded). + * Columns are handled 8 at a time, then 4, then the remaining n & 3 with vl = remainder. + * dst: rows are ldc floats apart; sa: 4 floats per k step; sb: read sequentially and + * advanced by the asm; bias: 4 floats, always read; m is not referenced by the asm. + * NOTE(review): for even k (including 0) the pre-load reads one k step past the current + * sa/sb data before the pointer is rewound - confirm the buffers tolerate this. + */ +static inline void kernel_m4n8_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "srai t1, %[n], 3\n\t" // t1 = n8 + "andi t2, %[n], 7\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 4 bias_data for 4 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" +
"vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 4\n\t" // t1 =
n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v17, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v18, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2,
3\n\t" // t1 = n_tail (n & 3) + "beqz t1, 14f\n\t" // if n_tail == 0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n_tail + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" +
"vse32.v v18, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v19, (a3)\n\t" + "add a3, a3, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +/* + * kernel_m2n8_fp32: produces 2 output rows. Every accumulator starts at the row's bias and + * gathers kernel * input products over k (k unrolled by 2, next operands pre-loaded). + * Columns are handled 8 at a time, then 4, then the remaining n & 3 with vl = remainder. + * dst: rows are ldc floats apart; sa: 2 floats per k step; sb: read sequentially and + * advanced by the asm; bias: 2 floats, always read; m is not referenced by the asm. + * NOTE(review): for even k (including 0) the pre-load reads one k step past the current + * sa/sb data before the pointer is rewound - confirm the buffers tolerate this. + */ +static inline void kernel_m2n8_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "srai t1, %[n], 3\n\t" // t1 = n8 + "andi t2, %[n], 7\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m2 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m2n4 + // m2n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr],
16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m2n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n4 + "6:\n\t" + "andi t1, t2, 4\n\t" // t1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi
%[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m2n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v17, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m2n_tail + "10:\n\t" + "andi t1, t2, 3\n\t" // t1 = n_tail (n & 3) + "beqz t1, 14f\n\t" // if n_tail == 0, jump to end kernel_m2 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" +
"vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n_tail + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m2n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + + // end kernel_m2 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +/* + * kernel_m1n8_fp32: produces 1 output row. Every accumulator starts at the bias value and + * gathers kernel * input products over k (k unrolled by 2, next operands pre-loaded). + * Columns are handled 8 at a time, then 4, then the remaining n & 3 with vl = remainder. + * dst: one row; sa: 1 float per k step; sb: read sequentially and advanced by the asm; + * bias: 1 float, always read; m and ldc are not referenced by the asm. + * NOTE(review): for even k (including 0) the pre-load reads one k step past the current + * sa/sb data before the pointer is rewound - confirm the buffers tolerate this. + */ +static inline void kernel_m1n8_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "srai t1, %[n], 3\n\t" // t1 = n8 + "andi t2, %[n], 7\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m1 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 1 bias_data for 1 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m1n4 + // m1n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t"
+ + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m1n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m1n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m1n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m1n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n4 + "6:\n\t" + "andi t1, t2, 4\n\t" // t1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m1n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m1n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2
addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m1n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m1n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m1n_tail + "10:\n\t" + "andi t1, t2, 3\n\t" // t1 = n_tail (n & 3) + "beqz t1, 14f\n\t" // if n_tail == 0, jump to end kernel_m1 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m1n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m1n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n_tail + + "vfmacc.vf v16, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m1n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0,
t6\n\t" + + // end kernel_m1 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", + // We use these general-purpose registers. + "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n], consecutive rows are ldc floats apart + * sa - kernel: [m, k] + * sb - input: [k, n] + * bias - m floats, or NULL; for NULL a temporary buffer is taken + * from shl_mem_alloc and released before returning + * (NOTE(review): presumably zero-filled - confirm) + * Rows are processed in groups of 8 first, then the m % 8 + * remainder in groups of 4, 2 and 1. + **************************************************************/ +void shl_c908_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, int k, + int n, int ldc) +{ + float *kernel_ptr = (float *)sa; + float *input_ptr = (float *)sb; + float *output_ptr = dst; + + bool flag_bias = 1; /* default: conv2d layer include bias */ + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); + } + float *bias_ptr = bias; + + int tail = m % 8; + /* m >= 8, not m > 8: with exactly 8 rows tail is 0, so only the m8 kernel can produce them */ + if (m >= 8) { + kernel_m8n8_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + /* the kernels step output rows by ldc, so skip the finished rows by ldc as well */ + output_ptr += (m - tail) * ldc; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n8_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * ldc; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n8_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * ldc; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + /* last group: nothing follows, so no pointer needs advancing */ + kernel_m1n8_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/c908_opt/gemm_fp32_packn.c b/source/c908_opt/gemm_fp32_packn.c new file mode 100644 index 00000000..e7e6cdc6 --- /dev/null +++
b/source/c908_opt/gemm_fp32_packn.c @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void gemm_fp32_ncxhwx_12xpack2n(float *output, const float *kernel, const float *input, + const float *bias, int m, int k, int n, bool fuse_relu); +void gemm_fp32_ncxhwx_12xpackn(float *output, const float *kernel, const float *input, + const float *bias, int m, int k, int n, bool fuse_relu); + +/* Walk the m rows in 2*packn panels, then packn panels, then one partial panel; bias may be NULL. */ +void shl_c908_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, + const float *bias, int m, int k, int n, bool fuse_relu) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + int rows_left = m; /* rows not yet handed to a kernel */ + /* full 2 * packn row panels (the pack2n kernel is still passed packn as its m) */ + while (rows_left >= pack2n) { + gemm_fp32_ncxhwx_12xpack2n(dst, sa, sb, bias, packn, k, n, fuse_relu); + sa += pack2n * k; + dst += pack2n * n; + bias = bias ? bias + pack2n : bias; + rows_left -= pack2n; + } + /* full packn row panels */ + while (rows_left >= packn) { + gemm_fp32_ncxhwx_12xpackn(dst, sa, sb, bias, packn, k, n, fuse_relu); + sa += packn * k; + dst += packn * n; + bias = bias ? bias + packn : bias; + rows_left -= packn; + } + if (rows_left > 0) { + gemm_fp32_ncxhwx_12xpackn(dst, sa, sb, bias, rows_left, k, n, fuse_relu); + } +} diff --git a/source/c908_opt/gemm_fp32_v256.c b/source/c908_opt/gemm_fp32_v256.c new file mode 100644 index 00000000..d8c2106e --- /dev/null +++
b/source/c908_opt/gemm_fp32_v256.c @@ -0,0 +1,3246 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 256 (the kernels below set vl = 8 with e32, m1) + * input matrix and kernel matrix have been reordered + *************************************************************/ +static inline void kernel_m8n24_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n24 (n / 24) + "remw t2, %[n], a0\n\t" // t2 = n % 24 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 (k >> 1) + "andi t4, %[k], 1\n\t" // t4 = k1 (k & 1) + + "srai t0, %[m], 3\n\t" // t0 = m8 (m >> 3) + "beqz t0, 19f\n\t" // no full 8-row block: jump to ending + + // m8 + "1:\n\t" + "li s1, 8\n\t" + "vsetvli zero, s1, e32, m1\n\t" // set vl = 8 + // load 8 bias_data for 8 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + "flw fs4, 16(%[bias_ptr])\n\t" + "flw fs5, 20(%[bias_ptr])\n\t" + "flw fs6, 24(%[bias_ptr])\n\t" + "flw fs7, 28(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n24 (counter for the 24-column loop at label 2) + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" +
"add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* move outside of the m8 loop (note translated from Chinese) + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n24==0, jump to m8n16 (label 6) + // m8n24 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + "vfmv.v.f v20, fs4\n\t" + "vfmv.v.f v21, fs4\n\t" + "vfmv.v.f v22, fs4\n\t" + "vfmv.v.f v23, fs5\n\t" + "vfmv.v.f v24, fs5\n\t" + "vfmv.v.f v25, fs5\n\t" + "vfmv.v.f v26, fs6\n\t" + "vfmv.v.f v27, fs6\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs7\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v3, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to the k1 step (label 4) + "mv t5, t3\n\t" // t5 = k2 + + // m8n24k2 + "3:\n\t" + + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v6, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3,
v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v3, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v20, fa4, v4\n\t" + "vfmacc.vf v21, fa4, v5\n\t" + "vfmacc.vf v22, fa4, v6\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v23, fa5, v4\n\t" + "vfmacc.vf v24, fa5, v5\n\t" + "vfmacc.vf v25, fa5, v6\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v26, fa6, v4\n\t" + "vfmacc.vf v27, fa6, v5\n\t" + "vfmacc.vf v28, fa6, v6\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v29, fa7, v4\n\t" + "vfmacc.vf v30, fa7, v5\n\t" + "vfmacc.vf v31, fa7, v6\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, 
ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + + "addi s3, s3, 96\n\t" // ******************** + + // end kernel_m8n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -96\n\t" // pb -= 24 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v14, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v17, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v20, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v23, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v26, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v29, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v15, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v18, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v21, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v24, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v27, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v16, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v22, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v25, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "addi s1, s1, -1\n\t" + 
"bnez s1, 2b\n\t" + + // m8n16 + "6:\n\t" + "andi s1, t2, 16\n\t" // s1 = bool_n8 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n4 + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "7:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, 
s3, 32\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 64\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -64\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v24, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v26, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse32.v v17, 
(a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v25, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v27, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v29, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n8 + "10:\n\t" + "andi s1, t2, 8\n\t" // s1 = bool_n8 (bit 3 of n_tail) + "beqz s1, 14f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "11:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" +
"vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n8k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse32.v v24, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v25, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v26, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v27, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v28, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v29, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v30, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n_tail + "14:\n\t" + "andi s1, t2, 7\n\t" // s1 = n_tail (n % 8) + "beqz s1, 18f\n\t" // if n_tail==0, jump to end kernel_m8 (test the tail count s1, not the output pointer a1) + "vsetvli zero, s1, e32, m1\n\t" // set vl = n_tail + "slli t6, s1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4,
16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "15:\n\t" + "vle32.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m8n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse32.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse32.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + 
"vse32.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse32.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse32.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse32.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse32.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "slli t6, %[k], 5\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 5\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n24_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n24 (n / 24) + "remw t2, %[n], a0\n\t" // t2 = n % 24 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 (k >> 1) + "andi t4, %[k], 1\n\t" // t4 = k1 (k & 1) + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n24==0, jump to the 16-column step (label 6) + // m4n24 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to the k1 step (label 4) + "mv t5, t3\n\t" // t5 = k2 + + // m4n24k2
+ "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + + 
"addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v14, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v17, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v15, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v18, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v16, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs2\n\t" + "vfmv.v.f v13, fs2\n\t" + "vfmv.v.f v14, fs3\n\t" + "vfmv.v.f v15, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf 
v11, ft1, v2\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v12, fa2, v4\n\t" + "vfmacc.vf v13, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v14, fa3, v4\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v10, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v12, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v14, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v13, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v15, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f 
v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v9, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v10, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v11, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli 
t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse32.v v10, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v11, (a3)\n\t" + "add a3, a3, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. 
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n24_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m2 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m2n8 + // m2n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n12k2 + "3:\n\t" + "vle32.v v4, 
(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m2n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n8 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump 
to m2n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m2n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v10, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + + 
// m2n4 + "10:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m2n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v9, (a1)\n\t" + "addi a1, a1, 32\n\t" + + // m2n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], 
t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m2n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m2ntial + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m2n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +/************************************************************** + * Single-row micro-kernel: every output element starts from bias[0] + * and accumulates sa[i] times the matching sb vector for i in [0, k). + * Columns are handled in tiles of 24, then 16, then 8, and finally + * the remaining (n % 24) & 7 columns with a shortened vl. + * sa is read one float per k step; sb is read front to back. + * %[m] and %[ldc] are bound as operands but unused by this kernel. + **************************************************************/ +static inline void kernel_m1n24_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n24 (n / 24) + "remw t2, %[n], a0\n\t" // t2 = n % 24 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m1 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 8 + // load 1 bias_data for 1 out_channel + "flw fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n24==0, jump to m1n16 + // m1n24 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m1n24k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n24k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, 
v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m1n24k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m1n24 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m1n24 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 24 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n16 + "6:\n\t" + "andi t1, t2, 16\n\t" // t1 = bool_n16 + "beqz t1, 10f\n\t" // if n16==0, jump to m1n8 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m1n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n16k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m1n16k1 + "8:\n\t" + "beqz t4, 
9f\n\t" // if k1 == 0, jump to end kernel_m1n16 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m1n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 16 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m1n8 + "10:\n\t" + "andi t1, t2, 8\n\t" // t1 = bool_n8 + "beqz t1, 14f\n\t" // if n8==0, jump to m1n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m1n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n8k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m1n8k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n8 + + "vfmacc.vf v8, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m1n8 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m1n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // t1 = n_tail & 7 (leftover columns) + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f 
v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m1n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m1n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m1n_tail + + "vfmacc.vf v8, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m1n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", + // We use these general-purpose registers.
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + * bias - m values, one per output row; may be NULL, in which + * case a scratch buffer from shl_mem_alloc() is used and + * released before returning + * + * Rows are handed to the 8-row kernel first (whenever m >= 8), + * then the remaining m % 8 rows go to the 4-, 2- and 1-row kernels. + * + * NOTE(review): the NULL-bias path assumes shl_mem_alloc() returns + * zero-filled memory - confirm. + * NOTE(review): output_ptr is advanced by n per row here, while the + * 16-wide kernels in this file step output rows by ldc - confirm + * that callers always pass ldc == n. + **************************************************************/ +void shl_c908_gemm_8x24_fp32_v256(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc) +{ + float *kernel_ptr = (float *)sa; + float *input_ptr = (float *)sb; + float *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); + } + float *bias_ptr = bias; + + int tail = m % 8; + // m == 8 leaves tail == 0, so the 8-row kernel must also run for exactly 8 rows + if (m >= 8) { + kernel_m8n24_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n24_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n24_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n24_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +static inline void kernel_m8n16_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n16 + "andi t2, %[n], 15\n\t" // t2 = n & 15u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "li s1, 8\n\t" + "vsetvli zero, s1, e32, m1\n\t" // set vl = 8 + // load 8 bias_data for 8 out_channels + "flw fs0,
0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + "flw fs4, 16(%[bias_ptr])\n\t" + "flw fs5, 20(%[bias_ptr])\n\t" + "flw fs6, 24(%[bias_ptr])\n\t" + "flw fs7, 28(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n16 + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* move outside of m8 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr (reloaded for every group of 8 rows) + + "beqz t1, 6f\n\t" // if n16==0, jump to m8n8 + // m8n16 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "3:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + 
"flw fa2, 40(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n16k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 64\n\t" // ********************
+ + // end kernel_m8n16 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -64\n\t" // pb -= 16 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v24, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v26, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v25, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v27, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v29, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n8 + "6:\n\t" + "andi s1, t2, 8\n\t" // s1 = bool_n8 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "7:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, 
v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse32.v v24, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v25, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v26, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v27, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v28, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v29, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v30, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 7\n\t" // s1 = n_tail & 7 (leftover columns) + "beqz s1, 14f\n\t" // if n_tail==0, jump to end kernel_m8 + "vsetvli zero, s1, e32, 
m1\n\t" // set vl = n_tail + "slli t6, s1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vfmacc.vf 
v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse32.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse32.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse32.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse32.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse32.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse32.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "slli t6, %[k], 5\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 5\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers.
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +/************************************************************** + * 4-row micro-kernel: output row r starts from bias[r] and + * accumulates kernel scalar times input vector for every k step. + * Columns are handled in tiles of 16, then 8, and finally the + * remaining n & 7 columns with a shortened vl. + * sa is read as 4 consecutive floats per k step; output rows are + * ldc floats apart. + * %[m] is bound as an operand but unused by this kernel. + **************************************************************/ +static inline void kernel_m4n16_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n16 + "andi t2, %[n], 15\n\t" // t2 = n & 15u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n16==0, jump to m4n8 + // m4n16 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n16k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + 
"vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n16k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n16 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 16 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = 
n4 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v17, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v18, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 
7\n\t" // t1 = n_tail & 7 (leftover columns) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n_tail + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" +
"vse32.v v18, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v19, (a3)\n\t" + "add a3, a3, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n16_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( // 2 dst rows (a0, and a1 one ldc stride later): accumulators start at bias[r] and add sa * sb over k; n is consumed as 16-wide blocks, one 8-wide block, then n & 7 columns + "srai t1, %[n], 4\n\t" // t1 = n16 (n / 16) + "andi t2, %[n], 15\n\t" // t2 = n & 15u (columns left after the 16-wide blocks) + "srai t3, %[k], 1\n\t" // t3 = k2 (k / 2) + "andi t4, %[k], 1\n\t" // t4 = k1 (k & 1) + + // m2 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // request vl = 8 (e32, m1) + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 (row stride in bytes) + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n16==0, jump to m2n8 + // m2n16 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n16k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr],
%[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n16k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // pre-load was consumed: cancels the rewind below + + // end kernel_m2n16 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 16 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n8 + "6:\n\t" + "andi t1, t2, 8\n\t" // t1 = n8 (nonzero if an 8-wide block remains) + "beqz t1, 10f\n\t" // if n8==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t"
+ "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // pre-load was consumed: cancels the rewind below + + // end kernel_m2n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v17, (a1)\n\t" + "addi a1, a1, 32\n\t" + + // m2n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // t1 = n_tail (n & 7) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m2 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0,
0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n_tail + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // pre-load was consumed: cancels the rewind below + + // end kernel_m2n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + + // end kernel_m2 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose and floating-point registers. + "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n16_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( // 1 dst row (a0): accumulators start at bias[0] and add sa * sb over k; n is consumed as 16-wide blocks, one 8-wide block, then n & 7 columns + "srai t1, %[n], 4\n\t" // t1 = n16 (n / 16) + "andi t2, %[n], 15\n\t" // t2 = n & 15u (columns left after the 16-wide blocks) + "srai t3, %[k], 1\n\t" // t3 = k2 (k / 2) + "andi t4, %[k], 1\n\t" // t4 = k1 (k & 1) + + // m1 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // request vl = 8 (e32, m1) + // load 1 bias_data for 1 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n16==0, jump to m1n8 + // m1n16 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr],
%[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m1n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n16k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m1n16k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m1n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // pre-load was consumed: cancels the rewind below + + // end kernel_m1n16 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 16 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n8 + "6:\n\t" + "andi t1, t2, 8\n\t" // t1 = n8 (nonzero if an 8-wide block remains) + "beqz t1, 10f\n\t" // if n8==0, jump to m1n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m1n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump
kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m1n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m1n8 + + "vfmacc.vf v16, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // pre-load was consumed: cancels the rewind below + + // end kernel_m1n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m1n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // t1 = n_tail (n & 7) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m1 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m1n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m1n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n_tail + + "vfmacc.vf v16, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // pre-load was consumed: cancels the rewind below + + // end kernel_m1n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t"
+ "add a0, a0, t6\n\t" + + // end kernel_m1 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", + // We use these general-purpose registers. + "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n], filled row block by row block by the m8, m4, m2 and m1 kernels (bias plus sa x sb) + * sa - kernel: [m, k]; bias - m floats, or NULL, in which case a temporary shl_mem_alloc buffer is substituted + * sb - input: [k, n]; ldc - handed to the kernels, which use it as the dst row stride in floats + **************************************************************/ +void shl_c908_gemm_8x16_fp32_v256(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc) +{ + float *kernel_ptr = (float *)sa; + float *input_ptr = (float *)sb; + float *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); // NOTE(review): presumably zero-filled by shl_mem_alloc - confirm + } + float *bias_ptr = bias; + + int tail = m % 8; + if (m >= 8) { // >= rather than >: with m == 8 the tail is 0, so this call is the only one that can write dst + kernel_m8n16_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); // NOTE(review): assumed to loop over all m / 8 row blocks, as the (m - tail) advance below implies + output_ptr += (m - tail) * n; // NOTE(review): advances by n while the kernels stride rows by ldc; the same only when ldc == n + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n16_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n16_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n16_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { // release the temporary bias allocated above + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/c908_opt/gemm_int16_packn.c b/source/c908_opt/gemm_int16_packn.c new file mode 100644 index
00000000..2dff427d --- /dev/null +++ b/source/c908_opt/gemm_int16_packn.c @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void gemm_int16_ncxhwx_12xpackn(int32_t *output, const int16_t *kernel, const int16_t *input, int k, + int n); + +void shl_c908_ncxhwx_gemm_12xpackn_int16(int32_t *dst, const int16_t *sa, const int16_t *sb, int m, + int k, int n) +{ + const int lanes = csrr_vlenb() / sizeof(int8_t) / 2; // vlenb / 2; presumably the int16 lanes of one vector register - confirm + // One external-kernel call per complete group of `lanes` output rows; + // a remainder of m % lanes rows gets no call from this function. + for (int done = 0; done + lanes <= m; done += lanes) { + gemm_int16_ncxhwx_12xpackn(dst, sa, sb, k, n); + dst += lanes * n; + sa += lanes * k; + } +} diff --git a/source/c908_opt/gemm_int8.c b/source/c908_opt/gemm_int8.c new file mode 100644 index 00000000..f38b53fb --- /dev/null +++ b/source/c908_opt/gemm_int8.c @@ -0,0 +1,4083 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 128 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +static inline void kernel_m8n8_int8(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +{ + asm volatile( + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "srai t1, %[n], 3\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + "mv t3, %[input_ptr]\n\t" // t3 hold input data start addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m8n4 + // m8n8 + "2:\n\t" + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v17, t4\n\t" + "lw t4, 4(%[bias_ptr])\n\t" // bias_ptr[1] + "vmv.v.x v18, t4\n\t" + "vmv.v.x v19, t4\n\t" + "lw t4, 8(%[bias_ptr])\n\t" // bias_ptr[2] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v21, t4\n\t" + "lw t4, 12(%[bias_ptr])\n\t" // bias_ptr[3] + "vmv.v.x v22, t4\n\t" + "vmv.v.x v23, t4\n\t" + "lw t4, 16(%[bias_ptr])\n\t" // bias_ptr[4] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v25, t4\n\t" + "lw t4, 20(%[bias_ptr])\n\t" // bias_ptr[5] + "vmv.v.x v26, t4\n\t" + "vmv.v.x v27, t4\n\t" + "lw t4, 24(%[bias_ptr])\n\t" // bias_ptr[6] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v29, t4\n\t" + "lw t4, 28(%[bias_ptr])\n\t" // bias_ptr[7] + "vmv.v.x v30, t4\n\t" + "vmv.v.x v31, t4\n\t" + + 
"mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 16\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + "lw a1, 4(t5)\n\t" + "lw a2, 8(t5)\n\t" + "lw a3, 12(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m8n8k1 + + // m8n8k2 + "3:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v5, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v17, a0, v2\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v19, a1, v2\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "vmaqa.vx v21, a2, v2\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v23, a3, v2\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v25, a4, v2\n\t" + "lw a0, 32(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v27, a5, v2\n\t" + "lw a1, 36(t5)\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v29, a6, v2\n\t" + "lw a2, 40(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" + "vmaqa.vx v31, a7, v2\n\t" + "lw a3, 44(t5)\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "vmaqa.vx v17, a0, v5\n\t" + "lw a4, 48(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v19, a1, v5\n\t" + "lw a5, 52(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "vmaqa.vx v21, a2, v5\n\t" + "lw a6, 56(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "vmaqa.vx v23, a3, v5\n\t" + "lw a7, 60(t5)\n\t" + "addi t5, t5, 64\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "vmaqa.vx v25, a4, v5\n\t" + "lw a0, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v27, a5, v5\n\t" + "lw a1, 4(t5)\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "vmaqa.vx v29, a6, v5\n\t" + "lw a2, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" + "vmaqa.vx v31, a7, v5\n\t" + "lw a3, 12(t5)\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 
3b\n\t" + + // m8n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v17, a0, v2\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v19, a1, v2\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "vmaqa.vx v21, a2, v2\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v23, a3, v2\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v25, a4, v2\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v27, a5, v2\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v29, a6, v2\n\t" + "vmaqa.vx v30, a7, v1\n\t" + "vmaqa.vx v31, a7, v2\n\t" + + "addi t3, t3, 32\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -32\n\t" // pb -= 8 + + // 后处理 + "li t6, 8\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "lw a2, 4(%[mult_ptr])\n\t" + "lw a3, 4(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lw a0, 8(%[mult_ptr])\n\t" + "lw a1, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, 
e8, mf2\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "lw a2, 12(%[mult_ptr])\n\t" + "lw a3, 12(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lw a0, 16(%[mult_ptr])\n\t" + "lw a1, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "lw a2, 20(%[mult_ptr])\n\t" + "lw a3, 20(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lw a0, 24(%[mult_ptr])\n\t" + "lw a1, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "lw a2, 28(%[mult_ptr])\n\t" + "lw a3, 28(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + 
"vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 8\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m8n4 + "6:\n\t" + "andi t1, %[n], 4\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m8n_tail + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + "lw t4, 4(%[bias_ptr])\n\t" // bias_ptr[1] + "vmv.v.x v18, t4\n\t" + "lw t4, 8(%[bias_ptr])\n\t" // bias_ptr[2] + "vmv.v.x v20, t4\n\t" + "lw t4, 12(%[bias_ptr])\n\t" // bias_ptr[3] + "vmv.v.x v22, t4\n\t" + "lw t4, 16(%[bias_ptr])\n\t" // bias_ptr[4] + "vmv.v.x v24, t4\n\t" + "lw t4, 20(%[bias_ptr])\n\t" // bias_ptr[5] + "vmv.v.x v26, t4\n\t" + "lw t4, 24(%[bias_ptr])\n\t" // bias_ptr[6] + "vmv.v.x v28, t4\n\t" + "lw t4, 28(%[bias_ptr])\n\t" // bias_ptr[7] + "vmv.v.x v30, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + "lw a1, 4(t5)\n\t" + "lw a2, 8(t5)\n\t" + "lw a3, 12(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lw a0, 32(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "lw a1, 36(t5)\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lw a2, 40(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" + "lw a3, 44(t5)\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 
16\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lw a4, 48(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lw a5, 52(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lw a6, 56(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "lw a7, 60(t5)\n\t" + "addi t5, t5, 64\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lw a0, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "lw a1, 4(t5)\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lw a2, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" + "lw a3, 12(t5)\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m8n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vmaqa.vx v16, a0, v1\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "addi t3, t3, 16\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -16\n\t" // pb -= 4 + + // 后处理 + "li t6, 4\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "lw a2, 4(%[mult_ptr])\n\t" + "lw a3, 4(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lw a0, 8(%[mult_ptr])\n\t" + "lw a1, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + 
"vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "lw a2, 12(%[mult_ptr])\n\t" + "lw a3, 12(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lw a0, 16(%[mult_ptr])\n\t" + "lw a1, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "lw a2, 20(%[mult_ptr])\n\t" + "lw a3, 20(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lw a0, 24(%[mult_ptr])\n\t" + "lw a1, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "lw a2, 28(%[mult_ptr])\n\t" + "lw a3, 28(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add 
a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 4\n\t" + + // m8n_tail + "10:\n\t" + "andi t1, %[n], 3\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m8 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + "lw t4, 4(%[bias_ptr])\n\t" // bias_ptr[1] + "vmv.v.x v18, t4\n\t" + "lw t4, 8(%[bias_ptr])\n\t" // bias_ptr[2] + "vmv.v.x v20, t4\n\t" + "lw t4, 12(%[bias_ptr])\n\t" // bias_ptr[3] + "vmv.v.x v22, t4\n\t" + "lw t4, 16(%[bias_ptr])\n\t" // bias_ptr[4] + "vmv.v.x v24, t4\n\t" + "lw t4, 20(%[bias_ptr])\n\t" // bias_ptr[5] + "vmv.v.x v26, t4\n\t" + "lw t4, 24(%[bias_ptr])\n\t" // bias_ptr[6] + "vmv.v.x v28, t4\n\t" + "lw t4, 28(%[bias_ptr])\n\t" // bias_ptr[7] + "vmv.v.x v30, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + "lw a1, 4(t5)\n\t" + "lw a2, 8(t5)\n\t" + "lw a3, 12(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lw a0, 32(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "lw a1, 36(t5)\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lw a2, 40(t5)\n\t" + "vmaqa.vx v30, 
a7, v1\n\t" + "lw a3, 44(t5)\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lw a4, 48(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lw a5, 52(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lw a6, 56(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "lw a7, 60(t5)\n\t" + "addi t5, t5, 64\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lw a0, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "lw a1, 4(t5)\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lw a2, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" + "lw a3, 12(t5)\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "add t3, t3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub t3, t3, t6\n\t" // pb -= n_tail + + // 后处理 + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "lw a2, 4(%[mult_ptr])\n\t" + "lw a3, 4(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lw a0, 
8(%[mult_ptr])\n\t" + "lw a1, 8(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "lw a2, 12(%[mult_ptr])\n\t" + "lw a3, 12(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lw a0, 16(%[mult_ptr])\n\t" + "lw a1, 16(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "lw a2, 20(%[mult_ptr])\n\t" + "lw a3, 20(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lw a0, 24(%[mult_ptr])\n\t" + "lw a1, 24(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "lw a2, 28(%[mult_ptr])\n\t" + "lw a3, 28(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t1, e8, 
mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "add t2, t2, t1\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "addi %[mult_ptr], %[mult_ptr], 32\n\t" // mult_ptr += 8 + "addi %[shift_ptr], %[shift_ptr], 32\n\t" // shift_ptr += 8 + "slli t6, %[k], 3\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[n], 3\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * n + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+        "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6");
+}
+
+/*
+ * int8 GEMM micro-kernel with fused requantization, handling m in groups of 8 rows.
+ * Scalar operands are fetched in pairs with the T-Head `lwd` instruction, which can be
+ * used when xtheadc is enabled.
+ *
+ * dst    int8 output; rows are written n bytes apart, 8 rows per row group
+ * sa     kernel data; each k-step reads 8 32-bit words (one per output row), and the
+ *        pointer advances by 8 * k bytes per row group
+ * sb     input data; each k-step reads vl 32-bit words, column blocks follow each other
+ * m      only m >> 3 row groups are processed; the remaining m & 7 rows are not handled here
+ * k      consumed by vmaqa.vx in steps of 4: k >> 3 double steps plus one step if (k & 4);
+ *        k & 3 is ignored. NOTE(review): presumably k is padded to a multiple of 4 - confirm.
+ * n      columns, processed as blocks of 8, then one block of 4, then n & 3 tail columns
+ * bias   int32 per output row, initial accumulator value
+ * out_zp added to every result after the shift
+ * mult   int32 per output row, vmulh.vx multiplier (keeps the high 32 bits of the product)
+ * shift  int32 per output row; vssra.vx shifts by ~shift (== -shift - 1).
+ *        NOTE(review): rounding follows the current vxrm, which is not set here.
+ *
+ * The software pipeline loads the operands of the next k-step before the loop counter is
+ * tested, so sa/sb are read slightly past the last k-step that is actually used.
+ * NOTE(review): presumably the packed buffers are padded for this - confirm.
+ */
+static inline void kernel_m8n8_int8_1(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n,
+                                      int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift)
+{
+    asm volatile(
+        "srai t0, %[m], 3\n\t"  // t0 = m8
+        "beqz t0, 15f\n\t"
+
+        // m8
+        "1:\n\t"
+        "srai t1, %[n], 3\n\t"  // t1 = n8
+        "mv t2, %[output_ptr]\n\t"  // init output addr
+        "mv t3, %[input_ptr]\n\t"  // t3 hold input data start addr
+
+        "beqz t1, 6f\n\t"  // if n8==0, jump to m8n4
+        // m8n8
+        "2:\n\t"
+        "li t6, 8\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
+        // init out_tmp = bias
+        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
+        "vmv.v.x v16, t4\n\t"
+        "vmv.v.x v18, t5\n\t"
+        "lwd t4, t5, 8(%[bias_ptr])\n\t"  // bias_ptr[2]/[3]
+        "vmv.v.x v20, t4\n\t"
+        "vmv.v.x v22, t5\n\t"
+        "lwd t4, t5, 16(%[bias_ptr])\n\t"  // bias_ptr[4]/[5]
+        "vmv.v.x v24, t4\n\t"
+        "vmv.v.x v26, t5\n\t"
+        "lwd t4, t5, 24(%[bias_ptr])\n\t"  // bias_ptr[6]/[7]
+        "vmv.v.x v28, t4\n\t"
+        "vmv.v.x v30, t5\n\t"
+
+        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the start addr of the 8 kernel rows
+
+        // pre-load pb (input_data); with LMUL=2 this writes the group v2-v3
+        "vle32.v v2, (t3)\n\t"
+        "addi t3, t3, 32\n\t"
+
+        // pre-load pa(kernel_data)
+        "lwd a0, a1, 0(t5)\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+
+        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
+        "beqz t4, 4f\n\t"  // if k2 == 0, jump to m8n8k1
+
+        // m8n8k2
+        "3:\n\t"
+        "vle32.v v4, (t3)\n\t"
+        "addi t3, t3, 32\n\t"
+
+        "vmaqa.vx v16, a0, v2\n\t"
+        "vmaqa.vx v18, a1, v2\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v20, a2, v2\n\t"
+        "vmaqa.vx v22, a3, v2\n\t"
+        "addi t5, t5, 32\n\t"  // pa += 32 bytes (8 words)
+        "lwd a0, a1, 0(t5)\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "vmaqa.vx v24, a4, v2\n\t"
+        "vmaqa.vx v26, a5, v2\n\t"
+        "vmaqa.vx v28, a6, v2\n\t"
+        "vmaqa.vx v30, a7, v2\n\t"
+
+        "vle32.v v2, (t3)\n\t"
+        "addi t3, t3, 32\n\t"
+
+        "vmaqa.vx v16, a0, v4\n\t"
+        "vmaqa.vx v18, a1, v4\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v20, a2, v4\n\t"
+        "vmaqa.vx v22, a3, v4\n\t"
+        "addi t5, t5, 32\n\t"  // pa += 32 bytes (8 words)
+        "lwd a0, a1, 0(t5)\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "vmaqa.vx v24, a4, v4\n\t"
+        "vmaqa.vx v26, a5, v4\n\t"
+        "vmaqa.vx v28, a6, v4\n\t"
+        "vmaqa.vx v30, a7, v4\n\t"
+
+        "addi t4, t4, -1\n\t"
+        "bnez t4, 3b\n\t"
+
+        // m8n8k1
+        "4:\n\t"
+        "andi t4, %[k], 4\n\t"  // t4 = k1
+        "beqz t4, 5f\n\t"  // if k1 == 0, jump to end kernel_m8n8
+
+        "lwd a4, a5, 16(t5)\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v16, a0, v2\n\t"
+        "vmaqa.vx v18, a1, v2\n\t"
+        "vmaqa.vx v20, a2, v2\n\t"
+        "vmaqa.vx v22, a3, v2\n\t"
+        "vmaqa.vx v24, a4, v2\n\t"
+        "vmaqa.vx v26, a5, v2\n\t"
+        "vmaqa.vx v28, a6, v2\n\t"
+        "vmaqa.vx v30, a7, v2\n\t"
+
+        "addi t3, t3, 32\n\t"  // ********************
+
+        // end kernel_m8n8
+        "5:\n\t"
+        // ********* bump pb to origin addr ************
+        // offset pre-load
+        "addi t3, t3, -32\n\t"  // pb -= 8
+
+        // post-processing: multiply-high, shift, add zero point, narrow to int8
+        "li t6, 8\n\t"
+
+        "lwd a0, a2, 0(%[mult_ptr])\n\t"
+        "lwd a1, a3, 0(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
+        "vmulh.vx v16, v16, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v16, v16, a1\n\t"
+        "vadd.vx v16, v16, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"  // set vl = 8
+        "vnclip.wi v1, v16, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"  // set vl = 8
+        "vnclip.wi v16, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v18, v18, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v18, v18, a3\n\t"
+        "vadd.vx v18, v18, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v4, v18, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v18, v4, 0\n\t"
+
+        "lwd a0, a2, 8(%[mult_ptr])\n\t"
+        "lwd a1, a3, 8(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v20, v20, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v20, v20, a1\n\t"
+        "vadd.vx v20, v20, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v1, v20, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v20, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v22, v22, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v22, v22, a3\n\t"
+        "vadd.vx v22, v22, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v4, v22, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v22, v4, 0\n\t"
+
+        "lwd a0, a2, 16(%[mult_ptr])\n\t"
+        "lwd a1, a3, 16(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v24, v24, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v24, v24, a1\n\t"
+        "vadd.vx v24, v24, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v1, v24, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v24, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v26, v26, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v26, v26, a3\n\t"
+        "vadd.vx v26, v26, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v4, v26, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v26, v4, 0\n\t"
+
+        "lwd a0, a2, 24(%[mult_ptr])\n\t"
+        "lwd a1, a3, 24(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v28, v28, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v28, v28, a1\n\t"
+        "vadd.vx v28, v28, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v1, v28, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v28, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v30, v30, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v30, v30, a3\n\t"
+        "vadd.vx v30, v30, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v4, v30, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v30, v4, 0\n\t"
+
+        // store 8 rows of 8 int8 results, n bytes apart
+        "mv a0, t2\n\t"
+        "vse8.v v16, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v18, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v20, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v22, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v24, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v26, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v28, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v30, (a0)\n\t"
+        "addi t2, t2, 8\n\t"
+
+        "addi t1, t1, -1\n\t"
+        "bnez t1, 2b\n\t"
+
+        // m8n4
+        "6:\n\t"
+        "andi t1, %[n], 4\n\t"  // t1 = n & 4u (n4)
+        "beqz t1, 10f\n\t"  // if n4==0, jump to m8n_tail
+        "li t6, 4\n\t"
+        "vsetvli zero, t6, e32, m1\n\t"  // set vl = 4
+        // init out_tmp = bias
+        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
+        "vmv.v.x v16, t4\n\t"
+        "vmv.v.x v18, t5\n\t"
+        "lwd t4, t5, 8(%[bias_ptr])\n\t"  // bias_ptr[2]/[3]
+        "vmv.v.x v20, t4\n\t"
+        "vmv.v.x v22, t5\n\t"
+        "lwd t4, t5, 16(%[bias_ptr])\n\t"  // bias_ptr[4]/[5]
+        "vmv.v.x v24, t4\n\t"
+        "vmv.v.x v26, t5\n\t"
+        "lwd t4, t5, 24(%[bias_ptr])\n\t"  // bias_ptr[6]/[7]
+        "vmv.v.x v28, t4\n\t"
+        "vmv.v.x v30, t5\n\t"
+
+        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the start addr of the 8 kernel rows
+
+        // pre-load pb (input_data)
+        "vle32.v v1, (t3)\n\t"
+        "addi t3, t3, 16\n\t"
+
+        // pre-load pa(kernel_data)
+        "lwd a0, a1, 0(t5)\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+
+        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
+        "beqz t4, 8f\n\t"  // if k2 == 0, jump to m8n4k1
+
+        // m8n4k2
+        "7:\n\t"
+        "vle32.v v4, (t3)\n\t"
+        "addi t3, t3, 16\n\t"
+
+        "vmaqa.vx v16, a0, v1\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "vmaqa.vx v18, a1, v1\n\t"
+        "vmaqa.vx v20, a2, v1\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "addi t5, t5, 32\n\t"  // pa += 32 bytes (8 words)
+        "vmaqa.vx v22, a3, v1\n\t"
+        "vmaqa.vx v24, a4, v1\n\t"
+        "lwd a0, a1, 0(t5)\n\t"
+        "vmaqa.vx v26, a5, v1\n\t"
+        "vmaqa.vx v28, a6, v1\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "vmaqa.vx v30, a7, v1\n\t"  // 0
+
+        "vle32.v v1, (t3)\n\t"
+        "addi t3, t3, 16\n\t"
+
+        "vmaqa.vx v16, a0, v4\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "vmaqa.vx v18, a1, v4\n\t"
+        "vmaqa.vx v20, a2, v4\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v22, a3, v4\n\t"
+        "addi t5, t5, 32\n\t"  // pa += 32 bytes (8 words)
+
+        "vmaqa.vx v24, a4, v4\n\t"
+        "lwd a0, a1, 0(t5)\n\t"
+        "vmaqa.vx v26, a5, v4\n\t"
+        "vmaqa.vx v28, a6, v4\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "vmaqa.vx v30, a7, v4\n\t"  // 1
+
+        "addi t4, t4, -1\n\t"
+        "bnez t4, 7b\n\t"
+
+        // m8n4k1
+        "8:\n\t"
+        "andi t4, %[k], 4\n\t"  // t4 = k1
+        "beqz t4, 9f\n\t"  // if k1 == 0, jump to end kernel_m8n4
+
+        "vmaqa.vx v16, a0, v1\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "vmaqa.vx v18, a1, v1\n\t"
+        "vmaqa.vx v20, a2, v1\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v22, a3, v1\n\t"
+        "vmaqa.vx v24, a4, v1\n\t"
+        "vmaqa.vx v26, a5, v1\n\t"
+        "vmaqa.vx v28, a6, v1\n\t"
+        "vmaqa.vx v30, a7, v1\n\t"
+
+        "addi t3, t3, 16\n\t"  // ********************
+
+        // end kernel_m8n4
+        "9:\n\t"
+        // ********* bump pb to origin addr ************
+        // offset pre-load
+        "addi t3, t3, -16\n\t"  // pb -= 4
+
+        // post-processing: multiply-high, shift, add zero point, narrow to int8
+        "li t6, 4\n\t"
+
+        "lwd a0, a2, 0(%[mult_ptr])\n\t"
+        "lwd a1, a3, 0(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m1\n\t"  // set vl = 4
+        "vmulh.vx v16, v16, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v16, v16, a1\n\t"
+        "vadd.vx v16, v16, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"  // set vl = 4
+        "vnclip.wi v1, v16, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"  // set vl = 4
+        "vnclip.wi v16, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v18, v18, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v18, v18, a3\n\t"
+        "vadd.vx v18, v18, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v4, v18, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v18, v4, 0\n\t"
+
+        "lwd a0, a2, 8(%[mult_ptr])\n\t"
+        "lwd a1, a3, 8(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v20, v20, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v20, v20, a1\n\t"
+        "vadd.vx v20, v20, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v1, v20, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v20, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v22, v22, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v22, v22, a3\n\t"
+        "vadd.vx v22, v22, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v4, v22, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v22, v4, 0\n\t"
+
+        "lwd a0, a2, 16(%[mult_ptr])\n\t"
+        "lwd a1, a3, 16(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v24, v24, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v24, v24, a1\n\t"
+        "vadd.vx v24, v24, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v1, v24, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v24, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v26, v26, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v26, v26, a3\n\t"
+        "vadd.vx v26, v26, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v4, v26, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v26, v4, 0\n\t"
+
+        "lwd a0, a2, 24(%[mult_ptr])\n\t"
+        "lwd a1, a3, 24(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v28, v28, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v28, v28, a1\n\t"
+        "vadd.vx v28, v28, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v1, v28, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v28, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v30, v30, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v30, v30, a3\n\t"
+        "vadd.vx v30, v30, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v4, v30, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v30, v4, 0\n\t"
+
+        // store 8 rows of 4 int8 results, n bytes apart
+        "mv a0, t2\n\t"
+        "vse8.v v16, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v18, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v20, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v22, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v24, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v26, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v28, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v30, (a0)\n\t"
+        "addi t2, t2, 4\n\t"
+
+        // m8n_tail
+        "10:\n\t"
+        "andi t1, %[n], 3\n\t"  // t1 = n & 3u (n_tail)
+        "beqz t1, 14f\n\t"  // if n_tail==0, jump to end kernel_m8
+        "vsetvli zero, t1, e32, m1\n\t"  // set vl = n_tail
+        "slli t6, t1, 2\n\t"  // t6 = 4 * n_tail
+
+        // init out_tmp = bias
+        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
+        "vmv.v.x v16, t4\n\t"
+        "vmv.v.x v18, t5\n\t"
+        "lwd t4, t5, 8(%[bias_ptr])\n\t"  // bias_ptr[2]/[3]
+        "vmv.v.x v20, t4\n\t"
+        "vmv.v.x v22, t5\n\t"
+        "lwd t4, t5, 16(%[bias_ptr])\n\t"  // bias_ptr[4]/[5]
+        "vmv.v.x v24, t4\n\t"
+        "vmv.v.x v26, t5\n\t"
+        "lwd t4, t5, 24(%[bias_ptr])\n\t"  // bias_ptr[6]/[7]
+        "vmv.v.x v28, t4\n\t"
+        "vmv.v.x v30, t5\n\t"
+
+        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the start addr of the 8 kernel rows
+
+        // pre-load pb (input_data)
+        "vle32.v v1, (t3)\n\t"
+        "add t3, t3, t6\n\t"
+
+        // pre-load pa(kernel_data)
+        "lwd a0, a1, 0(t5)\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+
+        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
+        "beqz t4, 12f\n\t"  // if k2 == 0, jump to m8n_tail k1
+
+        // m8n_tailk2
+        "11:\n\t"
+        "vle32.v v4, (t3)\n\t"
+        "add t3, t3, t6\n\t"
+
+        "vmaqa.vx v16, a0, v1\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "vmaqa.vx v18, a1, v1\n\t"
+        "vmaqa.vx v20, a2, v1\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "addi t5, t5, 32\n\t"  // pa += 32 bytes (8 words)
+        "vmaqa.vx v22, a3, v1\n\t"
+        "vmaqa.vx v24, a4, v1\n\t"
+        "lwd a0, a1, 0(t5)\n\t"
+        "vmaqa.vx v26, a5, v1\n\t"
+        "vmaqa.vx v28, a6, v1\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "vmaqa.vx v30, a7, v1\n\t"  // 0
+
+        "vle32.v v1, (t3)\n\t"
+        "add t3, t3, t6\n\t"
+
+        "vmaqa.vx v16, a0, v4\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "vmaqa.vx v18, a1, v4\n\t"
+        "vmaqa.vx v20, a2, v4\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v22, a3, v4\n\t"
+        "addi t5, t5, 32\n\t"  // pa += 32 bytes (8 words)
+
+        "vmaqa.vx v24, a4, v4\n\t"
+        "lwd a0, a1, 0(t5)\n\t"
+        "vmaqa.vx v26, a5, v4\n\t"
+        "vmaqa.vx v28, a6, v4\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "vmaqa.vx v30, a7, v4\n\t"  // 1
+
+        "addi t4, t4, -1\n\t"
+        "bnez t4, 11b\n\t"
+
+        // m8n_tailk1
+        "12:\n\t"
+        "andi t4, %[k], 4\n\t"  // t4 = k1
+        "beqz t4, 13f\n\t"  // if k1 == 0, jump to end kernel_m8n_tail
+
+        "vmaqa.vx v16, a0, v1\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "vmaqa.vx v18, a1, v1\n\t"
+        "vmaqa.vx v20, a2, v1\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v22, a3, v1\n\t"
+        "vmaqa.vx v24, a4, v1\n\t"
+        "vmaqa.vx v26, a5, v1\n\t"
+        "vmaqa.vx v28, a6, v1\n\t"
+        "vmaqa.vx v30, a7, v1\n\t"
+
+        "add t3, t3, t6\n\t"  // ********************
+
+        // end kernel_m8n_tail
+        "13:\n\t"
+        // ********* bump pb to origin addr ************
+        // offset pre-load
+        "sub t3, t3, t6\n\t"  // pb -= n_tail
+
+        // post-processing: multiply-high, shift, add zero point, narrow to int8
+        "lwd a0, a2, 0(%[mult_ptr])\n\t"
+        "lwd a1, a3, 0(%[shift_ptr])\n\t"
+        "vsetvli zero, t1, e32, m1\n\t"  // set vl = n_tail
+        "vmulh.vx v16, v16, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v16, v16, a1\n\t"
+        "vadd.vx v16, v16, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"  // set vl = n_tail
+        "vnclip.wi v1, v16, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"  // set vl = n_tail
+        "vnclip.wi v16, v1, 0\n\t"
+
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v18, v18, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v18, v18, a3\n\t"
+        "vadd.vx v18, v18, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v4, v18, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v18, v4, 0\n\t"
+
+        "lwd a0, a2, 8(%[mult_ptr])\n\t"
+        "lwd a1, a3, 8(%[shift_ptr])\n\t"
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v20, v20, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v20, v20, a1\n\t"
+        "vadd.vx v20, v20, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v1, v20, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v20, v1, 0\n\t"
+
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v22, v22, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v22, v22, a3\n\t"
+        "vadd.vx v22, v22, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v4, v22, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v22, v4, 0\n\t"
+
+        "lwd a0, a2, 16(%[mult_ptr])\n\t"
+        "lwd a1, a3, 16(%[shift_ptr])\n\t"
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v24, v24, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v24, v24, a1\n\t"
+        "vadd.vx v24, v24, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v1, v24, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v24, v1, 0\n\t"
+
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v26, v26, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v26, v26, a3\n\t"
+        "vadd.vx v26, v26, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v4, v26, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v26, v4, 0\n\t"
+
+        "lwd a0, a2, 24(%[mult_ptr])\n\t"
+        "lwd a1, a3, 24(%[shift_ptr])\n\t"
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v28, v28, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v28, v28, a1\n\t"
+        "vadd.vx v28, v28, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v1, v28, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v28, v1, 0\n\t"
+
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v30, v30, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v30, v30, a3\n\t"
+        "vadd.vx v30, v30, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v4, v30, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v30, v4, 0\n\t"
+
+        // store 8 rows of n_tail int8 results, n bytes apart
+        "mv a0, t2\n\t"
+        "vse8.v v16, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v18, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v20, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v22, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v24, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v26, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v28, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v30, (a0)\n\t"
+        "add t2, t2, t1\n\t"
+
+        // end kernel_m8
+        "14:\n\t"
+        "addi %[bias_ptr], %[bias_ptr], 32\n\t"  // bias_data += 8
+        "addi %[mult_ptr], %[mult_ptr], 32\n\t"  // mult_ptr += 8
+        "addi %[shift_ptr], %[shift_ptr], 32\n\t"  // shift_ptr += 8
+        "slli t6, %[k], 3\n\t"
+        "add %[kernel_ptr], %[kernel_ptr], t6\n\t"  // kernel_data += 8 * k
+        "slli t6, %[n], 3\n\t"
+        "add %[output_ptr], %[output_ptr], t6\n\t"  // output_data += 8 * n
+
+        "addi t0, t0, -1\n\t"
+        "bnez t0, 1b\n\t"
+
+        // ending
+        "15:\n\t"
+
+        :
+        // Outputs.
+        [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias),
+        [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift)
+        :
+        // Inputs.
+        [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp)
+        :
+        // Clobbers.
+        "cc", "memory",
+        // We use these Vector registers. v3 is listed because the LMUL=2 loads into v2
+        // write the register group v2-v3.
+        "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+        "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+        // We use these general-purpose registers.
+        "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6");
+}
+
+/*
+ * int8 GEMM micro-kernel with fused requantization for exactly 4 output rows.
+ * Same scheme as the 8-row kernel: accumulators start at bias, vmaqa.vx consumes k in
+ * steps of 4 (k >> 3 double steps plus one step if (k & 4)), then each row is multiplied
+ * high by mult, shifted right by ~shift, offset by out_zp and narrowed to int8.
+ *
+ * dst    int8 output; 4 rows written n bytes apart
+ * sa     kernel data; each k-step reads 4 32-bit words (one per output row)
+ * sb     input data; each k-step reads vl 32-bit words, column blocks follow each other
+ * m      not referenced by the assembly
+ * n      columns, processed as blocks of 8, then one block of 4, then n & 3 tail columns
+ * bias, mult, shift   4 int32 values each, one per output row
+ *
+ * Only the local copy of sb is advanced (via %[input_ptr]); the other pointers are left
+ * untouched by the assembly.
+ */
+static inline void kernel_m4n8_int8_1(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n,
+                                      int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift)
+{
+    asm volatile(
+        // m4
+        "1:\n\t"
+        "srai t1, %[n], 3\n\t"  // t1 = n8
+        "mv t2, %[output_ptr]\n\t"  // init output addr
+
+        "beqz t1, 6f\n\t"  // if n8==0, jump to m4n4
+        // m4n8
+        "2:\n\t"
+        "li t6, 8\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
+        // init out_tmp = bias
+        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
+        "vmv.v.x v16, t4\n\t"
+        "vmv.v.x v18, t5\n\t"
+        "lwd t4, t5, 8(%[bias_ptr])\n\t"  // bias_ptr[2]/[3]
+        "vmv.v.x v20, t4\n\t"
+        "vmv.v.x v22, t5\n\t"
+
+        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the start addr of the 4 kernel rows
+
+        // pre-load pb (input_data)
+        "vle32.v v2, (%[input_ptr])\n\t"
+        "addi %[input_ptr], %[input_ptr], 32\n\t"
+
+        // pre-load pa(kernel_data)
+        "lwd a0, a1, 0(t5)\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+
+        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
+        "beqz t4, 4f\n\t"  // if k2 == 0, jump to m4n8k1
+
+        // m4n8k2
+        "3:\n\t"
+        "vle32.v v4, (%[input_ptr])\n\t"
+        "addi %[input_ptr], %[input_ptr], 32\n\t"
+
+        "vmaqa.vx v16, a0, v2\n\t"
+        "vmaqa.vx v18, a1, v2\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v20, a2, v2\n\t"
+        "vmaqa.vx v22, a3, v2\n\t"
+        "addi t5, t5, 32\n\t"  // pa += 32 bytes (two k-steps of 4 words)
+
+        "vle32.v v2, (%[input_ptr])\n\t"
+        "addi %[input_ptr], %[input_ptr], 32\n\t"
+
+        "vmaqa.vx v16, a4, v4\n\t"
+        "vmaqa.vx v18, a5, v4\n\t"
+        "lwd a0, a1, 0(t5)\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "vmaqa.vx v20, a6, v4\n\t"
+        "vmaqa.vx v22, a7, v4\n\t"
+
+        "addi t4, t4, -1\n\t"
+        "bnez t4, 3b\n\t"
+
+        // m4n8k1
+        "4:\n\t"
+        "andi t4, %[k], 4\n\t"  // t4 = k1
+        "beqz t4, 5f\n\t"  // if k1 == 0, jump to end kernel_m4n8
+
+        "vmaqa.vx v16, a0, v2\n\t"
+        "vmaqa.vx v18, a1, v2\n\t"
+        "vmaqa.vx v20, a2, v2\n\t"
+        "vmaqa.vx v22, a3, v2\n\t"
+
+        "addi %[input_ptr], %[input_ptr], 32\n\t"  // ********************
+
+        // end kernel_m4n8
+        "5:\n\t"
+        // ********* bump pb to origin addr ************
+        // offset pre-load
+        "addi %[input_ptr], %[input_ptr], -32\n\t"  // pb -= 8
+
+        // post-processing: multiply-high, shift, add zero point, narrow to int8
+        "li t6, 8\n\t"
+
+        "lwd a0, a2, 0(%[mult_ptr])\n\t"
+        "lwd a1, a3, 0(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
+        "vmulh.vx v16, v16, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v16, v16, a1\n\t"
+        "vadd.vx v16, v16, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"  // set vl = 8
+        "vnclip.wi v1, v16, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"  // set vl = 8
+        "vnclip.wi v16, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v18, v18, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v18, v18, a3\n\t"
+        "vadd.vx v18, v18, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v4, v18, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v18, v4, 0\n\t"
+
+        "lwd a0, a2, 8(%[mult_ptr])\n\t"
+        "lwd a1, a3, 8(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v20, v20, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v20, v20, a1\n\t"
+        "vadd.vx v20, v20, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v1, v20, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v20, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m2\n\t"
+        "vmulh.vx v22, v22, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v22, v22, a3\n\t"
+        "vadd.vx v22, v22, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, m1\n\t"
+        "vnclip.wi v4, v22, 0\n\t"
+        "vsetvli zero, t6, e8, mf2\n\t"
+        "vnclip.wi v22, v4, 0\n\t"
+
+        // store 4 rows of 8 int8 results, n bytes apart
+        "mv a0, t2\n\t"
+        "vse8.v v16, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v18, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v20, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v22, (a0)\n\t"
+        "addi t2, t2, 8\n\t"
+
+        "addi t1, t1, -1\n\t"
+        "bnez t1, 2b\n\t"
+
+        // m4n4
+        "6:\n\t"
+        "andi t1, %[n], 4\n\t"  // t1 = n & 4u (n4)
+        "beqz t1, 10f\n\t"  // if n4==0, jump to m4n_tail
+        "li t6, 4\n\t"
+        "vsetvli zero, t6, e32, m1\n\t"  // set vl = 4
+        // init out_tmp = bias
+        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
+        "vmv.v.x v16, t4\n\t"
+        "vmv.v.x v18, t5\n\t"
+        "lwd t4, t5, 8(%[bias_ptr])\n\t"  // bias_ptr[2]/[3]
+        "vmv.v.x v20, t4\n\t"
+        "vmv.v.x v22, t5\n\t"
+
+        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the start addr of the 4 kernel rows
+
+        // pre-load pb (input_data)
+        "vle32.v v1, (%[input_ptr])\n\t"
+        "addi %[input_ptr], %[input_ptr], 16\n\t"
+
+        // pre-load pa(kernel_data)
+        "lwd a0, a1, 0(t5)\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+
+        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
+        "beqz t4, 8f\n\t"  // if k2 == 0, jump to m4n4k1
+
+        // m4n4k2
+        "7:\n\t"
+        "vle32.v v4, (%[input_ptr])\n\t"
+        "addi %[input_ptr], %[input_ptr], 16\n\t"
+
+        "vmaqa.vx v16, a0, v1\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "vmaqa.vx v18, a1, v1\n\t"
+        "vmaqa.vx v20, a2, v1\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v22, a3, v1\n\t"
+        "addi t5, t5, 32\n\t"  // pa += 32 bytes (two k-steps of 4 words)
+
+        "vle32.v v1, (%[input_ptr])\n\t"
+        "addi %[input_ptr], %[input_ptr], 16\n\t"
+
+        "vmaqa.vx v16, a4, v4\n\t"
+        "lwd a0, a1, 0(t5)\n\t"
+        "vmaqa.vx v18, a5, v4\n\t"
+        "vmaqa.vx v20, a6, v4\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "vmaqa.vx v22, a7, v4\n\t"
+
+        "addi t4, t4, -1\n\t"
+        "bnez t4, 7b\n\t"
+
+        // m4n4k1
+        "8:\n\t"
+        "andi t4, %[k], 4\n\t"  // t4 = k1
+        "beqz t4, 9f\n\t"  // if k1 == 0, jump to end kernel_m4n4
+
+        "vmaqa.vx v16, a0, v1\n\t"
+        "vmaqa.vx v18, a1, v1\n\t"
+        "vmaqa.vx v20, a2, v1\n\t"
+        "vmaqa.vx v22, a3, v1\n\t"
+
+        "addi %[input_ptr], %[input_ptr], 16\n\t"  // ********************
+
+        // end kernel_m4n4
+        "9:\n\t"
+        // ********* bump pb to origin addr ************
+        // offset pre-load
+        "addi %[input_ptr], %[input_ptr], -16\n\t"  // pb -= 4
+
+        // post-processing: multiply-high, shift, add zero point, narrow to int8
+        "li t6, 4\n\t"
+
+        "lwd a0, a2, 0(%[mult_ptr])\n\t"
+        "lwd a1, a3, 0(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m1\n\t"  // set vl = 4
+        "vmulh.vx v16, v16, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v16, v16, a1\n\t"
+        "vadd.vx v16, v16, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"  // set vl = 4
+        "vnclip.wi v1, v16, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"  // set vl = 4
+        "vnclip.wi v16, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v18, v18, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v18, v18, a3\n\t"
+        "vadd.vx v18, v18, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v4, v18, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v18, v4, 0\n\t"
+
+        "lwd a0, a2, 8(%[mult_ptr])\n\t"
+        "lwd a1, a3, 8(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v20, v20, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v20, v20, a1\n\t"
+        "vadd.vx v20, v20, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v1, v20, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v20, v1, 0\n\t"
+
+        "vsetvli zero, t6, e32, m1\n\t"
+        "vmulh.vx v22, v22, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v22, v22, a3\n\t"
+        "vadd.vx v22, v22, %[out_zp]\n\t"
+        "vsetvli zero, t6, e16, mf2\n\t"
+        "vnclip.wi v4, v22, 0\n\t"
+        "vsetvli zero, t6, e8, mf4\n\t"
+        "vnclip.wi v22, v4, 0\n\t"
+
+        // store 4 rows of 4 int8 results, n bytes apart
+        "mv a0, t2\n\t"
+        "vse8.v v16, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v18, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v20, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v22, (a0)\n\t"
+        "addi t2, t2, 4\n\t"
+
+        // m4n_tail
+        "10:\n\t"
+        "andi t1, %[n], 3\n\t"  // t1 = n & 3u (n_tail)
+        "beqz t1, 14f\n\t"  // if n_tail==0, jump to end kernel_m4
+        "vsetvli zero, t1, e32, m1\n\t"  // set vl = n_tail
+        "slli t6, t1, 2\n\t"  // t6 = 4 * n_tail
+
+        // init out_tmp = bias
+        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
+        "vmv.v.x v16, t4\n\t"
+        "vmv.v.x v18, t5\n\t"
+        "lwd t4, t5, 8(%[bias_ptr])\n\t"  // bias_ptr[2]/[3]
+        "vmv.v.x v20, t4\n\t"
+        "vmv.v.x v22, t5\n\t"
+
+        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the start addr of the 4 kernel rows
+
+        // pre-load pb (input_data)
+        "vle32.v v1, (%[input_ptr])\n\t"
+        "add %[input_ptr], %[input_ptr], t6\n\t"
+
+        // pre-load pa(kernel_data)
+        "lwd a0, a1, 0(t5)\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+
+        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
+        "beqz t4, 12f\n\t"  // if k2 == 0, jump to m4n_tail k1
+
+        // m4n_tailk2
+        "11:\n\t"
+        "vle32.v v4, (%[input_ptr])\n\t"
+        "add %[input_ptr], %[input_ptr], t6\n\t"
+
+        "vmaqa.vx v16, a0, v1\n\t"
+        "lwd a4, a5, 16(t5)\n\t"
+        "vmaqa.vx v18, a1, v1\n\t"
+        "vmaqa.vx v20, a2, v1\n\t"
+        "lwd a6, a7, 24(t5)\n\t"
+        "vmaqa.vx v22, a3, v1\n\t"
+        "addi t5, t5, 32\n\t"  // pa += 32 bytes (two k-steps of 4 words)
+
+        "vle32.v v1, (%[input_ptr])\n\t"
+        "add %[input_ptr], %[input_ptr], t6\n\t"
+
+        "vmaqa.vx v16, a4, v4\n\t"
+        "lwd a0, a1, 0(t5)\n\t"
+        "vmaqa.vx v18, a5, v4\n\t"
+        "vmaqa.vx v20, a6, v4\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "vmaqa.vx v22, a7, v4\n\t"
+
+        "addi t4, t4, -1\n\t"
+        "bnez t4, 11b\n\t"
+
+        // m4n_tailk1
+        "12:\n\t"
+        "andi t4, %[k], 4\n\t"  // t4 = k1
+        "beqz t4, 13f\n\t"  // if k1 == 0, jump to end kernel_m4n_tail
+
+        "vmaqa.vx v16, a0, v1\n\t"
+        "vmaqa.vx v18, a1, v1\n\t"
+        "vmaqa.vx v20, a2, v1\n\t"
+        "vmaqa.vx v22, a3, v1\n\t"
+
+        "add %[input_ptr], %[input_ptr], t6\n\t"  // ********************
+
+        // end kernel_m4n_tail
+        "13:\n\t"
+        // ********* bump pb to origin addr ************
+        // offset pre-load
+        "sub %[input_ptr], %[input_ptr], t6\n\t"  // pb -= n_tail
+
+        // post-processing: multiply-high, shift, add zero point, narrow to int8
+        "lwd a0, a2, 0(%[mult_ptr])\n\t"
+        "lwd a1, a3, 0(%[shift_ptr])\n\t"
+        "vsetvli zero, t1, e32, m1\n\t"  // set vl = n_tail
+        "vmulh.vx v16, v16, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v16, v16, a1\n\t"
+        "vadd.vx v16, v16, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"  // set vl = n_tail
+        "vnclip.wi v1, v16, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"  // set vl = n_tail
+        "vnclip.wi v16, v1, 0\n\t"
+
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v18, v18, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v18, v18, a3\n\t"
+        "vadd.vx v18, v18, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v4, v18, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v18, v4, 0\n\t"
+
+        "lwd a0, a2, 8(%[mult_ptr])\n\t"
+        "lwd a1, a3, 8(%[shift_ptr])\n\t"
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v20, v20, a0\n\t"
+        "not a1, a1\n\t"
+        "vssra.vx v20, v20, a1\n\t"
+        "vadd.vx v20, v20, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v1, v20, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v20, v1, 0\n\t"
+
+        "vsetvli zero, t1, e32, m1\n\t"
+        "vmulh.vx v22, v22, a2\n\t"
+        "not a3, a3\n\t"
+        "vssra.vx v22, v22, a3\n\t"
+        "vadd.vx v22, v22, %[out_zp]\n\t"
+        "vsetvli zero, t1, e16, mf2\n\t"
+        "vnclip.wi v4, v22, 0\n\t"
+        "vsetvli zero, t1, e8, mf4\n\t"
+        "vnclip.wi v22, v4, 0\n\t"
+
+        // store 4 rows of n_tail int8 results, n bytes apart
+        "mv a0, t2\n\t"
+        "vse8.v v16, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v18, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v20, (a0)\n\t"
+        "add a0, a0, %[n]\n\t"
+        "vse8.v v22, (a0)\n\t"
+        "add t2, t2, t1\n\t"
+
+        // ending
+        "14:\n\t"
+
+        :
+        // Outputs.
+        [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias),
+        [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift)
+        :
+        // Inputs.
+        [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp)
+        :
+        // Clobbers.
+        "cc", "memory",
+        // We use these Vector registers.
+        "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+        // We use these general-purpose registers.
+        "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t1", "t2", "t4", "t5", "t6");
+}
+
+static inline void kernel_m2n8_int8_1(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n,
+                                      int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift)
+{
+    asm volatile(
+        // m4
+        "1:\n\t"
+        "srai t1, %[n], 3\n\t"  // t1 = n8
+        "mv t2, %[output_ptr]\n\t"  // init output addr
+
+        "beqz t1, 6f\n\t"  // if n8==0, jump to m4n4
+        // m4n8
+        "2:\n\t"
+        "li t6, 8\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
+        // init out_tmp = bias
+        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
+        "vmv.v.x v16, t4\n\t"
+        "vmv.v.x v18, t5\n\t"
+
+        "mv t5, %[kernel_ptr]\n\t"  // s2 hold kernel 8 lines start addr
+
+        // pre-load pb (input_data)
+        "vle32.v v2, (%[input_ptr])\n\t"
+        "addi %[input_ptr], %[input_ptr], 32\n\t"
+
+        // pre-load pa(kernel_data)
+        "lwd a0, a1, 0(t5)\n\t"
+
+        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
+        "beqz t4, 4f\n\t"  // if k2 == 0, jump to m4n8k1
+
+        // m4n8k2
+        "3:\n\t"
+        "vle32.v v4, (%[input_ptr])\n\t"
+        "addi %[input_ptr], %[input_ptr], 32\n\t"
+
+        "vmaqa.vx v16, a0, v2\n\t"
+        "vmaqa.vx v18, a1, v2\n\t"
+        "lwd a2, a3, 8(t5)\n\t"
+        "addi t5, t5, 16\n\t"
+
+        "vle32.v v2, (%[input_ptr])\n\t"
+        "addi %[input_ptr], %[input_ptr], 32\n\t"
+
+        "vmaqa.vx v16, a2, v4\n\t"
+        "vmaqa.vx v18, a3, v4\n\t"
+        "lwd a0, a1, 0(t5)\n\t"
+
+        "addi t4, t4, -1\n\t"
+        "bnez t4, 3b\n\t"
+
+        // m4n8k1
+        "4:\n\t"
+        "andi t4, %[k], 4\n\t"  // t4 = k1
+        "beqz t4, 5f\n\t"  // if k1 == 0, jump to end kernel_m4n8
+
+        "vmaqa.vx v16, a0, v2\n\t"
+        "vmaqa.vx v18, a1, v2\n\t"
+
+        "addi %[input_ptr], %[input_ptr], 32\n\t"  // ********************
+
+        // end kernel_m4n8
+        "5:\n\t"
+        // ********* bump pb to origin addr ************
+        // offset pre-load
+        "addi %[input_ptr], %[input_ptr], -32\n\t"  // pb -= 8
+
+        // post-processing
+        "li t6, 8\n\t"
+
+        "lwd a0, a2, 0(%[mult_ptr])\n\t"
+        "lwd a1, a3, 0(%[shift_ptr])\n\t"
+        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
+        "vmulh.vx v16, v16, a0\n\t"
+        "not a1, a1\n\t"
// "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "addi t2, t2, 8\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, %[n], 4\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a3, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 
16\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + // 后处理 + "li t6, 4\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "addi t2, t2, 4\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, %[n], 3\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], 
t6\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a3, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "t1", "t2", "t4", "t5", "t6"); +} + +static inline void kernel_m1n8_int8_1(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +{ + asm volatile( + // m4 + "1:\n\t" + "srai t1, %[n], 3\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m4n8k1 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vmaqa.vx v16, a0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + // 后处理 + "li t6, 8\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 
0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "addi t2, t2, 8\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n4 + "6:\n\t" + "andi t1, %[n], 4\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m1n_tail + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // t5 = kernel (sa) start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m1n4k1 + + // m1n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m1n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vmaqa.vx v16, a0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m1n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + // post-processing: scale by mult/shift, add out_zp, narrow to int8 + "li t6, 4\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" 
+ "addi t2, t2, 4\n\t" + + // m1n_tail + "10:\n\t" + "andi t1, %[n], 3\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m1 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // t5 = kernel (sa) start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m1n_tail k1 + + // m1n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m1n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n_tail + + "vmaqa.vx v16, a0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m1n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + // post-processing: scale by mult/shift, add out_zp, narrow to int8 + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs. 
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t1", "t2", "t4", "t5", "t6"); +} + +// m8n8 --> m8n4 --> m8n2 --> m8n1 +// need to modify reorder_input +/* + * int8 kernel that handles m >> 3 groups of 8 output rows (leftover rows are + * not touched here). Per group, columns go in blocks of 8, 4, 2 and 1. Each + * vector accumulator holds one output column for the 8 rows (vl = 8, e32) and + * starts from bias[0..7]; kernel data is loaded as vectors, input data as + * scalars. Results are scaled with mult[0..7], shifted by ~shift[0..7], offset + * by out_zp, narrowed to 8 bits and written with a stride of n bytes. + */ +static inline void kernel_m8n8_int8_2(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +{ + asm volatile( + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + // m8 + "1:\n\t" + "srai t1, %[n], 3\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + "mv t3, %[input_ptr]\n\t" // t3 hold input data start addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m8n4 + // m8n8 + "2:\n\t" + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "vle32.v v16, (%[bias_ptr])\n\t" + "vmv.v.v v18, v16\n\t" + "vmv.v.v v20, v16\n\t" + "vmv.v.v v22, v16\n\t" + "vmv.v.v v24, v16\n\t" + "vmv.v.v v26, v16\n\t" + "vmv.v.v v28, v16\n\t" + "vmv.v.v v30, v16\n\t" + // "vle32.v v18, (%[bias_ptr])\n\t" + // "vle32.v v20, (%[bias_ptr])\n\t" + // "vle32.v v22, (%[bias_ptr])\n\t" + // "vle32.v v24, (%[bias_ptr])\n\t" + // "vle32.v v26, (%[bias_ptr])\n\t" + // "vle32.v v28, (%[bias_ptr])\n\t" + // "vle32.v v30, (%[bias_ptr])\n\t" + + "mv t5, %[kernel_ptr]\n\t" // t5 = kernel (sa) start addr of these 8 rows + + // pre-load pa(kernel_data) + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + // pre-load pb (input_data) + "lwd a0, a1, 0(t3)\n\t" + "lwd a2, a3, 8(t3)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 
4f\n\t" // if k2 == 0, jump to m8n8k1 + + // m8n8k2 + "3:\n\t" + "vle32.v v4, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a4, a5, 16(t3)\n\t" + "lwd a6, a7, 24(t3)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "addi t3, t3, 32\n\t" + "lwd a0, a1, 0(t3)\n\t" + "lwd a2, a3, 8(t3)\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lwd a4, a5, 16(t3)\n\t" + "lwd a6, a7, 24(t3)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t3, t3, 32\n\t" // += 16 elements + "lwd a0, a1, 0(t3)\n\t" + "lwd a2, a3, 8(t3)\n\t" + "vmaqa.vx v24, a4, v4\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "vmaqa.vx v30, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m8n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "lwd a4, a5, 16(t3)\n\t" + "lwd a6, a7, 24(t3)\n\t" + "addi t3, t3, 32\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + // end kernel_m8n8 + "5:\n\t" + + // post-processing: scale by mult/shift, add out_zp, narrow to int8 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + "vmulh.vv v16, v16, v8\n\t" + "vssra.vv v16, v16, v10\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v0, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v18, v18, v8\n\t" + "vssra.vv v18, v18, v10\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, 
t6, e16, m1\n\t" + "vnclip.wi v1, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v20, v20, v8\n\t" + "vssra.vv v20, v20, v10\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v22, v22, v8\n\t" + "vssra.vv v22, v22, v10\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v24, v24, v8\n\t" + "vssra.vv v24, v24, v10\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v24, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v24, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v26, v26, v8\n\t" + "vssra.vv v26, v26, v10\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v26, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v26, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v28, v28, v8\n\t" + "vssra.vv v28, v28, v10\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v28, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v28, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v30, v30, v8\n\t" + "vssra.vv v30, v30, v10\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v30, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v30, v1, 0\n\t" + + "vsse8.v v16, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v18, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v20, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v22, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v24, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v26, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + 
"vsse8.v v28, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v30, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m8n4 + "6:\n\t" + "andi t1, %[n], 4\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m8n2 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "vle32.v v16, (%[bias_ptr])\n\t" + "vmv.v.v v18, v16\n\t" + "vmv.v.v v20, v16\n\t" + "vmv.v.v v22, v16\n\t" + + "mv t5, %[kernel_ptr]\n\t" // t5 = kernel (sa) start addr of these 8 rows + + // pre-load pa(kernel_data) + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + // pre-load pb (input_data) + "lwd a0, a1, 0(t3)\n\t" + "lwd a2, a3, 8(t3)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lwd a4, a5, 16(t3)\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a6, a7, 24(t3)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" // 0 + "addi t3, t3, 32\n\t" + + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a4, v4\n\t" + "lwd a0, a1, 0(t3)\n\t" + "vmaqa.vx v18, a5, v4\n\t" + "lwd a2, a3, 8(t3)\n\t" + "vmaqa.vx v20, a6, v4\n\t" + "vmaqa.vx v22, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m8n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "addi t3, t3, 16\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + + // end kernel_m8n4 + "9:\n\t" + + // post-processing: scale by mult/shift, add out_zp, narrow to int8 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v16, v16, v8\n\t" + "vssra.vv v16, v16, v10\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, 
v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v16, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v18, v18, v8\n\t" + "vssra.vv v18, v18, v10\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v20, v20, v8\n\t" + "vssra.vv v20, v20, v10\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v22, v22, v8\n\t" + "vssra.vv v22, v22, v10\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v1, 0\n\t" + + "vsse8.v v16, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v18, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v20, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v22, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + + // m8n2 + "10:\n\t" + "andi t1, %[n], 2\n\t" // t1 = n & 2u + "beqz t1, 14f\n\t" // if n2==0, jump to kernel_m8n1 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + + // init out_tmp = bias + "vle32.v v16, (%[bias_ptr])\n\t" + "vmv.v.v v18, v16\n\t" + + "mv t5, %[kernel_ptr]\n\t" // t5 = kernel (sa) start addr of these 8 rows + + // pre-load pa(kernel_data) + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + // pre-load pb (input_data) + "lwd a0, a1, 0(t3)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n2k1 + + // m8n2k2 + "11:\n\t" + "vle32.v v4, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lwd a2, a3, 8(t3)\n\t" + "vmaqa.vx v18, a1, v2\n\t" // 0 + "addi t3, t3, 16\n\t" + + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t3)\n\t" + "vmaqa.vx v18, a3, v4\n\t" // 1 + + "addi t4, t4, 
-1\n\t" + "bnez t4, 11b\n\t" + + // m8n2k1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n2 + + "addi t3, t3, 8\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + + // end kernel_m8n2 + "13:\n\t" + // post-processing: scale by mult/shift, add out_zp, narrow to int8 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v16, v16, v8\n\t" + "vssra.vv v16, v16, v10\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v16, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v18, v18, v8\n\t" + "vssra.vv v18, v18, v10\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v1, 0\n\t" + + "vsse8.v v16, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v18, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + + // m8n1 + "14:\n\t" + "andi t1, %[n], 1\n\t" // t1 = n & 1u + "beqz t1, 18f\n\t" // if n1==0, jump to end kernel_m8 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + + // init out_tmp = bias + "vle32.v v16, (%[bias_ptr])\n\t" + + "mv t5, %[kernel_ptr]\n\t" // t5 = kernel (sa) start addr of these 8 rows + + // pre-load pa(kernel_data) + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + // pre-load pb (input_data) + "lw a0, 0(t3)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 16f\n\t" // if k2 == 0, jump to m8n1k1 + + // m8n1k2 + "15:\n\t" + "vle32.v v4, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lw a1, 4(t3)\n\t" + "addi t3, t3, 8\n\t" + + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t3)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 15b\n\t" + + // m8n1k1 + "16:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 
17f\n\t" // if k1 == 0, jump to end kernel_m8n1 + + "addi t3, t3, 4\n\t" + "vmaqa.vx v16, a0, v2\n\t" + // end kernel_m8n1 + "17:\n\t" + // post-processing: scale by mult/shift, add out_zp, narrow to int8 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v16, v16, v8\n\t" + "vssra.vv v16, v16, v10\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v16, v0, 0\n\t" + + "vsse8.v v16, (t2), %[n]\n\t" + // "addi t2, t2, 1\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "addi %[mult_ptr], %[mult_ptr], 32\n\t" // mult_ptr += 8 + "addi %[shift_ptr], %[shift_ptr], 32\n\t" // shift_ptr += 8 + "slli t6, %[k], 3\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[n], 3\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * n + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); +} + +static inline void kernel_m8n12_int8(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +{ + asm volatile( + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + // m8 + "1:\n\t" + "mv t1, %[n]\n\t" + "li t6, 12\n\t" + "mv t2, %[output_ptr]\n\t" // init output addr + "mv t3, %[input_ptr]\n\t" // t3 hold input data start addr + + "blt t1, t6, 6f\n\t" // if n < 12, jump to m8n8 + + // m8n12 + "2:\n\t" + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v8, t4\n\t" + "vmv.v.x v9, t4\n\t" + "vmv.v.x v10, t4\n\t" + "vmv.v.x v11, t5\n\t" + "vmv.v.x v12, t5\n\t" + "vmv.v.x v13, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v14, t4\n\t" + "vmv.v.x v15, t4\n\t" + "vmv.v.x v16, t4\n\t" + "vmv.v.x v17, t5\n\t" + "vmv.v.x v18, t5\n\t" + "vmv.v.x v19, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v21, t4\n\t" + "vmv.v.x v22, t4\n\t" + "vmv.v.x v23, t5\n\t" + "vmv.v.x v24, t5\n\t" + "vmv.v.x v25, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v26, t4\n\t" + "vmv.v.x v27, t4\n\t" + "vmv.v.x v28, t4\n\t" + "vmv.v.x v29, t5\n\t" + "vmv.v.x v30, t5\n\t" + "vmv.v.x v31, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v3, (t3)\n\t" + "addi t3, t3, 16\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m8n12k1 + + // m8n12k2 + "3:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v5, (t3)\n\t" + "addi t3, 
t3, 16\n\t" + "vle32.v v6, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v8, a0, v1\n\t" + "vmaqa.vx v9, a0, v2\n\t" + "vmaqa.vx v10, a0, v3\n\t" + "vmaqa.vx v11, a1, v1\n\t" + "vmaqa.vx v12, a1, v2\n\t" + "vmaqa.vx v13, a1, v3\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v14, a2, v1\n\t" + "vmaqa.vx v15, a2, v2\n\t" + "vmaqa.vx v16, a2, v3\n\t" + "vmaqa.vx v17, a3, v1\n\t" + "vmaqa.vx v18, a3, v2\n\t" + "vmaqa.vx v19, a3, v3\n\t" + "addi t5, t5, 32\n\t" + + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v20, a4, v1\n\t" + "vmaqa.vx v21, a4, v2\n\t" + "vmaqa.vx v22, a4, v3\n\t" + "vmaqa.vx v23, a5, v1\n\t" + "vmaqa.vx v24, a5, v2\n\t" + "vmaqa.vx v25, a5, v3\n\t" + "vmaqa.vx v26, a6, v1\n\t" + "vmaqa.vx v27, a6, v2\n\t" + "vmaqa.vx v28, a6, v3\n\t" + "vmaqa.vx v29, a7, v1\n\t" + "vmaqa.vx v30, a7, v2\n\t" + "vmaqa.vx v31, a7, v3\n\t" + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v3, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v8, a0, v4\n\t" + "vmaqa.vx v9, a0, v5\n\t" + "vmaqa.vx v10, a0, v6\n\t" + "vmaqa.vx v11, a1, v4\n\t" + "vmaqa.vx v12, a1, v5\n\t" + "vmaqa.vx v13, a1, v6\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v14, a2, v4\n\t" + "vmaqa.vx v15, a2, v5\n\t" + "vmaqa.vx v16, a2, v6\n\t" + "vmaqa.vx v17, a3, v4\n\t" + "vmaqa.vx v18, a3, v5\n\t" + "vmaqa.vx v19, a3, v6\n\t" + "addi t5, t5, 32\n\t" + + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v20, a4, v4\n\t" + "vmaqa.vx v21, a4, v5\n\t" + "vmaqa.vx v22, a4, v6\n\t" + "vmaqa.vx v23, a5, v4\n\t" + "vmaqa.vx v24, a5, v5\n\t" + "vmaqa.vx v25, a5, v6\n\t" + "vmaqa.vx v26, a6, v4\n\t" + "vmaqa.vx v27, a6, v5\n\t" + "vmaqa.vx v28, a6, v6\n\t" + "vmaqa.vx v29, a7, v4\n\t" + "vmaqa.vx v30, a7, v5\n\t" + "vmaqa.vx v31, a7, v6\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m8m12k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if 
k1 == 0, jump to end kernel_m8n12 + + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v8, a0, v1\n\t" + "vmaqa.vx v9, a0, v2\n\t" + "vmaqa.vx v10, a0, v3\n\t" + "vmaqa.vx v11, a1, v1\n\t" + "vmaqa.vx v12, a1, v2\n\t" + "vmaqa.vx v13, a1, v3\n\t" + "vmaqa.vx v14, a2, v1\n\t" + "vmaqa.vx v15, a2, v2\n\t" + "vmaqa.vx v16, a2, v3\n\t" + "vmaqa.vx v17, a3, v1\n\t" + "vmaqa.vx v18, a3, v2\n\t" + "vmaqa.vx v19, a3, v3\n\t" + "vmaqa.vx v20, a4, v1\n\t" + "vmaqa.vx v21, a4, v2\n\t" + "vmaqa.vx v22, a4, v3\n\t" + "vmaqa.vx v23, a5, v1\n\t" + "vmaqa.vx v24, a5, v2\n\t" + "vmaqa.vx v25, a5, v3\n\t" + "vmaqa.vx v26, a6, v1\n\t" + "vmaqa.vx v27, a6, v2\n\t" + "vmaqa.vx v28, a6, v3\n\t" + "vmaqa.vx v29, a7, v1\n\t" + "vmaqa.vx v30, a7, v2\n\t" + "vmaqa.vx v31, a7, v3\n\t" + + "addi t3, t3, 48\n\t" // ******************** + + // end kernel_m8n12 + "5:\n\t" + + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -48\n\t" // pb -= 8 + // 后处理 + "li t6, 4\n\t" + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v8, v8, a0\n\t" + "vmulh.vx v9, v9, a0\n\t" + "vmulh.vx v10, v10, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v8, v8, a1\n\t" + "vssra.vx v9, v9, a1\n\t" + "vssra.vx v10, v10, a1\n\t" + "vadd.vx v8, v8, %[out_zp]\n\t" + "vadd.vx v9, v9, %[out_zp]\n\t" + "vadd.vx v10, v10, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v8, 0\n\t" + "vnclip.wi v2, v9, 0\n\t" + "vnclip.wi v3, v10, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v8, v1, 0\n\t" + "vnclip.wi v9, v2, 0\n\t" + "vnclip.wi v10, v3, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v11, v11, a2\n\t" + "vmulh.vx v12, v12, a2\n\t" + "vmulh.vx v13, v13, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v11, v11, a3\n\t" + "vssra.vx v12, v12, a3\n\t" + "vssra.vx v13, v13, a3\n\t" + "vadd.vx v11, v11, %[out_zp]\n\t" + "vadd.vx v12, v12, %[out_zp]\n\t" + 
"vadd.vx v13, v13, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v11, 0\n\t" + "vnclip.wi v5, v12, 0\n\t" + "vnclip.wi v6, v13, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v11, v4, 0\n\t" + "vnclip.wi v12, v5, 0\n\t" + "vnclip.wi v13, v6, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v14, v14, a0\n\t" + "vmulh.vx v15, v15, a0\n\t" + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v14, v14, a1\n\t" + "vssra.vx v15, v15, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v14, v14, %[out_zp]\n\t" + "vadd.vx v15, v15, %[out_zp]\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v14, 0\n\t" + "vnclip.wi v2, v15, 0\n\t" + "vnclip.wi v3, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v14, v1, 0\n\t" + "vnclip.wi v15, v2, 0\n\t" + "vnclip.wi v16, v3, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v17, v17, a2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "vmulh.vx v19, v19, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v17, v17, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vssra.vx v19, v19, a3\n\t" + "vadd.vx v17, v17, %[out_zp]\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vadd.vx v19, v19, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v17, 0\n\t" + "vnclip.wi v5, v18, 0\n\t" + "vnclip.wi v6, v19, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v17, v4, 0\n\t" + "vnclip.wi v18, v5, 0\n\t" + "vnclip.wi v19, v6, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "vmulh.vx v21, v21, a0\n\t" + "vmulh.vx v22, v22, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vssra.vx v21, v21, a1\n\t" + "vssra.vx v22, v22, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vadd.vx v21, v21, %[out_zp]\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" 
+ "vnclip.wi v2, v21, 0\n\t" + "vnclip.wi v3, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + "vnclip.wi v21, v2, 0\n\t" + "vnclip.wi v22, v3, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v23, v23, a2\n\t" + "vmulh.vx v24, v24, a2\n\t" + "vmulh.vx v25, v25, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v23, v23, a3\n\t" + "vssra.vx v24, v24, a3\n\t" + "vssra.vx v25, v25, a3\n\t" + "vadd.vx v23, v23, %[out_zp]\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vadd.vx v25, v25, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v23, 0\n\t" + "vnclip.wi v5, v24, 0\n\t" + "vnclip.wi v6, v25, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v23, v4, 0\n\t" + "vnclip.wi v24, v5, 0\n\t" + "vnclip.wi v25, v6, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v26, v26, a0\n\t" + "vmulh.vx v27, v27, a0\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v26, v26, a1\n\t" + "vssra.vx v27, v27, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vadd.vx v27, v27, %[out_zp]\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v26, 0\n\t" + "vnclip.wi v2, v27, 0\n\t" + "vnclip.wi v3, v28, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v26, v1, 0\n\t" + "vnclip.wi v27, v2, 0\n\t" + "vnclip.wi v28, v3, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v29, v29, a2\n\t" + "vmulh.vx v30, v30, a2\n\t" + "vmulh.vx v31, v31, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v29, v29, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vssra.vx v31, v31, a3\n\t" + "vadd.vx v29, v29, %[out_zp]\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vadd.vx v31, v31, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v29, 0\n\t" + "vnclip.wi v5, v30, 0\n\t" + "vnclip.wi v6, v31, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v29, v4, 0\n\t" + "vnclip.wi v30, v5, 0\n\t" + "vnclip.wi v31, 
v6, 0\n\t" + + "addi t6, %[n], -8\n\t" + "mv a0, t2\n\t" + "vse8.v v8, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v9, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v10, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v11, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v12, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v13, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v14, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v15, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v17, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v18, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v19, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v20, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v21, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v23, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v24, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v25, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v26, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v27, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v29, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v30, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v31, (a0)\n\t" + + "addi t2, t2, 12\n\t" + + "li t6, 12\n\t" + "addi t1, t1, -12\n\t" + "bge t1, t6, 2b\n\t" + + // m8n8 + "6:\n\t" + "li t6, 8\n\t" + "blt t1, t6, 10f\n\t" + "addi t1, t1, -8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 32\n\t" + 
+ // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n8k1 + + // m8n8k2 + "7:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "addi t5, t5, 32\n\t" + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 32\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v24, a4, v4\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "vmaqa.vx v30, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "addi t3, t3, 32\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -32\n\t" // pb -= 8 + + // 后处理 + "li t6, 8\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, 
t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx 
v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 8\n\t" + + // m8n4 + "10:\n\t" + "li t6, 4\n\t" + "blt t1, t6, 14f\n\t" // if n4==0, jump to m8n_tail + "addi t1, t1, -4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "11:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + 
"vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "addi t5, t5, 32\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m8n4k1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "addi t3, t3, 16\n\t" // ******************** + + // end kernel_m8n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -16\n\t" // pb -= 4 + + // 后处理 + "li t6, 4\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 
0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" 
+ "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 4\n\t" + + // m8n_tail + "14:\n\t" + "beqz t1, 18f\n\t" // if n_tail==0, jump to end kernel_m8 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 16f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "15:\n\t" + "vle32.v v4, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "addi t5, t5, 32\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi 
t5, t5, 32\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 15b\n\t" + + // m8n_tailk1 + "16:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "add t3, t3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub t3, t3, t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" 
+ "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "add t2, t2, t1\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "addi %[mult_ptr], %[mult_ptr], 
32\n\t" // mult_ptr += 8 + "addi %[shift_ptr], %[shift_ptr], 32\n\t" // shift_ptr += 8 + "slli t6, %[k], 3\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[n], 3\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * n + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); +} +/* int8 GEMM driver: the multiple-of-8 part of m goes to kernel_m8n8_int8_1, the m % 8 rest to the 4-, 2- and 1-row kernels. */ +void shl_c908_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc, int32_t out_zp, int32_t *mult, int32_t *shift) /* ldc is never read in this function */ +{ + int8_t *kernel_ptr = (int8_t *)sa; + int8_t *input_ptr = (int8_t *)sb; + int8_t *output_ptr = dst; + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + int32_t *bias_ptr = bias; + + int tail = m % 8; /* rows left over once the multiple-of-8 part is done */ + if (m >= 8) { /* >= rather than >: for m == 8 the tail is 0, so no remainder branch below would compute it */ + kernel_m8n8_int8_1(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += (m - tail) * n; /* step every pointer past the (m - tail) rows handled above */ + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + mult += (m - tail); + shift += (m - tail); + } + if (tail & 4) { /* NOTE(review): the remainder kernels are handed the full m, not the tail -- confirm they ignore it */ + kernel_m4n8_int8_1(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + mult += 4; + shift += 4; + } + if (tail & 2) { + kernel_m2n8_int8_1(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + 
output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + mult += 2; + shift += 2; + } + if (tail & 1) { + kernel_m1n8_int8_1(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + mult += 1; + shift += 1; + } +} diff --git a/source/c908_opt/gemm_int8_packn.c b/source/c908_opt/gemm_int8_packn.c new file mode 100644 index 00000000..1dacb05d --- /dev/null +++ b/source/c908_opt/gemm_int8_packn.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void gemm_int8_ncxhwx_12xpackn(int8_t *output, const int8_t *kernel, const int8_t *input, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); +void gemm_int8_ncxhwx_8xpackn(int8_t *output, const int8_t *kernel, const int8_t *input, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); /* declared here but not called anywhere in this file */ +/* Runs gemm_int8_ncxhwx_12xpackn over the m rows of sa/dst in blocks of packn rows, then once more for any leftover rows. */ +void shl_c908_ncxhwx_gemm_12xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; /* half of what csrr_vlenb() returns (presumably the vector length in bytes -- TODO confirm) */ + + int oc = 0; /* index of the first row not yet processed; also the offset applied to mult and shift */ + for (; oc + packn - 1 < m; oc += packn) { /* full blocks: runs while at least packn rows remain */ + gemm_int8_ncxhwx_12xpackn(dst, sa, sb, bias, packn, k, n, out_zp, mult + oc, shift + oc); + sa += packn * k; /* sa, dst and bias are stepped in place; sb is passed unchanged to every call */ + dst += packn * n; + // bias is advanced without a NULL check: use the fuse_zp2bias option in hhb so that bias is never NULL + bias += packn; + } + if (oc < m) { /* fewer than packn rows are left: one last call with m - oc rows */ + gemm_int8_ncxhwx_12xpackn(dst, sa, sb, bias, m - oc, k, n, out_zp, mult + oc, shift + oc); + } +} diff --git a/source/c908_opt/gemm_int8_v256.c b/source/c908_opt/gemm_int8_v256.c new file mode 100644 index 00000000..a0281bfb --- /dev/null +++ b/source/c908_opt/gemm_int8_v256.c @@ -0,0 +1,1714 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 256 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +// 如果使能xtheadc, 可用lwd指令 +static inline void kernel_m8n16_int8_v256(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + asm volatile( + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "srai t1, %[n], 4\n\t" // t1 = n16 + "mv t2, %[output_ptr]\n\t" // init output addr + "mv t3, %[input_ptr]\n\t" // t3 hold input data start addr + + "beqz t1, 6f\n\t" // if n16==0, jump to m8n8 + // m8n8 + "2:\n\t" + "li t6, 16\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 16 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 64\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m8n8k1 + + // m8n16k2 + "3:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 64\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "addi t5, t5, 32\n\t" + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx 
v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 64\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v24, a4, v4\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "vmaqa.vx v30, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m8n16k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "addi t3, t3, 64\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -64\n\t" // pb -= 8 + + // 后处理 + "li t6, 16\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 16 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 16 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 16 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx 
v20, v20, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + 
"vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m8n8 + "6:\n\t" + "andi t1, %[n], 8\n\t" // t1 = n & 8u (n8) + "beqz t1, 10f\n\t" // if n8==0, jump to m8n_tail + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 8 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 32\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n8k2 + "7:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 32\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "addi t5, t5, 32\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 32\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, 
v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "addi t3, t3, 32\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -32\n\t" // pb -= 8 + + // 后处理 + "li t6, 8\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, 
a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 8\n\t" + + // m8n_tail + "10:\n\t" + "andi t1, %[n], 7\n\t" // t1 = n & 7u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to 
end kernel_m8 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "addi t5, t5, 32\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + 
"vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "add t3, t3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub t3, t3, t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t1, e16, 
mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "add t2, t2, t1\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "addi %[mult_ptr], %[mult_ptr], 32\n\t" // mult_ptr += 8 + "addi %[shift_ptr], %[shift_ptr], 32\n\t" // shift_ptr += 8 + "slli t6, %[k], 3\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[n], 3\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * n + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. 
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); +} + +static inline void kernel_m4n16_int8_v256(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + asm volatile( + // m4 + "1:\n\t" + "srai t1, %[n], 4\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + "li t6, 16\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m4n8k1 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "addi t5, t5, 32\n\t" + + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a4, v4\n\t" + 
"vmaqa.vx v18, a5, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v20, a6, v4\n\t" + "vmaqa.vx v22, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + // 后处理 + "li t6, 16\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + 
"vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "addi t2, t2, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, %[n], 8\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "addi t5, t5, 32\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a5, v4\n\t" + "vmaqa.vx v20, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v22, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "vmaqa.vx v22, a3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 
32\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + // 后处理 + "li t6, 8\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "addi t2, t2, 8\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, %[n], 7\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + 
"lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "addi t5, t5, 32\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a5, v4\n\t" + "vmaqa.vx v20, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v22, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "vmaqa.vx v22, a3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, 
t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t1", "t2", "t4", "t5", "t6"); +} + +static inline void kernel_m2n16_int8_v256(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + asm volatile( + // m4 + "1:\n\t" + "srai t1, %[n], 4\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + "li t6, 16\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m4n8k1 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a2, a3, 8(t5)\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "vmaqa.vx v18, a3, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + // 后处理 + "li t6, 16\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, 
a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "addi t2, t2, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, %[n], 8\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a3, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + + "addi %[input_ptr], 
%[input_ptr], 32\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + // 后处理 + "li t6, 8\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "addi t2, t2, 8\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, %[n], 7\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], 
%[input_ptr], t6\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a3, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "t1", "t2", "t4", "t5", "t6"); +} + +/* + * Single-row (m = 1) int8 GEMM micro-kernel written as one inline-asm block. + * The n columns are handled as n / 16 blocks of 16, then one block of 8 (n & 8), then an n & 7 tail. + * Each block starts its int32 accumulator from bias[0], walks k four int8 values at a time + * (k / 8 double steps plus one extra step when k & 4), then rescales with mult[0] / shift[0], + * adds out_zp, narrows to int8 and stores to dst. + * The m operand is bound but never read by the assembly. + * NOTE(review): a k & 3 remainder is not consumed here - presumably the caller pads k to a multiple of 4; confirm. + */ +static inline void kernel_m1n16_int8_v256(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + asm volatile( + // m1: single output row + "1:\n\t" + "srai t1, %[n], 4\n\t" // t1 = n / 16 (number of 16-column blocks) + "mv t2, %[output_ptr]\n\t" // init output addr + + "beqz t1, 6f\n\t" // if there is no 16-column block, jump to the 8-column block + // m1n16 + "2:\n\t" + "li t6, 16\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 16 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // t5 = start of the kernel row + + // pre-load pb (input_data) + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k / 8 (two 4-byte kernel words per iteration) + "beqz t4, 4f\n\t" // if t4 == 0, jump to the single remaining step + + // m1n16, two k4 steps per iteration + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m1n16, remaining k4 step + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k & 4 + "beqz t4, 5f\n\t" // if there is no remaining step, jump to end of m1n16 + + "vmaqa.vx v16, a0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // step past the block just consumed + + // end kernel_m1n16 + "5:\n\t" + // undo the extra advance made by the pre-load + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 64 bytes + + // post-processing (requantize to int8) + "li t6, 16\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 16 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 16 + "vnclip.wi v1,
v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 16 + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "addi t2, t2, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n8 + "6:\n\t" + "andi t1, %[n], 8\n\t" // t1 = n & 8 (one 8-column block) + "beqz t1, 10f\n\t" // if there is no 8-column block, jump to the n tail + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 8 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // t5 = start of the kernel row + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k / 8 (two 4-byte kernel words per iteration) + "beqz t4, 8f\n\t" // if t4 == 0, jump to the single remaining step + + // m1n8, two k4 steps per iteration + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m1n8, remaining k4 step + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k & 4 + "beqz t4, 9f\n\t" // if there is no remaining step, jump to end of m1n8 + + "vmaqa.vx v16, a0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // step past the block just consumed + + // end kernel_m1n8 + "9:\n\t" + // undo the extra advance made by the pre-load + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 32 bytes + + // post-processing (requantize to int8) + "li t6, 8\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16,
(a0)\n\t" + "addi t2, t2, 8\n\t" + + // m1n_tail + "10:\n\t" + "andi t1, %[n], 7\n\t" // t1 = n & 7 (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to the end of the kernel + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // t5 = start of the kernel row + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k / 8 (two 4-byte kernel words per iteration) + "beqz t4, 12f\n\t" // if t4 == 0, jump to the single remaining step + + // m1n_tail, two k4 steps per iteration + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m1n_tail, remaining k4 step + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k & 4 + "beqz t4, 13f\n\t" // if there is no remaining step, jump to end of m1n_tail + + "vmaqa.vx v16, a0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // step past the block just consumed + + // end kernel_m1n_tail + "13:\n\t" + // undo the extra advance made by the pre-load + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= 4 * n_tail bytes + + // post-processing (requantize to int8) + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs.
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t1", "t2", "t4", "t5", "t6"); +} + +/* + * Int8 GEMM driver: splits the m rows of sa into groups of 8, 4, 2 and 1 and hands each group + * to the matching kernel_m{8,4,2,1}n16_int8_v256 micro-kernel. + * After each group dst advances by rows * n, sa by rows * k and bias by rows; + * sb, out_zp, mult and shift are passed to every kernel unchanged. + * ldc is accepted but not read by this function. + */ +void shl_c908_gemm_8x16_int8_v256(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, + int m, int k, int n, int ldc, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + int8_t *kernel_ptr = (int8_t *)sa; + int8_t *input_ptr = (int8_t *)sb; + int8_t *output_ptr = dst; + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + int32_t *bias_ptr = bias; + + // NOTE(review): mult and shift are not advanced together with bias_ptr below; if they are + // per-row arrays the tail kernels presumably need the same offset - confirm against the kernels. + int tail = m % 8; + // m >= 8 (not m > 8): with m == 8 the tail is 0, so the m8 kernel is the only one that can run + if (m >= 8) { + kernel_m8n16_int8_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n16_int8_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n16_int8_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n16_int8_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } +} diff --git a/source/c908_opt/gemm_kernel/gemm_fp16_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_fp16_ncxhwx.S new file mode 100644 index 00000000..a71eb69a --- /dev/null +++ b/source/c908_opt/gemm_kernel/gemm_fp16_ncxhwx.S @@ -0,0 +1,1308 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void gemm_fp16_ncxhwx_12xpack2n(__fp16 *output, + const __fp16 *kernel, + const __fp16 *input, + const __fp16 *bias, + int m, // matrix A row + int k, // matrix A col / matrix B row + int n, // matrix B col + bool fuse_relu) + + Algorithm works as follows: + (1) perform matrix-multiplication [pack2n, k] x [k, n] = [pack2n, n] + ...
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr [NULL without bias] + a4: m [packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: fuse_relu [non-zero: clamp results at zero before storing] + + t0 = packn * 2 (bytes per packn fp16 line), steps the kernel / output / bias addrs + t1 = tmp variable + t2 = k2 input_channel dim loop count + t3 = kernel data addr + t4 = n12 + t5 = n_tail + t6 = next packn line output + + ft0-ft5: hold input data + fa0-fa5: hold input data + + v0: zero vector used by the fused relu + v1-v2: acc initial (bias or zero) + v3-v6: hold kernel data + v8-v19: first packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .file "gemm_fp16_ncxhwx.S" + .section .text.gemm_fp16_ncxhwx_12xpack2n, "ax", @progbits + .align 5 + .global gemm_fp16_ncxhwx_12xpack2n + .type gemm_fp16_ncxhwx_12xpack2n, @function + +gemm_fp16_ncxhwx_12xpack2n: + slli t0, a4, 1 // t0 = packn * 2 + vsetvli zero, a4, e16, m1 + + mul t1, t0, a6 // packn * n + add t6, a0, t1 // t6[out1_addr] = out0_addr + packn * n + + li t1, 12 + divw t4, a6, t1 // t4 = n12 + remw t5, a6, t1 // t5 = n % 12 (n_tail) + + // pack2n * n [init] + vmv.v.x v1, zero // clear acc + vmv.v.x v2, zero + // zero v0 here: the n8/n4/n2/n1 relu paths read it and are reached without pack2nx12_relu when n < 12 + vmv.v.x v0, zero + + beqz a3, non_bias1 + vle16.v v1, (a3) + add a3, a3, t0 // +packn + vle16.v v2, (a3) + +non_bias1: + beqz t4, pack2nx8_start // if n12==0, jump to pack2nx8 + +pack2nx12_start: + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + vmv.v.v v16, v1 + vmv.v.v v17, v1 + vmv.v.v v18, v1 + vmv.v.v v19, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + vmv.v.v v24, v2 + vmv.v.v v25, v2 + vmv.v.v v26, v2 + vmv.v.v v27, v2 + vmv.v.v v28, v2 + vmv.v.v v29, v2 + vmv.v.v v30, v2 + vmv.v.v v31, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + // pre-load input_data + flh ft0, 0(a2) + flh
ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + flh ft4, 8(a2) + flh ft5, 10(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx12_k1 + +pack2nx12_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flh fa5, 22(a2) + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + flh ft0, 24(a2) + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + flh ft1, 26(a2) + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + flh ft2, 28(a2) + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + flh ft3, 30(a2) + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + flh ft4, 32(a2) + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + flh ft5, 34(a2) + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flh fa0, 36(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flh fa1, 38(a2) + vfmacc.vf v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flh fa2, 40(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flh fa3, 42(a2) + vfmacc.vf v12, ft4, v5 + vfmacc.vf v24, ft4, v6 + flh fa4, 44(a2) + vfmacc.vf v13, ft5, v5 + vfmacc.vf v25, ft5, v6 + flh fa5, 46(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v5 + vfmacc.vf v26, fa0, v6 + flh ft0, 0(a2) + vfmacc.vf v15, fa1, v5 + vfmacc.vf v27, fa1, v6 + flh ft1, 2(a2) + vfmacc.vf v16, fa2, v5 + vfmacc.vf v28, fa2, v6 + flh ft2, 4(a2) + vfmacc.vf v17, fa3, v5 + vfmacc.vf v29, fa3, v6 + flh ft3, 6(a2) + vfmacc.vf v18, fa4, v5 + vfmacc.vf v30, fa4, v6 + flh ft4, 8(a2) + vfmacc.vf v19, fa5, v5 + vfmacc.vf v31, fa5, v6 + flh ft5, 10(a2) + + addi t2, t2, -1 + 
bnez t2, pack2nx12_k2 + +pack2nx12_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx12_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flh fa5, 22(a2) + addi a2, a2, 24 + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + +pack2nx12_relu: + beqz a7, pack2nx12_end + vmv.v.x v0, zero + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v16, v16, v0 + vfmax.vv v17, v17, v0 + vfmax.vv v18, v18, v0 + vfmax.vv v19, v19, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + vfmax.vv v24, v24, v0 + vfmax.vv v25, v25, v0 + vfmax.vv v26, v26, v0 + vfmax.vv v27, v27, v0 + vfmax.vv v28, v28, v0 + vfmax.vv v29, v29, v0 + vfmax.vv v30, v30, v0 + vfmax.vv v31, v31, v0 + +pack2nx12_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + vse16.v v12, (a0) + add a0, a0, t0 + vse16.v v13, (a0) + add a0, a0, t0 + vse16.v v14, (a0) + add a0, a0, t0 + vse16.v v15, (a0) + add a0, a0, t0 + vse16.v v16, (a0) + add a0, a0, t0 + vse16.v v17, (a0) + add a0, a0, t0 + vse16.v v18, (a0) + add a0, a0, t0 + vse16.v v19, (a0) + add a0, a0, t0 + + vse16.v v20, (t6) + add t6, t6, t0 + vse16.v v21, (t6) + add t6, t6, t0 + vse16.v v22, 
(t6) + add t6, t6, t0 + vse16.v v23, (t6) + add t6, t6, t0 + vse16.v v24, (t6) + add t6, t6, t0 + vse16.v v25, (t6) + add t6, t6, t0 + vse16.v v26, (t6) + add t6, t6, t0 + vse16.v v27, (t6) + add t6, t6, t0 + vse16.v v28, (t6) + add t6, t6, t0 + vse16.v v29, (t6) + add t6, t6, t0 + vse16.v v30, (t6) + add t6, t6, t0 + vse16.v v31, (t6) + add t6, t6, t0 + + addi t4, t4, -1 + bnez t4, pack2nx12_start + +pack2nx8_start: + andi t4, t5, 8 // s1 = bool_n8 + beqz t4, pack2nx4_start // if n8==0, jump to pack2nx4 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + vmv.v.v v24, v2 + vmv.v.v v25, v2 + vmv.v.v v26, v2 + vmv.v.v v27, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx8_k1 + +pack2nx8_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 14(a2) + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + flh ft0, 16(a2) + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + flh ft1, 18(a2) + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + flh ft2, 20(a2) + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + flh ft3, 22(a2) + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flh fa0, 24(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flh fa1, 26(a2) + vfmacc.vf v10, ft2, 
v5 + vfmacc.vf v22, ft2, v6 + flh fa2, 28(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flh fa3, 30(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v5 + vfmacc.vf v24, fa0, v6 + flh ft0, 0(a2) + vfmacc.vf v13, fa1, v5 + vfmacc.vf v25, fa1, v6 + flh ft1, 2(a2) + vfmacc.vf v14, fa2, v5 + vfmacc.vf v26, fa2, v6 + flh ft2, 4(a2) + vfmacc.vf v15, fa3, v5 + vfmacc.vf v27, fa3, v6 + flh ft3, 6(a2) + + addi t2, t2, -1 + bnez t2, pack2nx8_k2 + +pack2nx8_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx8_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 14(a2) + addi a2, a2, 16 + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + +pack2nx8_relu: + beqz a7, pack2nx8_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + vfmax.vv v24, v24, v0 + vfmax.vv v25, v25, v0 + vfmax.vv v26, v26, v0 + vfmax.vv v27, v27, v0 + +pack2nx8_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + vse16.v v12, (a0) + add a0, a0, t0 + vse16.v v13, (a0) + add a0, a0, t0 + vse16.v v14, (a0) + add a0, a0, t0 + vse16.v v15, (a0) + add a0, a0, t0 + + vse16.v v20, (t6) + add t6, t6, t0 + vse16.v v21, (t6) + add t6, t6, t0 + vse16.v v22, (t6) + add t6, t6, t0 + vse16.v v23, (t6) + add t6, t6, t0 + vse16.v v24, (t6) + add t6, t6, t0 + vse16.v v25, (t6) + add t6, t6, t0 + vse16.v v26, (t6) + add t6, t6, t0 + vse16.v 
v27, (t6) + add t6, t6, t0 + +pack2nx4_start: + andi t4, t5, 4 // s1 = bool_n4 + beqz t4, pack2nx2_start // if n4==0, jump to pack2nx2 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx4_k1 + +pack2nx4_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 14(a2) + addi a2, a2, 16 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flh ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + flh ft1, 2(a2) + vfmacc.vf v10, fa2, v5 + vfmacc.vf v22, fa2, v6 + flh ft2, 4(a2) + vfmacc.vf v11, fa3, v5 + vfmacc.vf v23, fa3, v6 + flh ft3, 6(a2) + + addi t2, t2, -1 + bnez t2, pack2nx4_k2 + +pack2nx4_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx4_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + addi a2, a2, 8 + +pack2nx4_relu: + beqz a7, pack2nx4_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + +pack2nx4_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + 
add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + + vse16.v v20, (t6) + add t6, t6, t0 + vse16.v v21, (t6) + add t6, t6, t0 + vse16.v v22, (t6) + add t6, t6, t0 + vse16.v v23, (t6) + add t6, t6, t0 + +pack2nx2_start: + andi t4, t5, 2 // s1 = bool_n2 + beqz t4, pack2nx1_start // if n2==0, jump to pack2nx1 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx2_k1 + +pack2nx2_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 4(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 6(a2) + addi a2, a2, 8 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flh ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + flh ft1, 2(a2) + + addi t2, t2, -1 + bnez t2, pack2nx2_k2 + +pack2nx2_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx2_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + addi a2, a2, 4 + +pack2nx2_relu: + beqz a7, pack2nx2_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + +pack2nx2_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + + vse16.v v20, (t6) + add t6, t6, t0 + vse16.v v21, (t6) + add t6, t6, t0 + +pack2nx1_start: + andi t4, t5, 1 // s1 = bool_n1 + beqz t4, pack2n_end // if n1==0, jump to end + + vmv.v.v v8, v1 + vmv.v.v v20, v2 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + + srai 
t2, a5, 1 // k2 + beqz t2, pack2nx1_k1 + +pack2nx1_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 2(a2) + addi a2, a2, 4 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flh ft0, 0(a2) + + addi t2, t2, -1 + bnez t2, pack2nx1_k2 + +pack2nx1_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx1_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + addi a2, a2, 2 + +pack2nx1_relu: + beqz a7, pack2nx1_end + vfmax.vv v8, v8, v0 + vfmax.vv v20, v20, v0 + +pack2nx1_end: + vse16.v v8, (a0) + vse16.v v20, (t6) + +pack2n_end: + ret + +/************************************************************************************************** + + void gemm_fp16_ncxhwx_12xpackn(__fp16 *output, + const __fp16 *kernel, + const __fp16 *input, + const __fp16 *bias, + int m, // matrix A row: packn or tail_packn + int k, // matrix A col / matrix B row + int n, // matrix B col + bool fuse_relu) + + Algorithm works as follows: + (1) perform matrix-multiplication [m, k] x [k, n] = [m, n] + m = packn or tail_packn + ...
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr [NULL without bias] + a4: m [packn or tail_packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: fuse_relu [non-zero: clamp results at zero before storing] + + t0 = packn * 2 (bytes per fp16 line of m elements), steps the kernel / output addrs + t1 = tmp variable + t2 = k2 input_channel dim loop count + t3 = kernel data addr + t4 = n12 + t5 = n_tail + t6 = unused + + ft0-ft5: hold input data + fa0-fa5: hold input data + + v0: zero vector used by the fused relu + v1: acc initial (bias or zero) + v3/v5: hold kernel data + v8-v19: packn line acc + + *************************************************************************************************/ + .section .text.gemm_fp16_ncxhwx_12xpackn, "ax", @progbits + .align 5 + .global gemm_fp16_ncxhwx_12xpackn + .type gemm_fp16_ncxhwx_12xpackn, @function + +gemm_fp16_ncxhwx_12xpackn: + slli t0, a4, 1 // t0 = packn * 2 + vsetvli zero, a4, e16, m1 + + li t1, 12 + divw t4, a6, t1 // t4 = n12 + remw t5, a6, t1 // t5 = n % 12 (n_tail) + + vmv.v.x v1, zero // clear acc + // zero v0 here: the n8/n4/n2/n1 relu paths read it and are reached without packnx12_relu when n < 12 + vmv.v.x v0, zero + + beqz a3, non_bias2 + vle16.v v1, (a3) + +non_bias2: + beqz t4, packnx8_start // if n12==0, jump to packnx8 + +packnx12_start: + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + vmv.v.v v16, v1 + vmv.v.v v17, v1 + vmv.v.v v18, v1 + vmv.v.v v19, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + flh ft4, 8(a2) + flh ft5, 10(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx12_k1 + +packnx12_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + flh fa5, 22(a2) + vfmacc.vf v14, fa0, v3 + flh ft0, 24(a2) + vfmacc.vf v15, fa1, v3 + flh
ft1, 26(a2) + vfmacc.vf v16, fa2, v3 + flh ft2, 28(a2) + vfmacc.vf v17, fa3, v3 + flh ft3, 30(a2) + vfmacc.vf v18, fa4, v3 + flh ft4, 32(a2) + vfmacc.vf v19, fa5, v3 + flh ft5, 34(a2) + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + flh fa0, 36(a2) + vfmacc.vf v9, ft1, v5 + flh fa1, 38(a2) + vfmacc.vf v10, ft2, v5 + flh fa2, 40(a2) + vfmacc.vf v11, ft3, v5 + flh fa3, 42(a2) + vfmacc.vf v12, ft4, v5 + flh fa4, 44(a2) + vfmacc.vf v13, ft5, v5 + flh fa5, 46(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v5 + flh ft0, 0(a2) + vfmacc.vf v15, fa1, v5 + flh ft1, 2(a2) + vfmacc.vf v16, fa2, v5 + flh ft2, 4(a2) + vfmacc.vf v17, fa3, v5 + flh ft3, 6(a2) + vfmacc.vf v18, fa4, v5 + flh ft4, 8(a2) + vfmacc.vf v19, fa5, v5 + flh ft5, 10(a2) + + addi t2, t2, -1 + bnez t2, packnx12_k2 + +packnx12_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx12_relu + + vfmacc.vf v8, ft0, v3 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + flh fa5, 22(a2) + addi a2, a2, 24 + vfmacc.vf v14, fa0, v3 + vfmacc.vf v15, fa1, v3 + vfmacc.vf v16, fa2, v3 + vfmacc.vf v17, fa3, v3 + vfmacc.vf v18, fa4, v3 + vfmacc.vf v19, fa5, v3 + +packnx12_relu: + beqz a7, packnx12_end + vmv.v.x v0, zero + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v16, v16, v0 + vfmax.vv v17, v17, v0 + vfmax.vv v18, v18, v0 + vfmax.vv v19, v19, v0 + +packnx12_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + vse16.v v12, (a0) + add a0, a0, t0 + vse16.v v13, (a0) + add a0, a0, t0 + vse16.v v14, (a0) + add a0, a0, t0 + vse16.v v15, (a0) + add a0, a0, t0 + vse16.v v16, (a0) + add a0, a0, t0 + vse16.v 
v17, (a0) + add a0, a0, t0 + vse16.v v18, (a0) + add a0, a0, t0 + vse16.v v19, (a0) + add a0, a0, t0 + + addi t4, t4, -1 + bnez t4, packnx12_start + +packnx8_start: + andi t4, t5, 8 // s1 = bool_n8 + beqz t4, packnx4_start // if n8==0, jump to packnx4 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx8_k1 + +packnx8_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 14(a2) + vfmacc.vf v12, fa0, v3 + flh ft0, 16(a2) + vfmacc.vf v13, fa1, v3 + flh ft1, 18(a2) + vfmacc.vf v14, fa2, v3 + flh ft2, 20(a2) + vfmacc.vf v15, fa3, v3 + flh ft3, 22(a2) + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + flh fa0, 24(a2) + vfmacc.vf v9, ft1, v5 + flh fa1, 26(a2) + vfmacc.vf v10, ft2, v5 + flh fa2, 28(a2) + vfmacc.vf v11, ft3, v5 + flh fa3, 30(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v5 + flh ft0, 0(a2) + vfmacc.vf v13, fa1, v5 + flh ft1, 2(a2) + vfmacc.vf v14, fa2, v5 + flh ft2, 4(a2) + vfmacc.vf v15, fa3, v5 + flh ft3, 6(a2) + + addi t2, t2, -1 + bnez t2, packnx8_k2 + +packnx8_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx8_relu + + vfmacc.vf v8, ft0, v3 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 14(a2) + addi a2, a2, 16 + vfmacc.vf v12, fa0, v3 + vfmacc.vf v13, fa1, v3 + vfmacc.vf v14, fa2, v3 + vfmacc.vf v15, fa3, v3 + +packnx8_relu: + beqz a7, packnx8_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, 
v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + +packnx8_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + vse16.v v12, (a0) + add a0, a0, t0 + vse16.v v13, (a0) + add a0, a0, t0 + vse16.v v14, (a0) + add a0, a0, t0 + vse16.v v15, (a0) + add a0, a0, t0 + +packnx4_start: + andi t4, t5, 4 // s1 = bool_n4 + beqz t4, packnx2_start // if n4==0, jump to packnx2 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx4_k1 + +packnx4_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 14(a2) + addi a2, a2, 16 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + flh ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flh ft1, 2(a2) + vfmacc.vf v10, fa2, v5 + flh ft2, 4(a2) + vfmacc.vf v11, fa3, v5 + flh ft3, 6(a2) + + addi t2, t2, -1 + bnez t2, packnx4_k2 + +packnx4_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx4_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v10, ft2, v3 + vfmacc.vf v11, ft3, v3 + addi a2, a2, 8 + +packnx4_relu: + beqz a7, packnx4_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + +packnx4_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + +packnx2_start: + andi t4, t5, 2 // s1 = bool_n2 + beqz t4, packnx1_start // if n2==0, jump to pack1nx1 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + 
vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx2_k1 + +packnx2_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 4(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 6(a2) + addi a2, a2, 8 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + flh ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flh ft1, 2(a2) + + addi t2, t2, -1 + bnez t2, packnx2_k2 + +packnx2_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx2_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v9, ft1, v3 + addi a2, a2, 4 + +packnx2_relu: + beqz a7, packnx2_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + +packnx2_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + +packnx1_start: + andi t4, t5, 1 // s1 = bool_n1 + beqz t4, packn_end // if n1==0, jump to end + + vmv.v.v v8, v1 + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx1_k1 + +packnx1_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, ft0, v3 + flh fa0, 2(a2) + addi a2, a2, 4 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, fa0, v5 + flh ft0, 0(a2) + + addi t2, t2, -1 + bnez t2, packnx1_k2 + +packnx1_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx1_relu + + vfmacc.vf v8, ft0, v3 + addi a2, a2, 2 + +packnx1_relu: + beqz a7, packnx1_end + vfmax.vv v8, v8, v0 + +packnx1_end: + vse16.v v8, (a0) + +packn_end: + ret + .end diff --git a/source/c908_opt/gemm_kernel/gemm_fp32_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_fp32_ncxhwx.S new file mode 100644 index 00000000..d09f8f86 --- /dev/null +++ b/source/c908_opt/gemm_kernel/gemm_fp32_ncxhwx.S @@ -0,0 +1,1309 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void gemm_fp32_ncxhwx_12xpack2n(float *output, + const float *kernel, + const float *input, + const float *bias, + int m, // matrix A row + int k, // matrix A col / matrix B row + int n, // matrix B col + bool fuse_relu) + + Algorithm works as follows: + (1) perform matrix-multiplication [pack2n, k] x [k, n] = [pack2n, n] + ...
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr [NULL without bias] + a4: m [packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: fuse_relu + + t0 = packn * 4 maintenance kernel_addr + t1 = tmp variable + t2 = k2 input_channel dim loop count + t3 = kernel data addr + t4 = n12 + t5 = n_tail + t6 = next packn line output + + ft0-ft5: hold input data + fa0-fa5: hold input data + + v0: zero operand of the fused relu, v1-v2: acc initial (bias or zero) + v3-v6: hold kernel data + v8-v19: first packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .file "gemm_fp32_ncxhwx.S" + .section .text.gemm_fp32_ncxhwx_12xpack2n, "ax", @progbits + .align 5 + .global gemm_fp32_ncxhwx_12xpack2n + .type gemm_fp32_ncxhwx_12xpack2n, @function + +gemm_fp32_ncxhwx_12xpack2n: + slli t0, a4, 2 // t0 = packn * 4 + vsetvli zero, a4, e32, m1 + + mul t1, t0, a6 // packn * n + add t6, a0, t1 // t6[out1_addr] = out0_addr + packn * n + + li t1, 12 + divw t4, a6, t1 // t4 = n12 + remw t5, a6, t1 // t5 = n % 12 (n_tail) + + vmv.v.x v0, zero // relu zero operand: set once here so the n8/n4/n2/n1 tails never clamp against a stale v0 when n12 == 0 + vmv.v.x v1, zero // clear acc + vmv.v.x v2, zero + // pack2n * n [init] + beqz a3, non_bias1 + vle32.v v1, (a3) + add a3, a3, t0 // +packn + vle32.v v2, (a3) + +non_bias1: + + beqz t4, pack2nx8_start // if n12==0, jump to pack2nx8 + +pack2nx12_start: + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + vmv.v.v v16, v1 + vmv.v.v v17, v1 + vmv.v.v v18, v1 + vmv.v.v v19, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + vmv.v.v v24, v2 + vmv.v.v v25, v2 + vmv.v.v v26, v2 + vmv.v.v v27, v2 + vmv.v.v v28, v2 + vmv.v.v v29, v2 + vmv.v.v v30, v2 + vmv.v.v v31, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + // pre-load input_data + flw ft0, 0(a2) + flw
ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + flw ft4, 16(a2) + flw ft5, 20(a2) + + srai t2, a5, 1 // k2 = k / 2: each loop pass consumes two k steps + beqz t2, pack2nx12_k1 + +pack2nx12_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flw fa5, 44(a2) + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + flw ft0, 48(a2) + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + flw ft1, 52(a2) + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + flw ft2, 56(a2) + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + flw ft3, 60(a2) + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + flw ft4, 64(a2) + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + flw ft5, 68(a2) + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flw fa0, 72(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flw fa1, 76(a2) + vfmacc.vf v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flw fa2, 80(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flw fa3, 84(a2) + vfmacc.vf v12, ft4, v5 + vfmacc.vf v24, ft4, v6 + flw fa4, 88(a2) + vfmacc.vf v13, ft5, v5 + vfmacc.vf v25, ft5, v6 + flw fa5, 92(a2) + addi a2, a2, 96 + vfmacc.vf v14, fa0, v5 + vfmacc.vf v26, fa0, v6 + flw ft0, 0(a2) + vfmacc.vf v15, fa1, v5 + vfmacc.vf v27, fa1, v6 + flw ft1, 4(a2) + vfmacc.vf v16, fa2, v5 + vfmacc.vf v28, fa2, v6 + flw ft2, 8(a2) + vfmacc.vf v17, fa3, v5 + vfmacc.vf v29, fa3, v6 + flw ft3, 12(a2) + vfmacc.vf v18, fa4, v5 + vfmacc.vf v30, fa4, v6 + flw ft4, 16(a2) + vfmacc.vf v19, fa5, v5 + vfmacc.vf v31, fa5, v6 + flw ft5, 20(a2) + + addi t2, t2,
-1 + bnez t2, pack2nx12_k2 + +pack2nx12_k1: + andi t2, a5, 1 // k1 = k % 2: one leftover k step + beqz t2, pack2nx12_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flw fa5, 44(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + +pack2nx12_relu: + beqz a7, pack2nx12_end // fuse_relu off: store unclamped; v0 is already zero from the prologue + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v16, v16, v0 + vfmax.vv v17, v17, v0 + vfmax.vv v18, v18, v0 + vfmax.vv v19, v19, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + vfmax.vv v24, v24, v0 + vfmax.vv v25, v25, v0 + vfmax.vv v26, v26, v0 + vfmax.vv v27, v27, v0 + vfmax.vv v28, v28, v0 + vfmax.vv v29, v29, v0 + vfmax.vv v30, v30, v0 + vfmax.vv v31, v31, v0 + +pack2nx12_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + vse32.v v12, (a0) + add a0, a0, t0 + vse32.v v13, (a0) + add a0, a0, t0 + vse32.v v14, (a0) + add a0, a0, t0 + vse32.v v15, (a0) + add a0, a0, t0 + vse32.v v16, (a0) + add a0, a0, t0 + vse32.v v17, (a0) + add a0, a0, t0 + vse32.v v18, (a0) + add a0, a0, t0 + vse32.v v19, (a0) + add a0, a0, t0 + + vse32.v v20, (t6) + add t6, t6, t0 + vse32.v v21, (t6) + add t6, t6, t0 + vse32.v
v22, (t6) + add t6, t6, t0 + vse32.v v23, (t6) + add t6, t6, t0 + vse32.v v24, (t6) + add t6, t6, t0 + vse32.v v25, (t6) + add t6, t6, t0 + vse32.v v26, (t6) + add t6, t6, t0 + vse32.v v27, (t6) + add t6, t6, t0 + vse32.v v28, (t6) + add t6, t6, t0 + vse32.v v29, (t6) + add t6, t6, t0 + vse32.v v30, (t6) + add t6, t6, t0 + vse32.v v31, (t6) + add t6, t6, t0 + + addi t4, t4, -1 + bnez t4, pack2nx12_start // next 12-column tile + +pack2nx8_start: + andi t4, t5, 8 // t4 = bool_n8 + beqz t4, pack2nx4_start // if n8==0, jump to pack2nx4 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + vmv.v.v v24, v2 + vmv.v.v v25, v2 + vmv.v.v v26, v2 + vmv.v.v v27, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx8_k1 + +pack2nx8_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 28(a2) + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + flw ft0, 32(a2) + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + flw ft1, 36(a2) + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + flw ft2, 40(a2) + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + flw ft3, 44(a2) + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flw fa0, 48(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flw fa1, 52(a2) + vfmacc.vf
v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flw fa2, 56(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flw fa3, 60(a2) + addi a2, a2, 64 + vfmacc.vf v12, fa0, v5 + vfmacc.vf v24, fa0, v6 + flw ft0, 0(a2) + vfmacc.vf v13, fa1, v5 + vfmacc.vf v25, fa1, v6 + flw ft1, 4(a2) + vfmacc.vf v14, fa2, v5 + vfmacc.vf v26, fa2, v6 + flw ft2, 8(a2) + vfmacc.vf v15, fa3, v5 + vfmacc.vf v27, fa3, v6 + flw ft3, 12(a2) + + addi t2, t2, -1 + bnez t2, pack2nx8_k2 + +pack2nx8_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx8_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 28(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + +pack2nx8_relu: + beqz a7, pack2nx8_end // v0 == 0 from the prologue, also when the 12-column loop never ran + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + vfmax.vv v24, v24, v0 + vfmax.vv v25, v25, v0 + vfmax.vv v26, v26, v0 + vfmax.vv v27, v27, v0 + +pack2nx8_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + vse32.v v12, (a0) + add a0, a0, t0 + vse32.v v13, (a0) + add a0, a0, t0 + vse32.v v14, (a0) + add a0, a0, t0 + vse32.v v15, (a0) + add a0, a0, t0 + + vse32.v v20, (t6) + add t6, t6, t0 + vse32.v v21, (t6) + add t6, t6, t0 + vse32.v v22, (t6) + add t6, t6, t0 + vse32.v v23, (t6) + add t6, t6, t0 + vse32.v v24, (t6) + add t6, t6, t0 + vse32.v v25, (t6) + add t6, t6, t0 + vse32.v v26, (t6) + add t6, t6,
t0 + vse32.v v27, (t6) + add t6, t6, t0 + +pack2nx4_start: + andi t4, t5, 4 // t4 = bool_n4 + beqz t4, pack2nx2_start // if n4==0, jump to pack2nx2 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx4_k1 + +pack2nx4_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 28(a2) + addi a2, a2, 32 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + flw ft1, 4(a2) + vfmacc.vf v10, fa2, v5 + vfmacc.vf v22, fa2, v6 + flw ft2, 8(a2) + vfmacc.vf v11, fa3, v5 + vfmacc.vf v23, fa3, v6 + flw ft3, 12(a2) + + addi t2, t2, -1 + bnez t2, pack2nx4_k2 + +pack2nx4_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx4_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + addi a2, a2, 16 + +pack2nx4_relu: + beqz a7, pack2nx4_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + +pack2nx4_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 +
vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + + vse32.v v20, (t6) + add t6, t6, t0 + vse32.v v21, (t6) + add t6, t6, t0 + vse32.v v22, (t6) + add t6, t6, t0 + vse32.v v23, (t6) + add t6, t6, t0 + +pack2nx2_start: + andi t4, t5, 2 // t4 = bool_n2 + beqz t4, pack2nx1_start // if n2==0, jump to pack2nx1 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx2_k1 + +pack2nx2_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 12(a2) + addi a2, a2, 16 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + flw ft1, 4(a2) + + addi t2, t2, -1 + bnez t2, pack2nx2_k2 + +pack2nx2_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx2_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + addi a2, a2, 8 + +pack2nx2_relu: + beqz a7, pack2nx2_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + +pack2nx2_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + + vse32.v v20, (t6) + add t6, t6, t0 + vse32.v v21, (t6) + add t6, t6, t0 + +pack2nx1_start: + andi t4, t5, 1 // t4 = bool_n1 + beqz t4, pack2n_end // if n1==0, jump to end + + vmv.v.v v8, v1 + vmv.v.v v20, v2 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw
ft0, 0(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx1_k1 + +pack2nx1_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 4(a2) + addi a2, a2, 8 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flw ft0, 0(a2) + + addi t2, t2, -1 + bnez t2, pack2nx1_k2 + +pack2nx1_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx1_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + addi a2, a2, 4 + +pack2nx1_relu: + beqz a7, pack2nx1_end + vfmax.vv v8, v8, v0 + vfmax.vv v20, v20, v0 + +pack2nx1_end: + vse32.v v8, (a0) + vse32.v v20, (t6) + +pack2n_end: + ret + +/************************************************************************************************** + + void gemm_fp32_ncxhwx_12xpackn(const float *output, + const float *kernel, + const float *input, + const float *bias, + int m, // matrix A row + int k, // matrix A col / matrix B row + int n, // matrix B col + bool fuse_relu) + + Algorithm works as follows: + (1) perform matrix-multiplication [m, k] x [k, n] = [m, n] + m = packn or tail_packn + ...
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr [NULL without bias] + a4: m [packn or tail_packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: fuse_relu + + t0 = packn * 4 maintenance kernel_addr + t1 = tmp variable + t2 = k2 input_channel dim loop count + t3 = kernel data addr + t4 = n12 + t5 = n_tail + t6 = unused + + ft0-ft5: hold input data + fa0-fa5: hold input data + + v0: zero operand of the fused relu, v1: acc initial (bias or zero) + v3/v5: hold kernel data + v8-v19: packn line acc + + *************************************************************************************************/ + .section .text.gemm_fp32_ncxhwx_12xpackn, "ax", @progbits + .align 5 + .global gemm_fp32_ncxhwx_12xpackn + .type gemm_fp32_ncxhwx_12xpackn, @function + +gemm_fp32_ncxhwx_12xpackn: + slli t0, a4, 2 // t0 = packn * 4 + vsetvli zero, a4, e32, m1 + + li t1, 12 + divw t4, a6, t1 // t4 = n12 + remw t5, a6, t1 // t5 = n % 12 (n_tail) + + vmv.v.x v0, zero // relu zero operand: set once here so the n8/n4/n2/n1 tails never clamp against a stale v0 when n12 == 0 + vmv.v.x v1, zero // clear acc + + beqz a3, non_bias2 + vle32.v v1, (a3) + +non_bias2: + beqz t4, packnx8_start // if n12==0, jump to packnx8 + +packnx12_start: + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + vmv.v.v v16, v1 + vmv.v.v v17, v1 + vmv.v.v v18, v1 + vmv.v.v v19, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + flw ft4, 16(a2) + flw ft5, 20(a2) + + srai t2, a5, 1 // k2 = k / 2: each loop pass consumes two k steps + beqz t2, packnx12_k1 + +packnx12_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + flw fa5, 44(a2) + vfmacc.vf v14, fa0, v3 + flw ft0, 48(a2) + vfmacc.vf v15, fa1, v3 + flw
ft1, 52(a2) + vfmacc.vf v16, fa2, v3 + flw ft2, 56(a2) + vfmacc.vf v17, fa3, v3 + flw ft3, 60(a2) + vfmacc.vf v18, fa4, v3 + flw ft4, 64(a2) + vfmacc.vf v19, fa5, v3 + flw ft5, 68(a2) + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + flw fa0, 72(a2) + vfmacc.vf v9, ft1, v5 + flw fa1, 76(a2) + vfmacc.vf v10, ft2, v5 + flw fa2, 80(a2) + vfmacc.vf v11, ft3, v5 + flw fa3, 84(a2) + vfmacc.vf v12, ft4, v5 + flw fa4, 88(a2) + vfmacc.vf v13, ft5, v5 + flw fa5, 92(a2) + addi a2, a2, 96 + vfmacc.vf v14, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v15, fa1, v5 + flw ft1, 4(a2) + vfmacc.vf v16, fa2, v5 + flw ft2, 8(a2) + vfmacc.vf v17, fa3, v5 + flw ft3, 12(a2) + vfmacc.vf v18, fa4, v5 + flw ft4, 16(a2) + vfmacc.vf v19, fa5, v5 + flw ft5, 20(a2) + + addi t2, t2, -1 + bnez t2, packnx12_k2 + +packnx12_k1: + andi t2, a5, 1 // k1 = k % 2: one leftover k step + beqz t2, packnx12_relu + + vfmacc.vf v8, ft0, v3 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + flw fa5, 44(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v3 + vfmacc.vf v15, fa1, v3 + vfmacc.vf v16, fa2, v3 + vfmacc.vf v17, fa3, v3 + vfmacc.vf v18, fa4, v3 + vfmacc.vf v19, fa5, v3 + +packnx12_relu: + beqz a7, packnx12_end // fuse_relu off: store unclamped; v0 is already zero from the prologue + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v16, v16, v0 + vfmax.vv v17, v17, v0 + vfmax.vv v18, v18, v0 + vfmax.vv v19, v19, v0 + +packnx12_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + vse32.v v12, (a0) + add a0, a0, t0 + vse32.v v13, (a0) + add a0, a0, t0 + vse32.v v14, (a0) + add a0, a0, t0 + vse32.v v15, (a0) + add a0, a0, t0 + vse32.v v16, (a0) + add a0, a0, t0 + vse32.v
v17, (a0) + add a0, a0, t0 + vse32.v v18, (a0) + add a0, a0, t0 + vse32.v v19, (a0) + add a0, a0, t0 + + addi t4, t4, -1 + bnez t4, packnx12_start // next 12-column tile + +packnx8_start: + andi t4, t5, 8 // t4 = bool_n8 + beqz t4, packnx4_start // if n8==0, jump to packnx4 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx8_k1 + +packnx8_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 28(a2) + vfmacc.vf v12, fa0, v3 + flw ft0, 32(a2) + vfmacc.vf v13, fa1, v3 + flw ft1, 36(a2) + vfmacc.vf v14, fa2, v3 + flw ft2, 40(a2) + vfmacc.vf v15, fa3, v3 + flw ft3, 44(a2) + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + flw fa0, 48(a2) + vfmacc.vf v9, ft1, v5 + flw fa1, 52(a2) + vfmacc.vf v10, ft2, v5 + flw fa2, 56(a2) + vfmacc.vf v11, ft3, v5 + flw fa3, 60(a2) + addi a2, a2, 64 + vfmacc.vf v12, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v13, fa1, v5 + flw ft1, 4(a2) + vfmacc.vf v14, fa2, v5 + flw ft2, 8(a2) + vfmacc.vf v15, fa3, v5 + flw ft3, 12(a2) + + addi t2, t2, -1 + bnez t2, packnx8_k2 + +packnx8_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx8_relu + + vfmacc.vf v8, ft0, v3 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 28(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v3 + vfmacc.vf v13, fa1, v3 + vfmacc.vf v14, fa2, v3 + vfmacc.vf v15, fa3, v3 + +packnx8_relu: + beqz a7, packnx8_end // v0 == 0 from the prologue, also when the 12-column loop never ran + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12,
v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + +packnx8_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + vse32.v v12, (a0) + add a0, a0, t0 + vse32.v v13, (a0) + add a0, a0, t0 + vse32.v v14, (a0) + add a0, a0, t0 + vse32.v v15, (a0) + add a0, a0, t0 + +packnx4_start: + andi t4, t5, 4 // t4 = bool_n4 + beqz t4, packnx2_start // if n4==0, jump to packnx2 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx4_k1 + +packnx4_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 28(a2) + addi a2, a2, 32 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flw ft1, 4(a2) + vfmacc.vf v10, fa2, v5 + flw ft2, 8(a2) + vfmacc.vf v11, fa3, v5 + flw ft3, 12(a2) + + addi t2, t2, -1 + bnez t2, packnx4_k2 + +packnx4_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx4_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v10, ft2, v3 + vfmacc.vf v11, ft3, v3 + addi a2, a2, 16 + +packnx4_relu: + beqz a7, packnx4_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + +packnx4_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + +packnx2_start: + andi t4, t5, 2 // t4 = bool_n2 + beqz t4, packnx1_start // if n2==0, jump to packnx1 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + + mv t3, a1 // kernel origin addr + // pre-load
kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx2_k1 + +packnx2_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 12(a2) + addi a2, a2, 16 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flw ft1, 4(a2) + + addi t2, t2, -1 + bnez t2, packnx2_k2 + +packnx2_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx2_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v9, ft1, v3 + addi a2, a2, 8 + +packnx2_relu: + beqz a7, packnx2_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + +packnx2_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + +packnx1_start: + andi t4, t5, 1 // t4 = bool_n1 + beqz t4, packn_end // if n1==0, jump to end + + vmv.v.v v8, v1 + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx1_k1 + +packnx1_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, ft0, v3 + flw fa0, 4(a2) + addi a2, a2, 8 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, fa0, v5 + flw ft0, 0(a2) + + addi t2, t2, -1 + bnez t2, packnx1_k2 + +packnx1_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx1_relu + + vfmacc.vf v8, ft0, v3 + addi a2, a2, 4 + +packnx1_relu: + beqz a7, packnx1_end + vfmax.vv v8, v8, v0 + +packnx1_end: + vse32.v v8, (a0) + +packn_end: + ret + .end diff --git a/source/c908_opt/gemm_kernel/gemm_int16_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_int16_ncxhwx.S new file mode 100644 index 00000000..c1d43dda --- /dev/null +++ b/source/c908_opt/gemm_kernel/gemm_int16_ncxhwx.S @@ -0,0 +1,452 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void gemm_int16_ncxhwx_12xpackn(const int32_t *output, + const int16_t *kernel, + const int16_t *input, + int k, // maxtrix A col / maxtrix B row + int n) // maxtrix B col + + Algorithm works as follows: + (1) perform matrix-multiplication [packn, k] x [k, n] = [packn, n] + (2) for int8 winograd + ... 
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: k [in_ch] + a4: n [tile] + + a5 = hold kernel data addr + t0 = packn * 2: kernel_addr stride + t5 = packn * 4: output_addr stride + t6 = k2 loop cnt + a6 = n12 + a7 = n_tail + + t1-t4: hold input data + s1-s4: hold input data + + v2/v4: hold kernel data + v8-v31: acc v-reg + + *************************************************************************************************/ + .file "gemm_int16_ncxhwx.S" + .section .text.gemm_int16_ncxhwx_12xpackn, "ax", @progbits + .align 5 + .global gemm_int16_ncxhwx_12xpackn + .type gemm_int16_ncxhwx_12xpackn, @function + +gemm_int16_ncxhwx_12xpackn: + addi sp, sp, -32 + sd s1, 0(sp) + sd s2, 8(sp) + sd s3, 16(sp) + sd s4, 24(sp) + + li t0, 12 + divw a6, a4, t0 // a6 = n12 + remw a7, a4, t0 // a7 = n % 12 (n_tail) + + csrr t0, vlenb // t0 = vlen/8 = packn/2 * 4 = 16 + slli t5, t0, 1 // packn * 4 = 32 + + beqz a6, packnx8_start // if n12==0, jump to packnx8 + +packnx12_start: + vsetvli zero, t0, e16, m1 + vmv.v.x v8, zero + vmv.v.x v9, zero + vmv.v.x v10, zero + vmv.v.x v11, zero + vmv.v.x v12, zero + vmv.v.x v13, zero + vmv.v.x v14, zero + vmv.v.x v15, zero + vmv.v.x v16, zero + vmv.v.x v17, zero + vmv.v.x v18, zero + vmv.v.x v19, zero + vmv.v.x v20, zero + vmv.v.x v21, zero + vmv.v.x v22, zero + vmv.v.x v23, zero + vmv.v.x v24, zero + vmv.v.x v25, zero + vmv.v.x v26, zero + vmv.v.x v27, zero + vmv.v.x v28, zero + vmv.v.x v29, zero + vmv.v.x v30, zero + vmv.v.x v31, zero + + mv a5, a1 // kernel origin addr + // pre-load kernel matrix + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + // pre-load input matrix + lwd t1, t3, 0(a2) + srli t2, t1, 16 + srli t4, t3, 16 + + srai t6, a3, 1 // k2 + +packnx12_k2: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v2 + vwmacc.vx v12, t3, v2 + lwd s1, s3, 8(a2) + vwmacc.vx v10, t2, v2 + srli s2, s1, 16 + vwmacc.vx v14, t4, v2 + srli s4, s3, 16 + vwmacc.vx v16, 
s1, v2 + vwmacc.vx v20, s3, v2 + lwd t1, t3, 16(a2) + addi a2, a2, 24 + vwmacc.vx v18, s2, v2 + srli t2, t1, 16 + vwmacc.vx v22, s4, v2 + srli t4, t3, 16 + vwmacc.vx v24, t1, v2 + vwmacc.vx v28, t3, v2 + lwd s1, s3, 0(a2) + vwmacc.vx v26, t2, v2 + srli s2, s1, 16 + vwmacc.vx v30, t4, v2 + srli s4, s3, 16 + + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, s1, v4 + vwmacc.vx v12, s3, v4 + lwd t1, t3, 8(a2) + vwmacc.vx v10, s2, v4 + srli t2, t1, 16 + vwmacc.vx v14, s4, v4 + srli t4, t3, 16 + vwmacc.vx v16, t1, v4 + vwmacc.vx v20, t3, v4 + lwd s1, s3, 16(a2) + addi a2, a2, 24 + vwmacc.vx v18, t2, v4 + srli s2, s1, 16 + vwmacc.vx v22, t4, v4 + srli s4, s3, 16 + vwmacc.vx v24, s1, v4 + vwmacc.vx v28, s3, v4 + lwd t1, t3, 0(a2) + vwmacc.vx v26, s2, v4 + srli t2, t1, 16 + vwmacc.vx v30, s4, v4 + srli t4, t3, 16 + + addi t6, t6, -1 + bnez t6, packnx12_k2 + +packnx12_end: + vsetvli zero, zero, e32, m2 + vse32.v v8, (a0) + add a0, a0, t5 + vse32.v v10, (a0) + add a0, a0, t5 + vse32.v v12, (a0) + add a0, a0, t5 + vse32.v v14, (a0) + add a0, a0, t5 + vse32.v v16, (a0) + add a0, a0, t5 + vse32.v v18, (a0) + add a0, a0, t5 + vse32.v v20, (a0) + add a0, a0, t5 + vse32.v v22, (a0) + add a0, a0, t5 + vse32.v v24, (a0) + add a0, a0, t5 + vse32.v v26, (a0) + add a0, a0, t5 + vse32.v v28, (a0) + add a0, a0, t5 + vse32.v v30, (a0) + add a0, a0, t5 + + addi a6, a6, -1 + bnez a6, packnx12_start + +packnx8_start: + andi a6, a7, 8 // s1 = bool_n8 + beqz a6, packnx4_start // if n8==0, jump to packnx4 + + vsetvli zero, t0, e16, m1 + vmv.v.x v8, zero + vmv.v.x v9, zero + vmv.v.x v10, zero + vmv.v.x v11, zero + vmv.v.x v12, zero + vmv.v.x v13, zero + vmv.v.x v14, zero + vmv.v.x v15, zero + vmv.v.x v16, zero + vmv.v.x v17, zero + vmv.v.x v18, zero + vmv.v.x v19, zero + vmv.v.x v20, zero + vmv.v.x v21, zero + vmv.v.x v22, zero + vmv.v.x v23, zero + + mv a5, a1 // kernel origin addr + // pre-load kernel matrix + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += 
packn * 2 + // pre-load input matrix + lwd t1, t3, 0(a2) + srli t2, t1, 16 + srli t4, t3, 16 + + srai t6, a3, 1 // k2 + +packnx8_k2: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v2 + vwmacc.vx v12, t3, v2 + lwd s1, s3, 8(a2) + vwmacc.vx v10, t2, v2 + srli s2, s1, 16 + vwmacc.vx v14, t4, v2 + srli s4, s3, 16 + vwmacc.vx v16, s1, v2 + vwmacc.vx v20, s3, v2 + lwd t1, t3, 16(a2) + vwmacc.vx v18, s2, v2 + srli t2, t1, 16 + vwmacc.vx v22, s4, v2 + srli t4, t3, 16 + + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v4 + vwmacc.vx v12, t3, v4 + lwd s1, s3, 24(a2) + addi a2, a2, 32 + vwmacc.vx v10, t2, v4 + srli s2, s1, 16 + vwmacc.vx v14, t4, v4 + srli s4, s3, 16 + vwmacc.vx v16, s1, v4 + vwmacc.vx v20, s3, v4 + lwd t1, t3, 0(a2) + vwmacc.vx v18, s2, v4 + srli t2, t1, 16 + vwmacc.vx v22, s4, v4 + srli t4, t3, 16 + + addi t6, t6, -1 + bnez t6, packnx8_k2 + +packnx8_end: + vsetvli zero, zero, e32, m2 + vse32.v v8, (a0) + add a0, a0, t5 + vse32.v v10, (a0) + add a0, a0, t5 + vse32.v v12, (a0) + add a0, a0, t5 + vse32.v v14, (a0) + add a0, a0, t5 + vse32.v v16, (a0) + add a0, a0, t5 + vse32.v v18, (a0) + add a0, a0, t5 + vse32.v v20, (a0) + add a0, a0, t5 + vse32.v v22, (a0) + add a0, a0, t5 + +packnx4_start: + andi a6, a7, 4 // s1 = bool_n4 + beqz a6, packnx2_start // if n4==0, jump to packnx2 + + vsetvli zero, t0, e16, m1 + vmv.v.x v8, zero + vmv.v.x v9, zero + vmv.v.x v10, zero + vmv.v.x v11, zero + vmv.v.x v12, zero + vmv.v.x v13, zero + vmv.v.x v14, zero + vmv.v.x v15, zero + + mv a5, a1 // kernel origin addr + // pre-load kernel matrix + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + // pre-load input matrix + lwd t1, t3, 0(a2) + srli t2, t1, 16 + srli t4, t3, 16 + + srai t6, a3, 1 // k2 + +packnx4_k2: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v2 + lwd s1, s3, 8(a2) + vwmacc.vx v12, t3, v2 + srli s2, s1, 16 + vwmacc.vx v10, t2, v2 + srli 
s4, s3, 16 + vwmacc.vx v14, t4, v2 + addi a2, a2, 16 + + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, s1, v4 + lwd t1, t3, 0(a2) + vwmacc.vx v12, s3, v4 + srli t2, t1, 16 + vwmacc.vx v10, s2, v4 + srli t4, t3, 16 + vwmacc.vx v14, s4, v4 + + addi t6, t6, -1 + bnez t6, packnx4_k2 + +packnx4_end: + vsetvli zero, zero, e32, m2 + vse32.v v8, (a0) + add a0, a0, t5 + vse32.v v10, (a0) + add a0, a0, t5 + vse32.v v12, (a0) + add a0, a0, t5 + vse32.v v14, (a0) + add a0, a0, t5 + +packnx2_start: + andi a6, a7, 2 // s1 = bool_n2 + beqz a6, packnx1_start // if n2==0, jump to packnx1 + + vsetvli zero, t0, e16, m1 + vmv.v.x v8, zero + vmv.v.x v9, zero + vmv.v.x v10, zero + vmv.v.x v11, zero + + mv a5, a1 // kernel origin addr + // pre-load kernel matrix + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + // pre-load input matrix + lh t1, 0(a2) + lh t2, 2(a2) + + srai t6, a3, 1 // k2 + +packnx2_k2: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v2 + lh s1, 4(a2) + vwmacc.vx v10, t2, v2 + lh s2, 6(a2) + addi a2, a2, 8 + + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, s1, v4 + lh t1, 0(a2) + vwmacc.vx v10, s2, v4 + lh t2, 2(a2) + + addi t6, t6, -1 + bnez t6, packnx2_k2 + +packnx2_end: + vsetvli zero, zero, e32, m2 + vse32.v v8, (a0) + add a0, a0, t5 + vse32.v v10, (a0) + add a0, a0, t5 + +packnx1_start: + andi a6, a7, 1 // s1 = bool_n1 + beqz a6, packn_end // if n1==0, jump to packn_end + + vsetvli zero, t0, e16, m1 + vmv.v.x v8, zero + vmv.v.x v9, zero + + mv a5, a1 // kernel origin addr + // pre-load kernel matrix + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + // pre-load input matrix + lh t1, 0(a2) + + srai t6, a3, 1 // k2 + +packnx1_k2: + vle16.v v4, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, t1, v2 + lh s1, 2(a2) + addi a2, a2, 4 + + vle16.v v2, (a5) + add a5, a5, t0 // kernel_ptr += packn * 2 + + vwmacc.vx v8, s1, v4 + 
lh t1, 0(a2) + + addi t6, t6, -1 + bnez t6, packnx1_k2 + +packnx1_end: + vsetvli zero, zero, e32, m2 + vse32.v v8, (a0) + add a0, a0, t5 + +packn_end: + ld s1, 0(sp) + ld s2, 8(sp) + ld s3, 16(sp) + ld s4, 24(sp) + addi sp, sp, 32 + + ret + .end diff --git a/source/c908_opt/gemm_kernel/gemm_int4_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_int4_ncxhwx.S new file mode 100644 index 00000000..76ec011e --- /dev/null +++ b/source/c908_opt/gemm_kernel/gemm_int4_ncxhwx.S @@ -0,0 +1,870 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void gemm_int4_ncxhwx_12xpackn(const int8_t *output, + const int8_t *kernel, + const int8_t *input, + const int32_t *bias, + int k, // matrix A col / matrix B row + int n, // matrix B col + int32_t out_zp, + int32_t *mult, + int32_t *shift) + + Algorithm works as follows: + (1) perform matrix-multiplication [packn, k] x [k, n] = [packn, n] + ... 
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr + a4: k [kernel_size] + a5: n [out_hw] + a6: out_zp + a7: mult addr + s0: shift addr + + t0 = packn/2 * 4 maintenance kernel_addr + s7 = packn [vl of the e32/m2 accumulators] + s8 = k8(k2) input_channel dim loop count + s9 = kernel data addr [also tmp divisor 12 in the prologue] + s10 = n12 + s11 = n_tail + + t1-t6: hold input data + s1-s6: hold input data + + v2-v3: acc initial = bias + v4-v7: hold kernel data + v8-v19: first packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .file "gemm_int4_ncxhwx.S" + .section .text.gemm_int4_ncxhwx_12xpackn, "ax", @progbits + .align 5 + .global gemm_int4_ncxhwx_12xpackn + .type gemm_int4_ncxhwx_12xpackn, @function + +.macro GEMM_INT4_NCXHWX_REQUANTIZE v_dst + vsetvli zero, s7, e32, m2 + vmulh.vv \v_dst, \v_dst, v4 // * mult + vssra.vv \v_dst, \v_dst, v6 // shift + vadd.vx \v_dst, \v_dst, a6 // + out_zp + vsetvli zero, s7, e16, m1 + vnclip.wi v0, \v_dst, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v1, v0, 0 + vsetvli zero, s8, e8, mf4 + vpnclip.wx \v_dst, v1, zero + +.endm + +gemm_int4_ncxhwx_12xpackn: + addi sp, sp, -96 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s5, 40(sp) + sd s6, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) + sd s11, 88(sp) + + ld s0, 96(sp) + + csrr t0, vlenb // t0 = vlen/8 = packn/2 * 4 = 16 + slli t0, t0, 1 // t0 = packn * 4 = 32 + srai s7, t0, 2 // s7 = packn = 8 + vsetvli zero, s7, e32, m2 + + li s9, 12 // divisor lives in s9 (dead until 'mv s9, a1'); s7 must keep packn for the tail vsetvli + divw s10, a5, s9 // s10 = n12 + remw s11, a5, s9 // s11 = n % 12 (n_tail) + + vle32.v v2, (a3) // bias + + beqz s10, packnx8_start // if n12==0, jump to packnx8 + +packnx12_start: + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + vmv.v.v v24, v2 + vmv.v.v v26, v2 + vmv.v.v v28, v2 + vmv.v.v v30, v2 + + mv s9, a1 // 
kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + lwd t5, t6, 16(a2) + + srai s8, a4, 3 // k8(k2) + +packnx12_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + lwd s1, s2, 24(a2) + addi a2, a2, 32 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, t5, v4 + lwd s3, s4, 0(a2) + lwd s5, s6, 8(a2) + vmaqa.vx v18, t6, v4 + vmaqa.vx v20, s1, v4 + vmaqa.vx v22, s2, v4 + lwd t1, t2, 16(a2) + lwd t3, t4, 24(a2) + addi a2, a2, 32 + vmaqa.vx v24, s3, v4 + vmaqa.vx v26, s4, v4 + lwd t5, t6, 0(a2) + vmaqa.vx v28, s5, v4 + vmaqa.vx v30, s6, v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 8(a2) + lwd s3, s4, 16(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + lwd s5, s6, 24(a2) + addi a2, a2, 32 + vmaqa.vx v16, t5, v6 + vmaqa.vx v18, t6, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v20, s1, v6 + vmaqa.vx v22, s2, v6 + lwd t3, t4, 8(a2) + vmaqa.vx v24, s3, v6 + vmaqa.vx v26, s4, v6 + lwd t5, t6, 16(a2) + vmaqa.vx v28, s5, v6 + vmaqa.vx v30, s6, v6 + + addi s8, s8, -1 + bnez s8, packnx12_k2 + +packnx12_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + GEMM_INT4_NCXHWX_REQUANTIZE v16 + GEMM_INT4_NCXHWX_REQUANTIZE v18 + GEMM_INT4_NCXHWX_REQUANTIZE v20 + GEMM_INT4_NCXHWX_REQUANTIZE v22 + GEMM_INT4_NCXHWX_REQUANTIZE v24 + GEMM_INT4_NCXHWX_REQUANTIZE v26 + GEMM_INT4_NCXHWX_REQUANTIZE v28 + GEMM_INT4_NCXHWX_REQUANTIZE v30 + +packnx12_end: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + vse8.v v16, (a0) + add a0, a0, s8 + vse8.v 
v18, (a0) + add a0, a0, s8 + vse8.v v20, (a0) + add a0, a0, s8 + vse8.v v22, (a0) + add a0, a0, s8 + vse8.v v24, (a0) + add a0, a0, s8 + vse8.v v26, (a0) + add a0, a0, s8 + vse8.v v28, (a0) + add a0, a0, s8 + vse8.v v30, (a0) + add a0, a0, s8 + + vsetvli zero, s7, e32, m2 + addi s10, s10, -1 + bnez s10, packnx12_start + +packnx8_start: + andi s10, s11, 8 // s1 = bool_n8 + beqz s10, packnx4_start // if n8==0, jump to packnx4 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s8, a4, 3 // k2 + +packnx8_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + + addi s8, s8, -1 + bnez s8, packnx8_k2 + +packnx8_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + GEMM_INT4_NCXHWX_REQUANTIZE v16 + GEMM_INT4_NCXHWX_REQUANTIZE v18 + GEMM_INT4_NCXHWX_REQUANTIZE v20 + GEMM_INT4_NCXHWX_REQUANTIZE v22 + +packnx8_end: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, 
(a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + vse8.v v16, (a0) + add a0, a0, s8 + vse8.v v18, (a0) + add a0, a0, s8 + vse8.v v20, (a0) + add a0, a0, s8 + vse8.v v22, (a0) + add a0, a0, s8 + +packnx4_start: + andi s10, s11, 4 // s1 = bool_n4 + beqz s10, packnx2_start // if n4==0, jump to packnx2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s8, a4, 3 // k2 + +packnx4_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + + addi s8, s8, -1 + bnez s8, packnx4_k2 + +packnx4_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + +packnx4_end: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + +packnx2_start: + andi s10, s11, 2 // s1 = bool_n2 + beqz s10, packnx1_start // if n2==0, jump to packnx1 + + vsetvli zero, s7, e32, m2 + vmv.v.v v8, v2 + vmv.v.v v10, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + + srai s8, a4, 3 // k2 + +packnx2_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + 
addi a2, a2, 16 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v10, s2, v6 + + addi s8, s8, -1 + bnez s8, packnx2_k2 +packnx2_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + +packnx2_end: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + +packnx1_start: + andi s10, s11, 1 // s1 = bool_n1 + beqz s10, packn_end // if n1==0, jump to packn_end + + vsetvli zero, s7, e32, m2 + vmv.v.v v8, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lw t1, 0(a2) + + srai s8, a4, 3 // k2 + +packnx1_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lw t1, 0(a2) + + addi s8, s8, -1 + bnez s8, packnx1_k2 + +packnx1_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + +packnx1_end: + vse8.v v8, (a0) + add a0, a0, s8 + +packn_end: + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s5, 40(sp) + ld s6, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) + ld s11, 88(sp) + addi sp, sp, 96 + + ret + + +/************************************************************************************************** + + void gemm_int4_ncxhwx_8xpackn(const int8_t *output, + const int8_t *kernel, + const int8_t *input, + const int32_t *bias, + int k, // maxtrix A col / maxtrix B row + int n, // maxtrix B col + int32_t out_zp, + int32_t *mult, + int32_t *shift) + + Algorithm works as follows: + (1) perform matrix-multiplication [packn, k] x [k, n] = 
[packn, n] + ... + + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr + a4: k [kernel_size] + a5: n [out_hw] + a6: out_zp + a7: mult addr + s0: shift addr + + t0 = packn/2 * 4 maintenance kernel_addr + s7 = tmp variable + s8 = k8(k2) input_channel dim loop count + s9 = kernel data addr + s10 = n8 / n4 / n2 / n1 + + t1-t4: hold input data + s1-s4: hold input data + + v2-v3: acc initial = bias + v4-v7: hold kernel data + v8-v19: first packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .section .text.gemm_int4_ncxhwx_8xpackn, "ax", @progbits + .align 5 + .global gemm_int4_ncxhwx_8xpackn + .type gemm_int4_ncxhwx_8xpackn, @function + +gemm_int4_ncxhwx_8xpackn: + addi sp, sp, -80 // 72 bytes of saved regs, padded to 80 so sp stays 16-byte aligned + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s7, 40(sp) + sd s8, 48(sp) + sd s9, 56(sp) + sd s10, 64(sp) + + ld s0, 80(sp) // shift addr: first stack-passed argument, just above this frame + + csrr t0, vlenb // t0 = vlen/8 = packn/2 * 4 = 16 + slli t0, t0, 1 // t0 = packn * 4 = 32 + srai s7, t0, 2 // s7 = packn = 8 + vsetvli zero, s7, e32, m2 + + srai s10, a5, 3 // s10 = n8 + + vle32.v v2, (a3) // bias + + beqz s10, packnx4_start_1 // if n8==0, jump to packnx4 + +packnx8_start_1: + vsetvli zero, s7, e32, m2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s8, a4, 3 // k2 + +packnx8_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, 
v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + + addi s8, s8, -1 + bnez s8, packnx8_k2_1 + +packnx8_post_1: + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + GEMM_INT4_NCXHWX_REQUANTIZE v16 + GEMM_INT4_NCXHWX_REQUANTIZE v18 + GEMM_INT4_NCXHWX_REQUANTIZE v20 + GEMM_INT4_NCXHWX_REQUANTIZE v22 + +packnx8_end_1: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + vse8.v v16, (a0) + add a0, a0, s8 + vse8.v v18, (a0) + add a0, a0, s8 + vse8.v v20, (a0) + add a0, a0, s8 + vse8.v v22, (a0) + add a0, a0, s8 + + addi s10, s10, -1 + bnez s10, packnx8_start_1 + +packnx4_start_1: + andi s10, a5, 4 // s10 = bool_n4 + beqz s10, packnx2_start_1 // if n4==0, jump to packnx2 + + vsetvli zero, s7, e32, m2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s8, a4, 3 // k2 + +packnx4_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + + addi s8, s8, -1 + bnez 
s8, packnx4_k2_1 + +packnx4_post_1: + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + +packnx4_end_1: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + +packnx2_start_1: + andi s10, a5, 2 // s10 = bool_n2 + beqz s10, packnx1_start_1 // if n2==0, jump to packnx1 + + vsetvli zero, s7, e32, m2 + vmv.v.v v8, v2 + vmv.v.v v10, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + + srai s8, a4, 3 // k2 + +packnx2_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + addi a2, a2, 16 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v10, s2, v6 + + addi s8, s8, -1 + bnez s8, packnx2_k2_1 + +packnx2_post_1: + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + +packnx2_end_1: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + +packnx1_start_1: + andi s10, a5, 1 // s10 = bool_n1 + beqz s10, packn_end_1 // if n1==0, jump to packn_end + + vsetvli zero, s7, e32, m2 + vmv.v.v v8, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lw t1, 0(a2) + + srai s8, a4, 3 // k2 + +packnx1_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lw t1, 0(a2) + + addi s8, s8, -1 + 
bnez s8, packnx1_k2_1 + +packnx1_post_1: + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + +packnx1_end_1: + vse8.v v8, (a0) + add a0, a0, s8 + +packn_end_1: + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s7, 40(sp) + ld s8, 48(sp) + ld s9, 56(sp) + ld s10, 64(sp) + addi sp, sp, 80 // release the padded 80-byte frame + + ret + .end diff --git a/source/c908_opt/gemm_kernel/gemm_int8_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_int8_ncxhwx.S new file mode 100644 index 00000000..b5e94fd1 --- /dev/null +++ b/source/c908_opt/gemm_kernel/gemm_int8_ncxhwx.S @@ -0,0 +1,1078 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void gemm_int8_ncxhwx_12xpackn(const int8_t *output, + const int8_t *kernel, + const int8_t *input, + const int32_t *bias, + int m, // matrix A row + int k, // matrix A col / matrix B row + int n, // matrix B col + int32_t out_zp, + int32_t *mult, + int32_t *shift) + + Algorithm works as follows: + (1) perform matrix-multiplication [packn, k] x [k, n] = [packn, n] + ... 
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr + a4: m [packn or tail_packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: out_zp + + s7: mult addr + s8: shift addr + + t0 = packn/2 * 4 maintenance kernel_addr + s0 = tmp variable [k8(k2) input_channel dim loop count] ... + s9 = kernel data addr + s10 = n12 + s11 = n_tail + + t1-t6: hold input data + s1-s6: hold input data + + v2-v3: acc initial = bias + v4-v7: hold kernel data + v8-v19: fisrt packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .file "gemm_int8_ncxhwx.S" + .section .text.gemm_int8_ncxhwx_12xpackn, "ax", @progbits + .align 5 + .global gemm_int8_ncxhwx_12xpackn + .type gemm_int8_ncxhwx_12xpackn, @function + +.macro GEMM_INT8_NCXHWX_REQUANTIZE v_dst + vsetvli zero, a4, e32, m2 + vmulh.vv \v_dst, \v_dst, v4 // * mult + vssra.vv \v_dst, \v_dst, v6 // shift + vadd.vx \v_dst, \v_dst, a7 // + out_zp + vsetvli zero, a4, e16, m1 + vnclip.wi v0, \v_dst, 0 + vsetvli zero, a4, e8, mf2 + vnclip.wi \v_dst, v0, 0 +.endm + +gemm_int8_ncxhwx_12xpackn: + addi sp, sp, -96 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s5, 40(sp) + sd s6, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) + sd s11, 88(sp) + + ld s7, 96(sp) + ld s8, 104(sp) + + slli t0, a4, 2 // t0 = packn * 4 = 32 + vsetvli zero, a4, e32, m2 + + li s0, 12 + divw s10, a6, s0 // s10 = n12 + remw s11, a6, s0 // s11 = n % 12 (n_tail) + + vle32.v v2, (a3) // bias + + beqz s10, packnx8_start // if n12==0, jump to packnx8 + +packnx12_start: + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + vmv.v.v v24, v2 + vmv.v.v v26, v2 + vmv.v.v v28, v2 + vmv.v.v v30, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + 
+ // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + lwd t5, t6, 16(a2) + + srai s0, a5, 3 // k8(k2) + beqz s0, packnx12_k1 + +packnx12_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + lwd s1, s2, 24(a2) + addi a2, a2, 32 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, t5, v4 + lwd s3, s4, 0(a2) + lwd s5, s6, 8(a2) + vmaqa.vx v18, t6, v4 + vmaqa.vx v20, s1, v4 + vmaqa.vx v22, s2, v4 + lwd t1, t2, 16(a2) + lwd t3, t4, 24(a2) + addi a2, a2, 32 + vmaqa.vx v24, s3, v4 + vmaqa.vx v26, s4, v4 + lwd t5, t6, 0(a2) + vmaqa.vx v28, s5, v4 + vmaqa.vx v30, s6, v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 8(a2) + lwd s3, s4, 16(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + lwd s5, s6, 24(a2) + addi a2, a2, 32 + vmaqa.vx v16, t5, v6 + vmaqa.vx v18, t6, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v20, s1, v6 + vmaqa.vx v22, s2, v6 + lwd t3, t4, 8(a2) + vmaqa.vx v24, s3, v6 + vmaqa.vx v26, s4, v6 + lwd t5, t6, 16(a2) + vmaqa.vx v28, s5, v6 + vmaqa.vx v30, s6, v6 + + addi s0, s0, -1 + bnez s0, packnx12_k2 + +packnx12_k1: + andi s0, a5, 4 // k4(k1) + beqz s0, packnx12_post + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + lwd s1, s2, 24(a2) + addi a2, a2, 32 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, t5, v4 + lwd s3, s4, 0(a2) + lwd s5, s6, 8(a2) + vmaqa.vx v18, t6, v4 + vmaqa.vx v20, s1, v4 + vmaqa.vx v22, s2, v4 + addi a2, a2, 16 + vmaqa.vx v24, s3, v4 + vmaqa.vx v26, s4, v4 + vmaqa.vx v28, s5, v4 + vmaqa.vx v30, s6, v4 + +packnx12_post: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + GEMM_INT8_NCXHWX_REQUANTIZE v12 + GEMM_INT8_NCXHWX_REQUANTIZE v14 + GEMM_INT8_NCXHWX_REQUANTIZE v16 + GEMM_INT8_NCXHWX_REQUANTIZE v18 + GEMM_INT8_NCXHWX_REQUANTIZE v20 + GEMM_INT8_NCXHWX_REQUANTIZE v22 + 
GEMM_INT8_NCXHWX_REQUANTIZE v24 + GEMM_INT8_NCXHWX_REQUANTIZE v26 + GEMM_INT8_NCXHWX_REQUANTIZE v28 + GEMM_INT8_NCXHWX_REQUANTIZE v30 + +/* + vmulh.vv v8, v8, v4 // * mult + vssra.vv v8, v8, v6 // shift + vadd.vx v8, v8, a6 // + out_zp + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v8, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v8, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v10, v10, v4 + vssra.vv v10, v10, v6 + vadd.vx v10, v10, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v10, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v10, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v12, v12, v4 + vssra.vv v12, v12, v6 + vadd.vx v12, v12, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v12, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v12, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v14, v14, v4 + vssra.vv v14, v14, v6 + vadd.vx v14, v14, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v14, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v14, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v16, v16, v4 + vssra.vv v16, v16, v6 + vadd.vx v16, v16, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v16, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v16, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v18, v18, v4 + vssra.vv v18, v18, v6 + vadd.vx v18, v18, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v18, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v18, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v20, v20, v4 + vssra.vv v20, v20, v6 + vadd.vx v20, v20, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v20, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v20, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v22, v22, v4 + vssra.vv v22, v22, v6 + vadd.vx v22, v22, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v22, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v22, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v24, v24, v4 + vssra.vv v24, v24, v6 + vadd.vx v24, v24, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v24, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v24, v0, 0 + + vsetvli zero, s7, e32, m2 + 
vmulh.vv v26, v26, v4 + vssra.vv v26, v26, v6 + vadd.vx v26, v26, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v26, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v26, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v28, v28, v4 + vssra.vv v28, v28, v6 + vadd.vx v28, v28, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v28, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v28, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v30, v30, v4 + vssra.vv v30, v30, v6 + vadd.vx v30, v30, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v30, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v30, v0, 0 +*/ + +packnx12_end: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + vse8.v v12, (a0) + add a0, a0, a4 + vse8.v v14, (a0) + add a0, a0, a4 + vse8.v v16, (a0) + add a0, a0, a4 + vse8.v v18, (a0) + add a0, a0, a4 + vse8.v v20, (a0) + add a0, a0, a4 + vse8.v v22, (a0) + add a0, a0, a4 + vse8.v v24, (a0) + add a0, a0, a4 + vse8.v v26, (a0) + add a0, a0, a4 + vse8.v v28, (a0) + add a0, a0, a4 + vse8.v v30, (a0) + add a0, a0, a4 + + vsetvli zero, a4, e32, m2 + addi s10, s10, -1 + bnez s10, packnx12_start + +packnx8_start: + andi s10, s11, 8 // s1 = bool_n8 + beqz s10, packnx4_start // if n8==0, jump to packnx4 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx8_k1 + +packnx8_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + 
vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + + addi s0, s0, -1 + bnez s0, packnx8_k2 + +packnx8_k1: + andi s0, a5, 4 // k1 + beqz s0, packnx8_post + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + +packnx8_post: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + GEMM_INT8_NCXHWX_REQUANTIZE v12 + GEMM_INT8_NCXHWX_REQUANTIZE v14 + GEMM_INT8_NCXHWX_REQUANTIZE v16 + GEMM_INT8_NCXHWX_REQUANTIZE v18 + GEMM_INT8_NCXHWX_REQUANTIZE v20 + GEMM_INT8_NCXHWX_REQUANTIZE v22 + +packnx8_end: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + vse8.v v12, (a0) + add a0, a0, a4 + vse8.v v14, (a0) + add a0, a0, a4 + vse8.v v16, (a0) + add a0, a0, a4 + vse8.v v18, (a0) + add a0, a0, a4 + vse8.v v20, (a0) + add a0, a0, a4 + vse8.v v22, (a0) + add a0, a0, a4 + +packnx4_start: + andi s10, s11, 4 // s1 = bool_n4 + beqz s10, packnx2_start // if n4==0, jump to packnx2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx4_k1 + +packnx4_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, 
s1, v6 + vmaqa.vx v10, s2, v6 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + + addi s0, s0, -1 + bnez s0, packnx4_k2 + +packnx4_k1: + andi s0, a5, 4 // k1 + beqz s0, packnx4_post + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 16 + +packnx4_post: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + GEMM_INT8_NCXHWX_REQUANTIZE v12 + GEMM_INT8_NCXHWX_REQUANTIZE v14 + +packnx4_end: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + vse8.v v12, (a0) + add a0, a0, a4 + vse8.v v14, (a0) + add a0, a0, a4 + +packnx2_start: + andi s10, s11, 2 // s1 = bool_n2 + beqz s10, packnx1_start // if n2==0, jump to packnx1 + + vsetvli zero, a4, e32, m2 + vmv.v.v v8, v2 + vmv.v.v v10, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx2_k1 + +packnx2_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + addi a2, a2, 16 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v10, s2, v6 + + addi s0, s0, -1 + bnez s0, packnx2_k2 + +packnx2_k1: + andi s0, a5, 4 // k1 + beqz s0, packnx2_post + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + addi a2, a2, 8 + +packnx2_post: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + +packnx2_end: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + +packnx1_start: + andi s10, s11, 1 // s1 = bool_n1 + beqz s10, packn_end // if n1==0, jump to packn_end + + vsetvli zero, a4, e32, m2 + 
vmv.v.v v8, v2
+
+    mv s9, a1 // kernel origin addr
+    // pre-load kernel_data
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+    // pre-load input_data
+    lw t1, 0(a2)
+
+    srai s0, a5, 3 // k2
+    beqz s0, packnx1_k1
+
+packnx1_k2:
+    vle32.v v6, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, t1, v4
+    lw s1, 4(a2)
+    addi a2, a2, 8
+
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, s1, v6
+    lw t1, 0(a2)
+
+    addi s0, s0, -1
+    bnez s0, packnx1_k2
+
+packnx1_k1:
+    andi s0, a5, 4 // k1
+    beqz s0, packnx1_post
+
+    vmaqa.vx v8, t1, v4
+    addi a2, a2, 4
+
+packnx1_post:
+    vsetvli zero, a4, e32, m2 // set vl = 8
+    vle32.v v4, (s7) // mult
+    vle32.v v6, (s8) // shift
+    vxor.vi v6, v6, -1
+
+    GEMM_INT8_NCXHWX_REQUANTIZE v8
+
+packnx1_end:
+    vse8.v v8, (a0)
+    add a0, a0, a4
+
+packn_end:
+    ld s0, 0(sp)
+    ld s1, 8(sp)
+    ld s2, 16(sp)
+    ld s3, 24(sp)
+    ld s4, 32(sp)
+    ld s5, 40(sp)
+    ld s6, 48(sp)
+    ld s7, 56(sp)
+    ld s8, 64(sp)
+    ld s9, 72(sp)
+    ld s10, 80(sp)
+    ld s11, 88(sp)
+    addi sp, sp, 96
+
+    ret
+
+
+/**************************************************************************************************
+
+    void gemm_int8_ncxhwx_8xpackn(const int8_t *output,
+                                  const int8_t *kernel,
+                                  const int8_t *input,
+                                  const int32_t *bias,
+                                  int m, // matrix A row
+                                  int k, // matrix A col / matrix B row
+                                  int n, // matrix B col
+                                  int32_t out_zp,
+                                  int32_t *mult,
+                                  int32_t *shift)
+
+    Algorithm works as follows:
+        (1) perform matrix-multiplication [packn, k] x [k, n] = [packn, n]
+            ...
+
+    register definition:
+        a0: output addr
+        a1: kernel addr
+        a2: input addr
+        a3: bias addr
+        a4: m [packn or tail_packn]
+        a5: k [kernel_size]
+        a6: n [out_hw]
+        a7: out_zp
+
+        s7: mult addr   (9th argument, read from the caller's stack area)
+        s8: shift addr  (10th argument, read from the caller's stack area)
+
+        t0 = m * 4: byte step used to advance the kernel address (s9)
+        s0 = tmp variable [k8(k2) input_channel dim loop count]
+        s9 = kernel data addr
+        s10 = n8 / n4 / n2 / n1
+
+        t1-t4: hold input data
+        s1-s4: hold input data
+
+        v2-v3: acc initial = bias
+        v4-v7: hold kernel data
+        v8-v23: accumulators, one m2 register pair per output column
+                (v8, v10, ..., v22); v24-v31 are not referenced directly here
+
+    stack frame: 80 bytes (9 callee-saved registers, padded so that sp stays
+                 16-byte aligned as the RISC-V psABI requires)
+
+ *************************************************************************************************/
+    .section .text.gemm_int8_ncxhwx_8xpackn, "ax", @progbits
+    .align 5
+    .global gemm_int8_ncxhwx_8xpackn
+    .type gemm_int8_ncxhwx_8xpackn, @function
+
+gemm_int8_ncxhwx_8xpackn:
+    addi sp, sp, -80 // 72 bytes of saves rounded up to keep sp 16-byte aligned
+    sd s0, 0(sp)
+    sd s1, 8(sp)
+    sd s2, 16(sp)
+    sd s3, 24(sp)
+    sd s4, 32(sp)
+    sd s7, 40(sp)
+    sd s8, 48(sp)
+    sd s9, 56(sp)
+    sd s10, 64(sp)
+
+    ld s7, 80(sp) // mult: first stack argument, just above this frame
+    ld s8, 88(sp) // shift: second stack argument
+
+    slli t0, a4, 2 // t0 = packn * 4 = 32
+    vsetvli zero, a4, e32, m2
+
+    srai s10, a6, 3 // s10 = n8
+
+    vle32.v v2, (a3) // bias
+
+    beqz s10, packnx4_start_1 // if n8==0, jump to packnx4
+
+packnx8_start_1:
+    vsetvli zero, a4, e32, m2
+
+    vmv.v.v v8, v2
+    vmv.v.v v10, v2
+    vmv.v.v v12, v2
+    vmv.v.v v14, v2
+    vmv.v.v v16, v2
+    vmv.v.v v18, v2
+    vmv.v.v v20, v2
+    vmv.v.v v22, v2
+
+    mv s9, a1 // kernel origin addr
+    // pre-load kernel_data
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+    // pre-load input_data
+    lwd t1, t2, 0(a2)
+    lwd t3, t4, 8(a2)
+
+    srai s0, a5, 3 // k2
+    beqz s0, packnx8_k1_1
+
+packnx8_k2_1:
+    vle32.v v6, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, t1, v4
+    vmaqa.vx v10, t2, v4
+    lwd s1, s2, 16(a2)
+    lwd s3, s4, 24(a2)
+    vmaqa.vx v12, t3, v4
+    vmaqa.vx v14, t4, v4
+    vmaqa.vx v16, s1, v4
+    addi a2, a2, 32
+    lwd t1, t2, 0(a2)
+    lwd t3, t4, 8(a2)
+    vmaqa.vx v18, s2, v4
+    vmaqa.vx v20, s3, v4
+    vmaqa.vx v22, s4, v4
+
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, t1, v6
+    vmaqa.vx v10, t2, v6
+    lwd s1, s2, 16(a2)
+    lwd s3, s4, 24(a2)
+    vmaqa.vx v12, t3, v6
+    vmaqa.vx v14, t4, v6
+    vmaqa.vx v16, s1, v6
+    addi a2, a2, 32
+    lwd t1, t2, 0(a2)
+    lwd t3, t4, 8(a2)
+    vmaqa.vx v18, s2, v6
+    vmaqa.vx v20, s3, v6
+    vmaqa.vx v22, s4, v6
+
+    addi s0, s0, -1
+    bnez s0, packnx8_k2_1
+
+packnx8_k1_1:
+    andi s0, a5, 4 // k1
+    beqz s0, packnx8_post_1
+
+    vmaqa.vx v8, t1, v4
+    vmaqa.vx v10, t2, v4
+    lwd s1, s2, 16(a2)
+    lwd s3, s4, 24(a2)
+    vmaqa.vx v12, t3, v4
+    vmaqa.vx v14, t4, v4
+    vmaqa.vx v16, s1, v4
+    addi a2, a2, 32
+    vmaqa.vx v18, s2, v4
+    vmaqa.vx v20, s3, v4
+    vmaqa.vx v22, s4, v4
+
+packnx8_post_1:
+    vsetvli zero, a4, e32, m2 // set vl = 8
+    vle32.v v4, (s7) // mult
+    vle32.v v6, (s8) // shift
+    vxor.vi v6, v6, -1
+
+    GEMM_INT8_NCXHWX_REQUANTIZE v8
+    GEMM_INT8_NCXHWX_REQUANTIZE v10
+    GEMM_INT8_NCXHWX_REQUANTIZE v12
+    GEMM_INT8_NCXHWX_REQUANTIZE v14
+    GEMM_INT8_NCXHWX_REQUANTIZE v16
+    GEMM_INT8_NCXHWX_REQUANTIZE v18
+    GEMM_INT8_NCXHWX_REQUANTIZE v20
+    GEMM_INT8_NCXHWX_REQUANTIZE v22
+
+packnx8_end_1:
+    vse8.v v8, (a0)
+    add a0, a0, a4
+    vse8.v v10, (a0)
+    add a0, a0, a4
+    vse8.v v12, (a0)
+    add a0, a0, a4
+    vse8.v v14, (a0)
+    add a0, a0, a4
+    vse8.v v16, (a0)
+    add a0, a0, a4
+    vse8.v v18, (a0)
+    add a0, a0, a4
+    vse8.v v20, (a0)
+    add a0, a0, a4
+    vse8.v v22, (a0)
+    add a0, a0, a4
+
+    addi s10, s10, -1
+    bnez s10, packnx8_start_1
+
+packnx4_start_1:
+    andi s10, a6, 4 // s10 = bool_n4
+    beqz s10, packnx2_start_1 // if n4==0, jump to packnx2
+
+    vsetvli zero, a4, e32, m2
+
+    vmv.v.v v8, v2
+    vmv.v.v v10, v2
+    vmv.v.v v12, v2
+    vmv.v.v v14, v2
+
+    mv s9, a1 // kernel origin addr
+    // pre-load kernel_data
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+    // pre-load input_data
+    lwd t1, t2, 0(a2)
+    lwd t3, t4, 8(a2)
+
+    srai s0, a5, 3 // k2
+    beqz s0, packnx4_k1_1
+
+packnx4_k2_1:
+    vle32.v v6, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, t1, v4
+    vmaqa.vx v10, t2, v4
+    lwd s1, s2, 16(a2)
+    lwd s3, s4, 24(a2)
+    vmaqa.vx v12, t3, v4
+    vmaqa.vx v14, t4, v4
+    addi a2, a2, 32
+
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, s1, v6
+    vmaqa.vx v10, s2, v6
+    lwd t1, t2, 0(a2)
+    lwd t3, t4, 8(a2)
+    vmaqa.vx v12, s3, v6
+    vmaqa.vx v14, s4, v6
+
+    addi s0, s0, -1
+    bnez s0, packnx4_k2_1
+
+packnx4_k1_1:
+    andi s0, a5, 4 // k1
+    beqz s0, packnx4_post_1
+
+    vmaqa.vx v8, t1, v4
+    vmaqa.vx v10, t2, v4
+    vmaqa.vx v12, t3, v4
+    vmaqa.vx v14, t4, v4
+    addi a2, a2, 16
+
+packnx4_post_1:
+    vsetvli zero, a4, e32, m2 // set vl = 8
+    vle32.v v4, (s7) // mult
+    vle32.v v6, (s8) // shift
+    vxor.vi v6, v6, -1
+
+    GEMM_INT8_NCXHWX_REQUANTIZE v8
+    GEMM_INT8_NCXHWX_REQUANTIZE v10
+    GEMM_INT8_NCXHWX_REQUANTIZE v12
+    GEMM_INT8_NCXHWX_REQUANTIZE v14
+
+packnx4_end_1:
+    vse8.v v8, (a0)
+    add a0, a0, a4
+    vse8.v v10, (a0)
+    add a0, a0, a4
+    vse8.v v12, (a0)
+    add a0, a0, a4
+    vse8.v v14, (a0)
+    add a0, a0, a4
+
+packnx2_start_1:
+    andi s10, a6, 2 // s10 = bool_n2
+    beqz s10, packnx1_start_1 // if n2==0, jump to packnx1
+
+    vsetvli zero, a4, e32, m2
+    vmv.v.v v8, v2
+    vmv.v.v v10, v2
+
+    mv s9, a1 // kernel origin addr
+    // pre-load kernel_data
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+    // pre-load input_data
+    lwd t1, t2, 0(a2)
+
+    srai s0, a5, 3 // k2
+    beqz s0, packnx2_k1_1
+
+packnx2_k2_1:
+    vle32.v v6, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, t1, v4
+    lwd s1, s2, 8(a2)
+    vmaqa.vx v10, t2, v4
+    addi a2, a2, 16
+
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, s1, v6
+    lwd t1, t2, 0(a2)
+    vmaqa.vx v10, s2, v6
+
+    addi s0, s0, -1
+    bnez s0, packnx2_k2_1
+
+packnx2_k1_1:
+    andi s0, a5, 4 // k1
+    beqz s0, packnx2_post_1
+
+    vmaqa.vx v8, t1, v4
+    vmaqa.vx v10, t2, v4
+    addi a2, a2, 8
+
+packnx2_post_1:
+    vsetvli zero, a4, e32, m2 // set vl = 8
+    vle32.v v4, (s7) // mult
+    vle32.v v6, (s8) // shift
+    vxor.vi v6, v6, -1
+
+    GEMM_INT8_NCXHWX_REQUANTIZE v8
+    GEMM_INT8_NCXHWX_REQUANTIZE v10
+
+packnx2_end_1:
+    vse8.v v8, (a0)
+    add a0, a0, a4
+    vse8.v v10, (a0)
+    add a0, a0, a4
+
+packnx1_start_1:
+    andi s10, a6, 1 // s10 = bool_n1
+    beqz s10, packn_end_1 // if n1==0, jump to packn_end
+
+    vsetvli zero, a4, e32, m2
+    vmv.v.v v8, v2
+
+    mv s9, a1 // kernel origin addr
+    // pre-load kernel_data
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+    // pre-load input_data
+    lw t1, 0(a2)
+
+    srai s0, a5, 3 // k2
+    beqz s0, packnx1_k1_1
+
+packnx1_k2_1:
+    vle32.v v6, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, t1, v4
+    lw s1, 4(a2)
+    addi a2, a2, 8
+
+    vle32.v v4, (s9)
+    add s9, s9, t0 // +packn
+
+    vmaqa.vx v8, s1, v6
+    lw t1, 0(a2)
+
+    addi s0, s0, -1
+    bnez s0, packnx1_k2_1
+
+packnx1_k1_1:
+    andi s0, a5, 4 // k1
+    beqz s0, packnx1_post_1
+
+    vmaqa.vx v8, t1, v4
+    addi a2, a2, 4
+
+packnx1_post_1:
+    vsetvli zero, a4, e32, m2 // set vl = 8
+    vle32.v v4, (s7) // mult
+    vle32.v v6, (s8) // shift
+    vxor.vi v6, v6, -1
+
+    GEMM_INT8_NCXHWX_REQUANTIZE v8
+
+packnx1_end_1:
+    vse8.v v8, (a0)
+    add a0, a0, a4
+
+packn_end_1:
+    ld s0, 0(sp)
+    ld s1, 8(sp)
+    ld s2, 16(sp)
+    ld s3, 24(sp)
+    ld s4, 32(sp)
+    ld s7, 40(sp)
+    ld s8, 48(sp)
+    ld s9, 56(sp)
+    ld s10, 64(sp)
+    addi sp, sp, 80 // must match the frame size allocated in the prologue
+
+    ret
+    .end
diff --git a/source/c908_opt/maxpool.c b/source/c908_opt/maxpool.c
new file mode 100644
index 00000000..9a12d421
--- /dev/null
+++ b/source/c908_opt/maxpool.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +int shl_c908_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(float); + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp32 + : shl_rvv_global_maxpool2d_fp32; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp32 + : shl_rvv_maxpool2x2s2_fp32; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp32 + : shl_rvv_maxpool2x2s2_p1_fp32; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
+ if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp32 + : shl_rvv_maxpool3x3s2_fp32; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp32 + : shl_rvv_maxpool3x3s2_p1_fp32; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s1_packn_fp32 + : shl_rvv_maxpool3x3s1_p1_fp32; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_f32; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_fp16 + : shl_rvv_global_maxpool2d_fp16; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp16 + : shl_rvv_maxpool2x2s2_fp16; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp16 + : shl_rvv_maxpool2x2s2_p1_fp16; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp16 + : shl_rvv_maxpool3x3s2_fp16; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp16 + : shl_rvv_maxpool3x3s2_p1_fp16; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool3x3s1_packn_fp16 + : shl_rvv_maxpool3x3s1_p1_fp16; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_int8 + : shl_ref_global_maxpool2d_quant; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_int8 + : shl_rvv_maxpool2x2s2_int8; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool2x2s2_packn_int8 + : shl_rvv_maxpool2x2s2_p1_int8; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_int8 + : shl_rvv_maxpool3x3s2_int8; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_int8 + : shl_rvv_maxpool3x3s2_p1_int8; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s1_packn_int8 + : shl_rvv_maxpool3x3s1_p1_int8; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_maxpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + return CSINN_FALSE; +} diff --git a/source/c908_opt/reorder.c b/source/c908_opt/reorder.c new file mode 100644 index 00000000..86392547 --- /dev/null +++ b/source/c908_opt/reorder.c @@ -0,0 +1,1128 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+/************************************************************************
+ * reorder kernel matrix
+ *
+ * The three functions below forward their arguments unchanged to the
+ * generic RVV implementations (shl_rvv_reorder_kernel_n8_*).
+ ***********************************************************************/
+// vlen=128
+void shl_c908_reorder_kernel_n8_fp32(float *src, float *dst, int m, int k, int ldc)
+{
+    shl_rvv_reorder_kernel_n8_fp32(src, dst, m, k, ldc);
+}
+
+void shl_c908_reorder_kernel_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldc)
+{
+    shl_rvv_reorder_kernel_n8_fp16(src, dst, m, k, ldc);
+}
+
+void shl_c908_reorder_kernel_n8_int8(int8_t *src, int8_t *dst, int m, int k, int ldc)
+{
+    shl_rvv_reorder_kernel_n8_int8(src, dst, m, k, ldc);
+}
+
+/************************************************************************
+ * reorder input matrix
+ ***********************************************************************/
+// vlen=128
+/**************************************************************
+ * input-matrix: [k, n], consecutive rows are ldc elements apart
+ * Data arrangement: Z8 Z4 Z4_tail
+ *
+ * Columns are consumed in panels of 8, then 4, then (n & 3).
+ * For each panel, the k rows of that panel are written to dst
+ * back to back, so dst is filled strictly sequentially.
+ * Every inner loop executes its body before testing the row
+ * counter, so k is presumably expected to be >= 1.
+ **************************************************************/
+void shl_c908_reorder_input_z8_fp32(float *src, float *dst, int k, int n, int ldc)
+{
+    asm volatile(
+        "li a0, 8\n\t"
+        "srai t0, %[n], 3\n\t"    // t0 = n8 (number of 8-column panels)
+        "andi t1, %[n], 7\n\t"    // t1 = n & 7
+        "slli t2, %[ldc], 2\n\t"  // t2 = ldc * 4 (line stride in bytes)
+
+        "beqz t0, 3f\n\t"                // if n8 == 0, jump to packn4
+        "vsetvli zero, a0, e32, m2\n\t"  // set vl = 8
+
+        "1:\n\t"  // n8
+        "mv a0, %[src]\n\t"               // a0 = read cursor for this panel (a0 is reused after vsetvli)
+        "addi %[src], %[src], 32\n\t"  // src_ptr += 8
+        "mv t3, %[k]\n\t"              // k
+
+        "2:\n\t"
+        // start packn8k1: copy one row of 8 floats
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 32\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 2b\n\t"
+
+        "addi t0, t0, -1\n\t"
+        "bnez t0, 1b\n\t"
+
+        "3:\n\t"                // n4
+        "andi t0, t1, 4\n\t"  // n & 4u
+        "beqz t0, 5f\n\t"
+
+        "vsetvli zero, t0, e32, m1\n\t"  // set vl = 4 (t0 holds 4 here)
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 16\n\t"  // src_ptr += 4
+        "mv t3, %[k]\n\t"              // k
+
+        "4:\n\t"
+        // start packn4k1: copy one row of 4 floats
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 16\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 4b\n\t"
+
+        "5:\n\t"                // n_tail
+        "andi t0, t1, 3\n\t"  // n & 3u
+        "beqz t0, 7f\n\t"
+        "slli t4, t0, 2\n\t"  // t4 = 4 * n_tail (bytes written per row)
+
+        "vsetvli zero, t0, e32, m1\n\t"  // set vl = n_tail
+        "mv a0, %[src]\n\t"
+        "mv t3, %[k]\n\t"  // k
+
+        "6:\n\t"
+        // start packn_tailk1: copy one row of n_tail floats
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "add %[dst], %[dst], t4\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 6b\n\t"
+
+        "7:\n\t"  // ending
+
+        : [src] "+r"(src), [dst] "+r"(dst)
+
+        : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc)
+
+        : "cc", "memory", "v4", "v5", "a0", "t0", "t1", "t2", "t3", "t4");
+}
+
+/**************************************************************
+ * input-matrix: [k, n], consecutive rows are ldc elements apart
+ * Data arrangement: Z12 Z8 Z4 Z4_tail
+ *
+ * Columns are consumed in panels of 12; the remainder r = n % 12
+ * is then split into an 8-column panel (r & 8), a 4-column panel
+ * (r & 4) and a tail of (r & 3) columns. Each panel stores its
+ * k rows back to back in dst.
+ **************************************************************/
+void shl_c908_reorder_input_z12_fp32(float *src, float *dst, int k, int n, int ldc)
+{
+    asm volatile(
+        "li a1, 12\n\t"
+        "divw t0, %[n], a1\n\t"   // t0 = n12 (number of 12-column panels)
+        "remw t1, %[n], a1\n\t"   // t1 = n % 12
+        "slli t2, %[ldc], 2\n\t"  // t2 = ldc * 4 (line stride in bytes)
+
+        "beqz t0, 3f\n\t"                // if n12 == 0, jump to packn8
+        "vsetvli zero, a1, e32, m4\n\t"  // set vl = 12
+
+        "1:\n\t"  // n12
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 48\n\t"  // src_ptr += 12
+        "mv t3, %[k]\n\t"              // k
+
+        "2:\n\t"
+        // start packn12k1: copy one row of 12 floats
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 48\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 2b\n\t"
+
+        "addi t0, t0, -1\n\t"
+        "bnez t0, 1b\n\t"
+
+        "3:\n\t"                // n8
+        "andi t0, t1, 8\n\t"  // (n % 12) & 8u
+        "beqz t0, 5f\n\t"
+
+        "vsetvli zero, t0, e32, m2\n\t"  // set vl = 8
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 32\n\t"  // src_ptr += 8
+        "mv t3, %[k]\n\t"              // k
+
+        "4:\n\t"
+        // start packn8k1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 32\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 4b\n\t"
+
+        "5:\n\t"                // n4
+        "andi t0, t1, 4\n\t"  // (n % 12) & 4u
+        "beqz t0, 7f\n\t"
+
+        "vsetvli zero, t0, e32, m1\n\t"  // set vl = 4
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 16\n\t"  // src_ptr += 4
+        "mv t3, %[k]\n\t"              // k
+
+        "6:\n\t"
+        // start packn4k1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 16\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 6b\n\t"
+
+        "7:\n\t"                // n_tail
+        "andi t0, t1, 3\n\t"  // (n % 12) & 3u
+        "beqz t0, 9f\n\t"
+        "slli t4, t0, 2\n\t"  // t4 = 4 * n_tail (bytes written per row)
+
+        "vsetvli zero, t0, e32, m1\n\t"  // set vl = n_tail
+        "mv a0, %[src]\n\t"
+        "mv t3, %[k]\n\t"  // k
+
+        "8:\n\t"
+        // start packn_tailk1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "add %[dst], %[dst], t4\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 8b\n\t"
+
+        "9:\n\t"  // ending
+
+        : [src] "+r"(src), [dst] "+r"(dst)
+
+        : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc)
+
+        : "cc", "memory", "v4", "v5", "v6", "v7", "a0", "a1", "t0", "t1", "t2", "t3", "t4");
+}
+
+/**************************************************************
+ * input-matrix: [k, n], consecutive rows are ldc elements apart
+ * Data arrangement: Z16 Z8 Z8_tail
+ *
+ * fp16 counterpart of the z8 fp32 routine: panels of 16 columns,
+ * then 8, then (n & 7); each panel stores its k rows back to
+ * back in dst.
+ **************************************************************/
+void shl_c908_reorder_input_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldc)
+{
+    asm volatile(
+        "li a0, 16\n\t"
+        "srai t0, %[n], 4\n\t"    // t0 = n16 (number of 16-column panels)
+        "andi t1, %[n], 15\n\t"   // t1 = n & 15
+        "slli t2, %[ldc], 1\n\t"  // t2 = ldc * 2 (line stride in bytes)
+
+        "beqz t0, 3f\n\t"                // if n16 == 0, jump to packn8
+        "vsetvli zero, a0, e16, m2\n\t"  // set vl = 16
+
+        "1:\n\t"  // n16
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 32\n\t"  // src_ptr += 16
+        "mv t3, %[k]\n\t"              // k
+
+        "2:\n\t"
+        // start packn16k1
+        "vle16.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse16.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 32\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 2b\n\t"
+
+        "addi t0, t0, -1\n\t"
+        "bnez t0, 1b\n\t"
+
+        "3:\n\t"                // n8
+        "andi t0, t1, 8\n\t"  // n & 8u
+        "beqz t0, 5f\n\t"
+
+        "vsetvli zero, t0, e16, m1\n\t"  // set vl = 8
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 16\n\t"  // src_ptr += 8
+        "mv t3, %[k]\n\t"              // k
+
+        "4:\n\t"
+        // start packn8k1
+        "vle16.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse16.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 16\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 4b\n\t"
+
+        "5:\n\t"                // n_tail
+        "andi t0, t1, 7\n\t"  // n & 7u
+        "beqz t0, 7f\n\t"
+        "slli t4, t0, 1\n\t"  // t4 = 2 * n_tail (bytes written per row)
+
+        "vsetvli zero, t0, e16, m1\n\t"  // set vl = n_tail
+        "mv a0, %[src]\n\t"
+        "mv t3, %[k]\n\t"  // k
+
+        "6:\n\t"
+        // start packn_tailk1
+        "vle16.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse16.v v4, (%[dst])\n\t"
+        "add %[dst], %[dst], t4\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 6b\n\t"
+
+        "7:\n\t"  // ending
+
+        : [src] "+r"(src), [dst] "+r"(dst)
+
+        : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc)
+
+        : "cc", "memory", "v4", "v5", "a0", "t0", "t1", "t2", "t3", "t4");
+}
+
+/**************************************************************
+ * input-matrix: [k, n], consecutive rows are ldc elements apart
+ * Data arrangement: Z24 Z16 Z8 Z8_tail
+ *
+ * Panels of 24 columns; the remainder r = n % 24 is split into
+ * a 16-column panel (r & 16), an 8-column panel (r & 8) and a
+ * tail of (r & 7) columns.
+ **************************************************************/
+void shl_c908_reorder_input_z24_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldc)
+{
+    asm volatile(
+        "li a1, 24\n\t"
+        "divw t0, %[n], a1\n\t"   // t0 = n24 (number of 24-column panels)
+        "remw t1, %[n], a1\n\t"   // t1 = n % 24
+        "slli t2, %[ldc], 1\n\t"  // t2 = ldc * 2 (line stride in bytes)
+
+        "beqz t0, 3f\n\t"                // if n24 == 0, jump to packn16
+        "vsetvli zero, a1, e16, m4\n\t"  // set vl = 24
+
+        "1:\n\t"  // n24
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 48\n\t"  // src_ptr += 24
+        "mv t3, %[k]\n\t"              // k
+
+        "2:\n\t"
+        // start packn24k1
+        "vle16.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse16.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 48\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 2b\n\t"
+
+        "addi t0, t0, -1\n\t"
+        "bnez t0, 1b\n\t"
+
+        "3:\n\t"                 // n16
+        "andi t0, t1, 16\n\t"  // (n % 24) & 16u
+        "beqz t0, 5f\n\t"
+
+        "vsetvli zero, t0, e16, m2\n\t"  // set vl = 16
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 32\n\t"  // src_ptr += 16
+        "mv t3, %[k]\n\t"              // k
+
+        "4:\n\t"
+        // start packn16k1
+        "vle16.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse16.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 32\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 4b\n\t"
+
+        "5:\n\t"                // n8
+        "andi t0, t1, 8\n\t"  // (n % 24) & 8u
+        "beqz t0, 7f\n\t"
+
+        "vsetvli zero, t0, e16, m1\n\t"  // set vl = 8
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 16\n\t"  // src_ptr += 8
+        "mv t3, %[k]\n\t"              // k
+
+        "6:\n\t"
+        // start packn8k1
+        "vle16.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse16.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 16\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 6b\n\t"
+
+        "7:\n\t"                // n_tail
+        "andi t0, t1, 7\n\t"  // (n % 24) & 7u
+        "beqz t0, 9f\n\t"
+        "slli t4, t0, 1\n\t"  // t4 = 2 * n_tail (bytes written per row)
+
+        "vsetvli zero, t0, e16, m1\n\t"  // set vl = n_tail
+        "mv a0, %[src]\n\t"
+        "mv t3, %[k]\n\t"  // k
+
+        "8:\n\t"
+        // start packn_tailk1
+        "vle16.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse16.v v4, (%[dst])\n\t"
+        "add %[dst], %[dst], t4\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 8b\n\t"
+
+        "9:\n\t"  // ending
+
+        : [src] "+r"(src), [dst] "+r"(dst)
+
+        : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc)
+
+        : "cc", "memory", "v4", "v5", "v6", "v7", "a0", "a1", "t0", "t1", "t2", "t3", "t4");
+}
+
+/**************************************************************
+ * input-matrix: [k, n]
+ * Data arrangement: Z8 Z4 Z4_tail
+ *
+ * Columns are consumed in panels of 8, then 4, then (n & 3).
+ * Inside a panel, rows are taken four at a time and written with
+ * a byte stride of 4, so the four k-values of one column end up
+ * adjacent in dst. When k is not a multiple of 4, the last group
+ * is only partly written: the remaining bytes of each 4-byte slot
+ * are left untouched by this function.
+ * NOTE(review): ldc is never read; rows are advanced by n, so the
+ * source is presumably expected to be densely packed - confirm
+ * against callers.
+ **************************************************************/
+void shl_c908_reorder_input_z8_int8(int8_t *src, int8_t *dst, int k, int n, int ldc)
+{
+    int vl = vsetvl_e8m1(8);
+    int i = 0;
+    // 8-column panels
+    for (; i + 7 < n; i += 8) {
+        int8_t *b0 = src + i;
+        int j = 0;
+        // four rows per iteration, interleaved into 8 slots of 4 bytes
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst += 32 - 3;  // skip to the next 8 * 4 byte group (dst already advanced by 3)
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *sb0 = dst;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+            dst += 32;  // the partial group still occupies a full 8 * 4 byte slot
+        }
+    }
+    // 4-column panels
+    for (; i + 3 < n; i += 4) {
+        vl = vsetvl_e8m1(4);
+        int8_t *b0 = src + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst += 13;  // 4 * 4 - 3
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *sb0 = dst;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+            dst += 16;
+        }
+    }
+    // n_tail
+    if (i < n) {
+        vl = vsetvl_e8m1(n & 3);
+        int8_t *b0 = src + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst += 4 * vl - 3;
+        }
+        // k_tail (last panel: dst is not advanced afterwards)
+        if (j < k) {
+            int8_t *sb0 = dst;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+        }
+    }
+}
+
+/**************************************************************
+ * input-matrix: [k, n]
+ * Data arrangement: Z12 Z8 Z4 Z4_tail
+ *
+ * Same scheme as the z8 int8 routine, with an additional leading
+ * pass over 12-column panels: rows are taken four at a time and
+ * stored with a byte stride of 4 so that the four k-values of a
+ * column are adjacent in dst.
+ * NOTE(review): ldc is never read; rows are advanced by n -
+ * confirm against callers.
+ **************************************************************/
+void shl_c908_reorder_input_z12_int8(int8_t *src, int8_t *dst, int k, int n, int ldc)
+{
+    int vl = vsetvl_e8m1(12);
+    int i = 0;
+    // 12-column panels
+    for (; i + 11 < n; i += 12) {
+        int8_t *b0 = src + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst += 48 - 3;  // skip to the next 12 * 4 byte group
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *sb0 = dst;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+            dst += 48;
+        }
+    }
+    // 8-column panels
+    for (; i + 7 < n; i += 8) {
+        vl = vsetvl_e8m1(8);
+        int8_t *b0 = src + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst += 32 - 3;
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *sb0 = dst;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+            dst += 32;
+        }
+    }
+    // 4-column panels
+    for (; i + 3 < n; i += 4) {
+        vl = vsetvl_e8m1(4);
+        int8_t *b0 = src + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst += 13;  // 4 * 4 - 3
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *sb0 = dst;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+            dst += 16;
+        }
+    }
+    // n_tail
+    if (i < n) {
+        vl = vsetvl_e8m1(n & 3);
+        int8_t *b0 = src + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl);
+            dst += 4 * vl - 3;
+        }
+        // k_tail (last panel: dst is not advanced afterwards)
+        if (j < k) {
+            int8_t *sb0 = dst;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+        }
+    }
+}
+
+// vlen256
+/**************************************************************
+ * input-matrix: [k, n], consecutive rows are ldc elements apart
+ * Data arrangement: Z16 Z8 Z8_tail
+ *
+ * fp32 panels of 16 columns, then 8, then (n & 7); each panel
+ * stores its k rows back to back in dst. The vl values in the
+ * comments assume the 256-bit vector length named above.
+ **************************************************************/
+void shl_c908_reorder_input_z16_fp32_v256(float *src, float *dst, int k, int n, int ldc)
+{
+    asm volatile(
+        "li a0, 16\n\t"
+        "srai t0, %[n], 4\n\t"    // t0 = n16 (number of 16-column panels)
+        "andi t1, %[n], 15\n\t"   // t1 = n & 15
+        "slli t2, %[ldc], 2\n\t"  // t2 = ldc * 4 (line stride in bytes)
+
+        "beqz t0, 3f\n\t"                // if n16 == 0, jump to packn8
+        "vsetvli zero, a0, e32, m2\n\t"  // set vl = 16
+
+        "1:\n\t"  // n16
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 64\n\t"  // src_ptr += 16
+        "mv t3, %[k]\n\t"              // k
+
+        "2:\n\t"
+        // start packn16k1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 64\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 2b\n\t"
+
+        "addi t0, t0, -1\n\t"
+        "bnez t0, 1b\n\t"
+
+        "3:\n\t"                // n8
+        "andi t0, t1, 8\n\t"  // n & 8u
+        "beqz t0, 5f\n\t"
+
+        "vsetvli zero, t0, e32, m1\n\t"  // set vl = 8
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 32\n\t"  // src_ptr += 8
+        "mv t3, %[k]\n\t"              // k
+
+        "4:\n\t"
+        // start packn8k1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 32\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 4b\n\t"
+
+        "5:\n\t"                // n_tail
+        "andi t0, t1, 7\n\t"  // n & 7u
+        "beqz t0, 7f\n\t"
+        "slli t4, t0, 2\n\t"  // t4 = 4 * n_tail (bytes written per row)
+
+        "vsetvli zero, t0, e32, m1\n\t"  // set vl = n_tail
+        "mv a0, %[src]\n\t"
+        "mv t3, %[k]\n\t"  // k
+
+        "6:\n\t"
+        // start packn_tailk1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "add %[dst], %[dst], t4\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 6b\n\t"
+
+        "7:\n\t"  // ending
+
+        : [src] "+r"(src), [dst] "+r"(dst)
+
+        : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc)
+
+        : "cc", "memory", "v4", "v5", "a0", "t0", "t1", "t2", "t3", "t4");
+}
+
+/**************************************************************
+ * input-matrix: [k, n], consecutive rows are ldc elements apart
+ * Data arrangement: Z12 Z8 Z4 Z4_tail
+ *
+ * NOTE(review): despite the "z24 ... v256" name, the body below
+ * is the same instruction sequence as the vlen=128
+ * shl_c908_reorder_input_z12_fp32 routine (panels of 12 / 8 / 4 /
+ * tail). Confirm against the GEMM kernel that consumes this
+ * layout whether 24-column panels were intended.
+ **************************************************************/
+void shl_c908_reorder_input_z24_fp32_v256(float *src, float *dst, int k, int n, int ldc)
+{
+    asm volatile(
+        "li a1, 12\n\t"
+        "divw t0, %[n], a1\n\t"   // t0 = n12 (number of 12-column panels)
+        "remw t1, %[n], a1\n\t"   // t1 = n % 12
+        "slli t2, %[ldc], 2\n\t"  // t2 = ldc * 4 (line stride in bytes)
+
+        "beqz t0, 3f\n\t"                // if n12 == 0, jump to packn8
+        "vsetvli zero, a1, e32, m4\n\t"  // set vl = 12
+
+        "1:\n\t"  // n12
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 48\n\t"  // src_ptr += 12
+        "mv t3, %[k]\n\t"              // k
+
+        "2:\n\t"
+        // start packn12k1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 48\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 2b\n\t"
+
+        "addi t0, t0, -1\n\t"
+        "bnez t0, 1b\n\t"
+
+        "3:\n\t"                // n8
+        "andi t0, t1, 8\n\t"  // (n % 12) & 8u
+        "beqz t0, 5f\n\t"
+
+        "vsetvli zero, t0, e32, m2\n\t"  // set vl = 8
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 32\n\t"  // src_ptr += 8
+        "mv t3, %[k]\n\t"              // k
+
+        "4:\n\t"
+        // start packn8k1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 32\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 4b\n\t"
+
+        "5:\n\t"                // n4
+        "andi t0, t1, 4\n\t"  // (n % 12) & 4u
+        "beqz t0, 7f\n\t"
+
+        "vsetvli zero, t0, e32, m1\n\t"  // set vl = 4
+        "mv a0, %[src]\n\t"
+        "addi %[src], %[src], 16\n\t"  // src_ptr += 4
+        "mv t3, %[k]\n\t"              // k
+
+        "6:\n\t"
+        // start packn4k1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "addi %[dst], %[dst], 16\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 6b\n\t"
+
+        "7:\n\t"                // n_tail
+        "andi t0, t1, 3\n\t"  // (n % 12) & 3u
+        "beqz t0, 9f\n\t"
+        "slli t4, t0, 2\n\t"  // t4 = 4 * n_tail (bytes written per row)
+
+        "vsetvli zero, t0, e32, m1\n\t"  // set vl = n_tail
+        "mv a0, %[src]\n\t"
+        "mv t3, %[k]\n\t"  // k
+
+        "8:\n\t"
+        // start packn_tailk1
+        "vle32.v v4, (a0)\n\t"
+        "add a0, a0, t2\n\t"
+        "vse32.v v4, (%[dst])\n\t"
+        "add %[dst], %[dst], t4\n\t"
+
+        "addi t3, t3, -1\n\t"
+        "bnez t3, 8b\n\t"
+
+        "9:\n\t"  // ending
+
+        : [src] "+r"(src), [dst] "+r"(dst)
+
+        : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc)
+
+        : "cc", "memory", "v4", "v5", "v6", "v7", "a0", "a1", "t0", "t1", "t2", "t3", "t4");
+}
+
+/**************************************************************
+ * 
input—matrix: [k, n] + * Data arrangement: Z32 Z16 Z16_tail + **************************************************************/ +void shl_c908_reorder_input_z32_fp16_v256(__fp16 *src, __fp16 *dst, int k, int n, int ldc) +{ + asm volatile( + "li a0, 32\n\t" + "srai t0, %[n], 5\n\t" // t0 = n32 + "andi t1, %[n], 31\n\t" // t1 = n & 31 + "slli t2, %[ldc], 1\n\t" // t2 = ldc * 2 (line stride) + + "beqz t0, 3f\n\t" // if n32 == 0, jump to packn16 + "vsetvli zero, a0, e16, m2\n\t" // set vl = 32 + + "1:\n\t" // n32 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 64\n\t" // src_ptr += 32 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn32k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 64\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n16 + "andi t0, t1, 16\n\t" // n & 16u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e16, m1\n\t" // set vl = 16 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 16 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn16k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n_tail + "andi t0, t1, 15\n\t" // n & 15u + "beqz t0, 7f\n\t" + "slli t4, t0, 1\n\t" // t4 = 2 * n_tail + + "vsetvli zero, t0, e16, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn_tailk1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "a0", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z24 Z16 Z8 Z8_tail + 
**************************************************************/ +void shl_c908_reorder_input_z48_fp16_v256(__fp16 *src, __fp16 *dst, int k, int n, int ldc) +{ + asm volatile( + "li a1, 24\n\t" + "divw t0, %[n], a1\n\t" // t0 = n24 + "remw t1, %[n], a1\n\t" // t1 = n % 24 + "slli t2, %[ldc], 1\n\t" // t2 = ldc * 2 (line stride) + + "beqz t0, 3f\n\t" // if n24 == 0, jump to packn16 + "vsetvli zero, a1, e16, m4\n\t" // set vl = 24 + + "1:\n\t" // n24 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 48\n\t" // src_ptr += 24 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn24k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 48\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n16 + "andi t0, t1, 16\n\t" // n & 16u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e16, m2\n\t" // set vl = 16 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 16 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn16k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n8 + "andi t0, t1, 8\n\t" // n & 8u + "beqz t0, 7f\n\t" + + "vsetvli zero, t0, e16, m1\n\t" // set vl = 8 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 16\n\t" // src_ptr += 8 + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn8k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 16\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // n_tail + "andi t0, t1, 7\n\t" // n & 7u + "beqz t0, 9f\n\t" + "slli t4, t0, 1\n\t" // t4 = 2 * n_tail + + "vsetvli zero, t0, e16, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "8:\n\t" + // start packn_tailk1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez 
t3, 8b\n\t" + + "9:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "v6", "v7", "a0", "a1", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z16 Z8 Z8_tail + **************************************************************/ +void shl_c908_reorder_input_z16_int8_v256(int8_t *src, int8_t *dst, int k, int n, int ldc) +{ + int vl = vsetvl_e8m1(16); + int i = 0; + for (; i + 15 < n; i += 16) { + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 64 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + dst += 64; + } + } + for (; i + 7 < n; i += 8) { + vl = vsetvl_e8m1(8); + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 32 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + 
sb0++; + } + dst += 32; + } + } + // n_tail + if (i < n) { + vl = vsetvl_e8m1(n & 7); + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 4 * vl - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + } + } +} diff --git a/source/c908_opt/setup.c b/source/c908_opt/setup.c new file mode 100644 index 00000000..d60dc3e6 --- /dev/null +++ b/source/c908_opt/setup.c @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +#define C908_OP_PATTERN_MAX 60 +static struct csinn_callback __c908_cb_table[C908_OP_PATTERN_MAX]; +static int __c908_cb_key[C908_OP_PATTERN_MAX]; + +void shl_c908_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, + void *exec, void *est) +{ + static int i = 0; + __c908_cb_key[i] = op_name * CSINN_DTYPE_SIZE + dtype; + __c908_cb_table[i].init = init; + __c908_cb_table[i].exec = exec; + __c908_cb_table[i].est = est; + i++; +} + +struct csinn_callback *shl_cb_map_rvv(int op, int dtype); +struct csinn_callback *shl_cb_map_c908(int op, int dtype) +{ + struct csinn_callback *cb = NULL; + for (int i = 0; i < C908_OP_PATTERN_MAX; i++) { + if (__c908_cb_key[i] == (op * CSINN_DTYPE_SIZE + dtype)) { + cb = &__c908_cb_table[i]; + break; + } + } + if ((cb == NULL) || (cb->est == NULL && (cb->init == NULL || cb->exec == NULL))) { + cb = shl_cb_map_rvv(op, dtype); + } + return cb; +} + +void shl_target_init_c908() +{ + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_c908_conv2d_init_fp32, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_c908_conv2d_init_fp16, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D, shl_c908_conv2d_init_int8, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D, shl_c908_conv2d_init_int4, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c908_conv2d_init_fp32, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_c908_conv2d_init_fp16, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GROUP_CONV2D, shl_c908_conv2d_init_int8, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_GROUP_CONV2D, shl_c908_conv2d_init_int4, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, + shl_c908_depthwise_conv2d_init_fp32, 
NULL, shl_gref_depthwise_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, + shl_c908_depthwise_conv2d_init_fp16, NULL, shl_gref_depthwise_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D, + shl_c908_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D, + shl_c908_depthwise_conv2d_init_int4, NULL, shl_gref_depthwise_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_c908_maxpool2d_init_fp32, NULL, + shl_gref_maxpool2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_c908_maxpool2d_init_fp16, NULL, + shl_gref_maxpool2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MAXPOOL2D, shl_c908_maxpool2d_init_int8, NULL, + shl_gref_maxpool2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_MAXPOOL2D, shl_c908_maxpool2d_init_int4, NULL, + shl_gref_maxpool2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_c908_avgpool2d_init_fp32, NULL, + shl_gref_avgpool2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, shl_c908_avgpool2d_init_fp16, NULL, + shl_gref_avgpool2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_AVGPOOL2D, shl_c908_avgpool2d_init_int8, NULL, + shl_gref_avgpool2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_AVGPOOL2D, shl_c908_avgpool2d_init_int4, NULL, + shl_gref_avgpool2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_c908_fullyconnected_init, + NULL, shl_gref_fullyconnected); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_c908_fullyconnected_init, + NULL, shl_gref_fullyconnected); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_FULLYCONNECTED, shl_c908_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_FULLYCONNECTED, shl_c908_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DATA_CONVERT, shl_rvv_data_convert_init, NULL, + 
shl_gref_data_convert); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DATA_CONVERT, shl_rvv_data_convert_init, NULL, + shl_gref_data_convert); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DATA_CONVERT, shl_rvv_data_convert_init, NULL, + shl_gref_data_convert); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DATA_CONVERT, shl_rvv_data_convert_init, NULL, + shl_gref_data_convert); + + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D_RELU, shl_c908_conv2d_init_int8, NULL, + shl_gref_conv2d_relu); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D_RELU, shl_c908_conv2d_init_int4, NULL, + shl_gref_conv2d_relu); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D_RELU, + shl_c908_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d_relu); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D_RELU, + shl_c908_depthwise_conv2d_init_int4, NULL, shl_gref_depthwise_conv2d_relu); + + shl_register_runtime_callback(CSINN_C908, NULL); + + shl_register_op_callback(CSINN_C908, shl_cb_map_c908); + shl_register_runtime_callback(CSINN_C908, shl_gref_runtime_callback); +} diff --git a/source/e804_opt/activation/csi_xt800p_nn_activations_q15.S b/source/e804_opt/activation/shl_xt800p_nn_activations_q15.S similarity index 86% rename from source/e804_opt/activation/csi_xt800p_nn_activations_q15.S rename to source/e804_opt/activation/shl_xt800p_nn_activations_q15.S index 38c5caa7..a1f21529 100644 --- a/source/e804_opt/activation/csi_xt800p_nn_activations_q15.S +++ b/source/e804_opt/activation/shl_xt800p_nn_activations_q15.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file csi_xt800p_nn_activations_q15.S + * @file shl_xt800p_nn_activations_q15.S * @brief Q15 neural network activation function using direct table look-up. * @version V1.0 * @date 01. 
June 2018 @@ -26,19 +26,19 @@ .import tanhTable_q15 /* - *void csi_xt800p_nn_activations_direct_q15(q15_t * data, + *void shl_xt800p_nn_activations_direct_q15(q15_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800p_nn_activation_type type) + * shl_xt800p_nn_activation_type type) */ - .file "csi_xt800p_nn_activations_q15.S" - .section .text.csi_xt800p_nn_activations_direct_q15,"ax",@progbits + .file "shl_xt800p_nn_activations_q15.S" + .section .text.shl_xt800p_nn_activations_direct_q15,"ax",@progbits .align 2 - .global csi_xt800p_nn_activations_direct_q15 - .type csi_xt800p_nn_activations_direct_q15, @function + .global shl_xt800p_nn_activations_direct_q15 + .type shl_xt800p_nn_activations_direct_q15, @function -csi_xt800p_nn_activations_direct_q15: +shl_xt800p_nn_activations_direct_q15: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 lrw l0, sigmoidTable_q15 lrw l1, tanhTable_q15 @@ -138,8 +138,6 @@ csi_xt800p_nn_activations_direct_q15: .L3: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 - .size csi_xt800p_nn_activations_direct_q15, .-csi_xt800p_nn_activations_direct_q15 -.weak csi_nn_activations_direct_q15 -.set csi_nn_activations_direct_q15, csi_xt800p_nn_activations_direct_q15 + .size shl_xt800p_nn_activations_direct_q15, .-shl_xt800p_nn_activations_direct_q15 .weak csky_dsp2_nn_activations_direct_q15 -.set csky_dsp2_nn_activations_direct_q15, csi_xt800p_nn_activations_direct_q15 +.set csky_dsp2_nn_activations_direct_q15, shl_xt800p_nn_activations_direct_q15 diff --git a/source/e804_opt/activation/csi_xt800p_nn_activations_q7.S b/source/e804_opt/activation/shl_xt800p_nn_activations_q7.S similarity index 82% rename from source/e804_opt/activation/csi_xt800p_nn_activations_q7.S rename to source/e804_opt/activation/shl_xt800p_nn_activations_q7.S index 1522096e..6f99f556 100644 --- a/source/e804_opt/activation/csi_xt800p_nn_activations_q7.S +++ b/source/e804_opt/activation/shl_xt800p_nn_activations_q7.S @@ -17,7 +17,7 @@ */ 
/****************************************************************************** - * @file csi_xt800p_nn_activations_q7.S + * @file shl_xt800p_nn_activations_q7.S * @brief Q7 neural network activation function using direct table look-up. * @version V1.0 * @date 05. June 2018 @@ -26,19 +26,19 @@ .import sigmoidTable_q7 .import tanhTable_q7 /* - *void csi_xt800p_nn_activations_direct_q7(q7_t * data, + *void shl_xt800p_nn_activations_direct_q7(q7_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800p_nn_activation_type type) + * shl_xt800p_nn_activation_type type) */ - .file "csi_xt800p_nn_activations_q7.S" - .section .text.csi_xt800p_nn_activations_direct_q7,"ax",@progbits + .file "shl_xt800p_nn_activations_q7.S" + .section .text.shl_xt800p_nn_activations_direct_q7,"ax",@progbits .align 2 - .global csi_xt800p_nn_activations_direct_q7 - .type csi_xt800p_nn_activations_direct_q7, @function + .global shl_xt800p_nn_activations_direct_q7 + .type shl_xt800p_nn_activations_direct_q7, @function -csi_xt800p_nn_activations_direct_q7: +shl_xt800p_nn_activations_direct_q7: push l0, l1, l2, l3, l4, l5, l6, l7 movi l0, 3 // shift_size = 3 - int_width subu t2, l0, a2 @@ -106,8 +106,6 @@ csi_xt800p_nn_activations_direct_q7: .L3: pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_xt800p_nn_activations_direct_q7, .-csi_xt800p_nn_activations_direct_q7 -.weak csi_nn_activations_direct_q7 -.set csi_nn_activations_direct_q7, csi_xt800p_nn_activations_direct_q7 + .size shl_xt800p_nn_activations_direct_q7, .-shl_xt800p_nn_activations_direct_q7 .weak csky_dsp2_nn_activations_direct_q7 -.set csky_dsp2_nn_activations_direct_q7, csi_xt800p_nn_activations_direct_q7 +.set csky_dsp2_nn_activations_direct_q7, shl_xt800p_nn_activations_direct_q7 diff --git a/source/e804_opt/activation/csi_xt800p_relu_q15.S b/source/e804_opt/activation/shl_xt800p_relu_q15.S similarity index 78% rename from source/e804_opt/activation/csi_xt800p_relu_q15.S rename to source/e804_opt/activation/shl_xt800p_relu_q15.S 
index cd1b07d0..995b64a3 100644 --- a/source/e804_opt/activation/csi_xt800p_relu_q15.S +++ b/source/e804_opt/activation/shl_xt800p_relu_q15.S @@ -17,24 +17,24 @@ */ /****************************************************************************** - * @file csi_xt800p_relu_q15.S + * @file shl_xt800p_relu_q15.S * @brief Q15 version of ReLU. * @version V1.0 * @date 01. June 2018 ******************************************************************************/ /* - *void csi_xt800p_relu_q15(q15_t * data, + *void shl_xt800p_relu_q15(q15_t * data, * uint16_t size) */ - .file "csi_xt800p_relu_q15.S" - .section .text.csi_xt800p_relu_q15,"ax",@progbits + .file "shl_xt800p_relu_q15.S" + .section .text.shl_xt800p_relu_q15,"ax",@progbits .align 2 - .global csi_xt800p_relu_q15 - .type csi_xt800p_relu_q15, @function + .global shl_xt800p_relu_q15 + .type shl_xt800p_relu_q15, @function -csi_xt800p_relu_q15: +shl_xt800p_relu_q15: movi t9, 0 mov t8, a0 lsri t7, a1, 3 @@ -69,8 +69,6 @@ csi_xt800p_relu_q15: .L3: rts - .size csi_xt800p_relu_q15, .-csi_xt800p_relu_q15 -.weak csi_relu_q15 -.set csi_relu_q15, csi_xt800p_relu_q15 + .size shl_xt800p_relu_q15, .-shl_xt800p_relu_q15 .weak csky_dsp2_relu_q15 -.set csky_dsp2_relu_q15, csi_xt800p_relu_q15 +.set csky_dsp2_relu_q15, shl_xt800p_relu_q15 diff --git a/source/e804_opt/activation/csi_xt800p_relu_q7.S b/source/e804_opt/activation/shl_xt800p_relu_q7.S similarity index 81% rename from source/e804_opt/activation/csi_xt800p_relu_q7.S rename to source/e804_opt/activation/shl_xt800p_relu_q7.S index c597b7f1..84d5c5c6 100644 --- a/source/e804_opt/activation/csi_xt800p_relu_q7.S +++ b/source/e804_opt/activation/shl_xt800p_relu_q7.S @@ -17,24 +17,24 @@ */ /****************************************************************************** - * @file csi_xt800p_relu_q7.S + * @file shl_xt800p_relu_q7.S * @brief Q15 version of ReLU. * @version V1.0 * @date 01. 
June 2018 ******************************************************************************/ /* - *void csi_xt800p_relu_q7(q7_t * data, + *void shl_xt800p_relu_q7(q7_t * data, * uint8_t size) */ - .file "csi_xt800p_relu_q7.S" - .section .text.csi_xt800p_relu_q7,"ax",@progbits + .file "shl_xt800p_relu_q7.S" + .section .text.shl_xt800p_relu_q7,"ax",@progbits .align 2 - .global csi_xt800p_relu_q7 - .type csi_xt800p_relu_q7, @function + .global shl_xt800p_relu_q7 + .type shl_xt800p_relu_q7, @function -csi_xt800p_relu_q7: +shl_xt800p_relu_q7: movi t9, 0 mov t8, a0 lsri t7, a1, 4 @@ -81,8 +81,6 @@ csi_xt800p_relu_q7: .L5: rts - .size csi_xt800p_relu_q7, .-csi_xt800p_relu_q7 -.weak csi_relu_q7 -.set csi_relu_q7, csi_xt800p_relu_q7 + .size shl_xt800p_relu_q7, .-shl_xt800p_relu_q7 .weak csky_dsp2_relu_q7 -.set csky_dsp2_relu_q7, csi_xt800p_relu_q7 +.set csky_dsp2_relu_q7, shl_xt800p_relu_q7 diff --git a/source/e804_opt/avgpool.c b/source/e804_opt/avgpool.c index 8b6b7793..7c853cbe 100644 --- a/source/e804_opt/avgpool.c +++ b/source/e804_opt/avgpool.c @@ -16,39 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -static int csi_e804_avgpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_e804_avgpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - q7_t *input_data = (q7_t *)input->data; + q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. 
out_hw = output->dim[2] q7_t buffer_tmp[out_hw * out_hw * in_c]; // buffer_size = out_h * out_w * channel - csky_dsp2_avepool_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, + csky_dsp2_avepool_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, params->stride_height, out_hw, buffer_tmp, output_data); return CSINN_TRUE; } -int csi_e804_avgpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_e804_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -61,10 +60,12 @@ int csi_e804_avgpool2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("avgpool q7 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); - params->base.bc = csi_ref_avgpool2d_quant; + shl_debug_warning( + "avgpool q7 is not optimized to achieve under this condition on e804, call reference " + "func replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; } else { - params->base.bc = csi_e804_avgpool2d_q7; + cb->exec = shl_e804_avgpool2d_q7; } return CSINN_TRUE; } diff --git a/source/e804_opt/convolution.c b/source/e804_opt/convolution.c index bcb28b0d..0f7b969e 100644 --- a/source/e804_opt/convolution.c +++ b/source/e804_opt/convolution.c @@ -16,23 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -static int csi_e804_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_e804_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; - uint16_t batch = input->dim[0]; // batch = 1 + uint16_t batch = input->dim[0]; // batch = 1 uint16_t in_h = input->dim[1]; uint16_t in_w = input->dim[2]; uint16_t in_c = input->dim[3]; @@ -51,105 +49,105 @@ static int csi_e804_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; uint16_t pad_y = params->pad_top; - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_c % 4 == 0) && (out_c % 2 == 0) ) { - if ( (kernel_h == 1) && (kernel_w == 1) ) { + if ((in_c % 4 == 0) && (out_c % 2 == 0)) { + if ((kernel_h == 1) && (kernel_w == 1)) { csky_dsp2_convolve_1x1_HWC_q7_fast(input_data, in_w, in_h, in_c, kernel_data, out_c, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } else { csky_dsp2_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, 
output->qinfo->shift, - output_data, out_h, buffer_tmp); + pad_y, stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } } else if (in_c == 3) { - csky_dsp2_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, + csky_dsp2_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, pad_y, + stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, out_h, buffer_tmp); } else { - csky_dsp2_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + csky_dsp2_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, pad_y, + stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } return CSINN_TRUE; } -static int csi_e804_conv2d_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_e804_conv2d_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q15_t *input_data = (q15_t *)input->data; - q15_t *kernel_data = (q15_t *)kernel->data; - q15_t *bias_data = (q15_t *)bias->data; - q15_t *output_data = (q15_t *)output->data; + q15_t *input_data = (q15_t *)input->data; + q15_t *kernel_data = (q15_t *)kernel->data; + q15_t *bias_data = (q15_t *)bias->data; + q15_t *output_data = (q15_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. 
out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] uint16_t out_c = output->dim[3]; - uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; - uint16_t stride = params->stride_height; // e.g. stride = params->stride_width - uint16_t padding = params->pad_top; // e.g. padding = params->down = params->left = params->right + uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; + uint16_t stride = params->stride_height; // e.g. stride = params->stride_width + uint16_t padding = + params->pad_top; // e.g. padding = params->down = params->left = params->right - q15_t buffer_tmp[in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csky_dsp2_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, - kernel_size, padding, stride, bias_data, bias->qinfo->shift, - output->qinfo->shift, output_data, out_hw, buffer_tmp); + csky_dsp2_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, + padding, stride, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_hw, buffer_tmp); return CSINN_TRUE; } -static int csi_e804_depthwise_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_e804_depthwise_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; 
uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] uint16_t out_c = output->dim[3]; - uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; - uint16_t stride = params->stride_height; // e.g. stride = params->stride_width - uint16_t padding = params->pad_top; // e.g. padding = params->down = params->left = params->right + uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; + uint16_t stride = params->stride_height; // e.g. stride = params->stride_width + uint16_t padding = + params->pad_top; // e.g. padding = params->down = params->left = params->right - q15_t buffer_tmp[2 * in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csky_dsp2_depthwise_separable_conv_HWC_q7(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, - padding, stride, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_hw, buffer_tmp); + csky_dsp2_depthwise_separable_conv_HWC_q7( + input_data, in_hw, in_c, kernel_data, out_c, kernel_size, padding, stride, bias_data, + bias->qinfo->shift, output->qinfo->shift, output_data, out_hw, buffer_tmp); return CSINN_TRUE; } -int csi_e804_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_e804_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; 
- if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } - if ( (input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || - (params->pad_left != params->pad_top) || (params->stride_height != params->stride_width) ) { - if ( (input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0) ) { + if ((input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || + (params->pad_left != params->pad_top) || (params->stride_height != params->stride_width)) { + if ((input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0)) { flag |= 0x02; } else { if (kernel->dim[2] != 1 || kernel->dim[3] != 1) { @@ -158,27 +156,28 @@ int csi_e804_conv2d_init_q7(struct csi_tensor *input, } } if (flag > 0) { - csi_debug_warning("conv2d q7 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q7 is not optimized to achieve under this condition on e804, call reference " + "func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_e804_conv2d_q7; + cb->exec = shl_e804_conv2d_q7; } return CSINN_TRUE; } -int csi_e804_conv2d_init_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_e804_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if 
(input->dim[1] != input->dim[2]) { - flag |= 0x02; + flag |= 0x02; } if (kernel->dim[2] != kernel->dim[3]) { flag |= 0x04; @@ -187,28 +186,28 @@ int csi_e804_conv2d_init_q15(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("conv2d q15 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q15 is not optimized to achieve under this condition on e804, call reference " + "func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_e804_conv2d_q15; + cb->exec = shl_e804_conv2d_q15; } return CSINN_TRUE; } -int csi_e804_depthwise_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_e804_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { - flag |= 0x02; + flag |= 0x02; } if (kernel->dim[2] != kernel->dim[3]) { flag |= 0x04; @@ -217,11 +216,13 @@ int csi_e804_depthwise_conv2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - params->base.bc = csi_ref_depthwise_conv2d_quant; - csi_debug_warning("depthwise_conv2d q7 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); + cb->exec = shl_ref_depthwise_conv2d_quant; + shl_debug_warning( + "depthwise_conv2d q7 is not optimized to achieve under this condition on e804, call " + "reference 
func replaced.\n"); } else { - params->base.bc = csi_e804_depthwise_conv2d_q7; + cb->exec = shl_e804_depthwise_conv2d_q7; } return CSINN_TRUE; } diff --git a/source/e804_opt/convolution/csi_xt800p_convolve_1x1_HWC_q7_fast.S b/source/e804_opt/convolution/shl_xt800p_convolve_1x1_HWC_q7_fast.S similarity index 93% rename from source/e804_opt/convolution/csi_xt800p_convolve_1x1_HWC_q7_fast.S rename to source/e804_opt/convolution/shl_xt800p_convolve_1x1_HWC_q7_fast.S index 6b28899e..64c90270 100644 --- a/source/e804_opt/convolution/csi_xt800p_convolve_1x1_HWC_q7_fast.S +++ b/source/e804_opt/convolution/shl_xt800p_convolve_1x1_HWC_q7_fast.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800p_convolve_1x1_HWC_q7_fast.S + * @file shl_xt800p_convolve_1x1_HWC_q7_fast.S * @brief Fast Q7 vresion of 1x1 convolution (non-square shape). * @version V1.0 * @date 05. June 2018 ******************************************************************************/ /* - * void csi_xt800p_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, + * void shl_xt800p_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, * const uint16_t dim_im_in_x, * const uint16_t dim_im_in_y, * const uint16_t ch_im_in, @@ -40,13 +40,13 @@ * */ - .file "csi_xt800p_convolve_1x1_HWC_q7_fast.S" - .section .text.csi_xt800p_convolve_HWC_q7_fast,"ax",@progbits + .file "shl_xt800p_convolve_1x1_HWC_q7_fast.S" + .section .text.shl_xt800p_convolve_HWC_q7_fast,"ax",@progbits .align 2 - .global csi_xt800p_convolve_1x1_HWC_q7_fast - .type csi_xt800p_convolve_1x1_HWC_q7_fast, @function + .global shl_xt800p_convolve_1x1_HWC_q7_fast + .type shl_xt800p_convolve_1x1_HWC_q7_fast, @function -csi_xt800p_convolve_1x1_HWC_q7_fast: +shl_xt800p_convolve_1x1_HWC_q7_fast: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 8 st.w a0, (sp, 0x0) @@ -301,9 +301,7 @@ csi_xt800p_convolve_1x1_HWC_q7_fast: .L23: addi sp, sp, 8 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size 
csi_xt800p_convolve_1x1_HWC_q7_fast, .-csi_xt800p_convolve_1x1_HWC_q7_fast + .size shl_xt800p_convolve_1x1_HWC_q7_fast, .-shl_xt800p_convolve_1x1_HWC_q7_fast -.weak csi_convolve_1x1_HWC_q7_fast -.set csi_convolve_1x1_HWC_q7_fast, csi_xt800p_convolve_1x1_HWC_q7_fast .weak csky_dsp2_convolve_1x1_HWC_q7_fast -.set csky_dsp2_convolve_1x1_HWC_q7_fast, csi_xt800p_convolve_1x1_HWC_q7_fast +.set csky_dsp2_convolve_1x1_HWC_q7_fast, shl_xt800p_convolve_1x1_HWC_q7_fast diff --git a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q15_basic.S b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q15_basic.S similarity index 94% rename from source/e804_opt/convolution/csi_xt800p_convolve_HWC_q15_basic.S rename to source/e804_opt/convolution/shl_xt800p_convolve_HWC_q15_basic.S index a2477870..50006d0e 100644 --- a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q15_basic.S +++ b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q15_basic.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_convolve_HWC_q15_basic.S + * @file shl_xt800p_convolve_HWC_q15_basic.S * @brief Q7 vresion of convolution. * @version V1.0 * @date 19. 
Mar 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_convolve_HWC_q15_basic(const q15_t * Im_in, + * shl_xt800p_status + * shl_xt800p_convolve_HWC_q15_basic(const q15_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q15_t * wt, @@ -41,13 +41,13 @@ * q15_t * bufferA) */ - .file "csi_xt800p_convolve_HWC_q15_basic.S" - .section .text.csi_xt800p_convolve_HWC_q15_basic,"ax",@progbits + .file "shl_xt800p_convolve_HWC_q15_basic.S" + .section .text.shl_xt800p_convolve_HWC_q15_basic,"ax",@progbits .align 2 - .global csi_xt800p_convolve_HWC_q15_basic - .type csi_xt800p_convolve_HWC_q15_basic, @function + .global shl_xt800p_convolve_HWC_q15_basic + .type shl_xt800p_convolve_HWC_q15_basic, @function -csi_xt800p_convolve_HWC_q15_basic: +shl_xt800p_convolve_HWC_q15_basic: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 12 st.w a0, (sp) @@ -336,9 +336,7 @@ csi_xt800p_convolve_HWC_q15_basic: .L22: addi sp, sp, 12 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_convolve_HWC_q15_basic, .-csi_xt800p_convolve_HWC_q15_basic + .size shl_xt800p_convolve_HWC_q15_basic, .-shl_xt800p_convolve_HWC_q15_basic -.weak csi_convolve_HWC_q15_basic -.set csi_convolve_HWC_q15_basic, csi_xt800p_convolve_HWC_q15_basic .weak csky_dsp2_convolve_HWC_q15_basic -.set csky_dsp2_convolve_HWC_q15_basic, csi_xt800p_convolve_HWC_q15_basic +.set csky_dsp2_convolve_HWC_q15_basic, shl_xt800p_convolve_HWC_q15_basic diff --git a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_RGB.S b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_RGB.S similarity index 94% rename from source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_RGB.S rename to source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_RGB.S index f0df9751..c68988ca 100644 --- a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_RGB.S +++ b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_RGB.S @@ -17,15 
+17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_convolve_HWC_q7_RGB.S + * @file shl_xt800p_convolve_HWC_q7_RGB.S * @brief Q7 vresion of convolution. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_convolve_HWC_q7_RGB(const q7_t * Im_in, + * shl_xt800p_status + * shl_xt800p_convolve_HWC_q7_RGB(const q7_t * Im_in, * const uint16_t dim_im_in, * const q7_t * wt, * const uint16_t ch_im_out, @@ -40,13 +40,13 @@ * q15_t * bufferA) */ - .file "csi_xt800p_convolve_HWC_q7_RGB.S" - .section .text.csi_xt800p_convolve_HWC_q7_RGB,"ax",@progbits + .file "shl_xt800p_convolve_HWC_q7_RGB.S" + .section .text.shl_xt800p_convolve_HWC_q7_RGB,"ax",@progbits .align 2 - .global csi_xt800p_convolve_HWC_q7_RGB - .type csi_xt800p_convolve_HWC_q7_RGB, @function + .global shl_xt800p_convolve_HWC_q7_RGB + .type shl_xt800p_convolve_HWC_q7_RGB, @function -csi_xt800p_convolve_HWC_q7_RGB: +shl_xt800p_convolve_HWC_q7_RGB: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 12 st.w a0, (sp) @@ -349,9 +349,7 @@ csi_xt800p_convolve_HWC_q7_RGB: .L22: addi sp, sp, 12 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_convolve_HWC_q7_RGB, .-csi_xt800p_convolve_HWC_q7_RGB + .size shl_xt800p_convolve_HWC_q7_RGB, .-shl_xt800p_convolve_HWC_q7_RGB -.weak csi_convolve_HWC_q7_RGB -.set csi_convolve_HWC_q7_RGB, csi_xt800p_convolve_HWC_q7_RGB .weak csky_dsp2_convolve_HWC_q7_RGB -.set csky_dsp2_convolve_HWC_q7_RGB, csi_xt800p_convolve_HWC_q7_RGB +.set csky_dsp2_convolve_HWC_q7_RGB, shl_xt800p_convolve_HWC_q7_RGB diff --git a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_basic.S b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_basic.S similarity index 94% rename from source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_basic.S rename to source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_basic.S 
index a26d2e56..ce205702 100644 --- a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_basic.S +++ b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_basic.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_convolve_HWC_q7_basic.S + * @file shl_xt800p_convolve_HWC_q7_basic.S * @brief Q7 vresion of convolution. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_convolve_HWC_q7_basic(const q7_t * Im_in, + * shl_xt800p_status + * shl_xt800p_convolve_HWC_q7_basic(const q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q7_t * wt, @@ -41,13 +41,13 @@ * q15_t * bufferA) */ - .file "csi_xt800p_convolve_HWC_q7_basic.S" - .section .text.csi_xt800p_convolve_HWC_q7_basic,"ax",@progbits + .file "shl_xt800p_convolve_HWC_q7_basic.S" + .section .text.shl_xt800p_convolve_HWC_q7_basic,"ax",@progbits .align 2 - .global csi_xt800p_convolve_HWC_q7_basic - .type csi_xt800p_convolve_HWC_q7_basic, @function + .global shl_xt800p_convolve_HWC_q7_basic + .type shl_xt800p_convolve_HWC_q7_basic, @function -csi_xt800p_convolve_HWC_q7_basic: +shl_xt800p_convolve_HWC_q7_basic: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 12 st.w a0, (sp) @@ -380,9 +380,7 @@ csi_xt800p_convolve_HWC_q7_basic: .L22: addi sp, sp, 12 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_convolve_HWC_q7_basic, .-csi_xt800p_convolve_HWC_q7_basic + .size shl_xt800p_convolve_HWC_q7_basic, .-shl_xt800p_convolve_HWC_q7_basic -.weak csi_convolve_HWC_q7_basic -.set csi_convolve_HWC_q7_basic, csi_xt800p_convolve_HWC_q7_basic .weak csky_dsp2_convolve_HWC_q7_basic -.set csky_dsp2_convolve_HWC_q7_basic, csi_xt800p_convolve_HWC_q7_basic +.set csky_dsp2_convolve_HWC_q7_basic, shl_xt800p_convolve_HWC_q7_basic diff --git 
a/source/e804_opt/convolution/csi_xt800p_depthwise_separable_conv_HWC_q7.S b/source/e804_opt/convolution/shl_xt800p_depthwise_separable_conv_HWC_q7.S similarity index 92% rename from source/e804_opt/convolution/csi_xt800p_depthwise_separable_conv_HWC_q7.S rename to source/e804_opt/convolution/shl_xt800p_depthwise_separable_conv_HWC_q7.S index ca56ba09..4ff79e74 100644 --- a/source/e804_opt/convolution/csi_xt800p_depthwise_separable_conv_HWC_q7.S +++ b/source/e804_opt/convolution/shl_xt800p_depthwise_separable_conv_HWC_q7.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800p_depthwise_separable_conv_HWC_q7.S + * @file shl_xt800p_depthwise_separable_conv_HWC_q7.S * @brief Q7 depthwise separable convolution function. * @version V1.0 * @date 05. June 2018 ******************************************************************************/ /* - *csi_xt800p_status csi_xt800p_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, + *shl_xt800p_status shl_xt800p_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q7_t * wt, @@ -40,13 +40,13 @@ * q15_t * bufferA) */ - .file "csi_xt800p_depthwise_separable_conv_HWC_q7.S" - .section .text.csi_xt800p_depthwise_separatable_conv_HWC_q7,"ax",@progbits + .file "shl_xt800p_depthwise_separable_conv_HWC_q7.S" + .section .text.shl_xt800p_depthwise_separatable_conv_HWC_q7,"ax",@progbits .align 2 - .global csi_xt800p_depthwise_separable_conv_HWC_q7 - .type csi_xt800p_depthwise_separable_conv_HWC_q7, @function + .global shl_xt800p_depthwise_separable_conv_HWC_q7 + .type shl_xt800p_depthwise_separable_conv_HWC_q7, @function -csi_xt800p_depthwise_separable_conv_HWC_q7: +shl_xt800p_depthwise_separable_conv_HWC_q7: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 16 st.w a0, (sp) @@ -301,9 +301,7 @@ csi_xt800p_depthwise_separable_conv_HWC_q7: .L16: addi sp, sp, 16 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, 
l9, lr - .size csi_xt800p_depthwise_separable_conv_HWC_q7, .-csi_xt800p_depthwise_separable_conv_HWC_q7 + .size shl_xt800p_depthwise_separable_conv_HWC_q7, .-shl_xt800p_depthwise_separable_conv_HWC_q7 -.weak csi_depthwise_separable_conv_HWC_q7 -.set csi_depthwise_separable_conv_HWC_q7, csi_xt800p_depthwise_separable_conv_HWC_q7 .weak csky_dsp2_depthwise_separable_conv_HWC_q7 -.set csky_dsp2_depthwise_separable_conv_HWC_q7, csi_xt800p_depthwise_separable_conv_HWC_q7 +.set csky_dsp2_depthwise_separable_conv_HWC_q7, shl_xt800p_depthwise_separable_conv_HWC_q7 diff --git a/include/include_xt800/csky_vdsp2_nnfunctions.h b/source/e804_opt/e804_function.h similarity index 56% rename from include/include_xt800/csky_vdsp2_nnfunctions.h rename to source/e804_opt/e804_function.h index 52b2bbe9..179a4632 100644 --- a/include/include_xt800/csky_vdsp2_nnfunctions.h +++ b/source/e804_opt/e804_function.h @@ -17,19 +17,34 @@ */ /* ---------------------------------------------------------------------- - * Title: csky_vdsp2_nnfunctions.h + * Title: csky_dsp2_nnfunctions.h * Description: Public header file for CSI NN Library * * -------------------------------------------------------------------- */ -#ifndef INCLUDE_INCLUDE_XT800_CSKY_VDSP2_NNFUNCTIONS_H_ -#define INCLUDE_INCLUDE_XT800_CSKY_VDSP2_NNFUNCTIONS_H_ +#ifndef SOURCE_E804_OPT_E804_FUNCTION_H_ +#define SOURCE_E804_OPT_E804_FUNCTION_H_ #ifdef __cplusplus extern "C" { #endif -#include "csi_instance.h" +#include + +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. + */ +typedef int16_t q15_t; + +/** + * @brief 32-bit fractional data type in 1.31 format. 
+ */ +typedef int32_t q31_t; /** * @brief Struct for specifying activation function types @@ -38,7 +53,7 @@ extern "C" { typedef enum { CSKY_SIGMOID = 0, /**< Sigmoid activation function */ CSKY_TANH = 1, /**< Tanh activation function */ -} csky_vdsp2_nn_activation_type; +} csky_dsp2_nn_activation_type; /** * @brief Basic Q7 convolution function @@ -60,16 +75,44 @@ typedef enum { * */ -void csky_vdsp2_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, - const uint16_t ch_im_in, const q7_t *wt, +void csky_dsp2_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none. 
+ * + */ + +void csky_dsp2_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, - const q7_t *bias, const uint16_t bias_shift, - const uint16_t out_shift, q7_t *Im_out, + const q15_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); /** - * @brief Basic Q15 convolution function + * @brief Fast Q7 convolution function * @param[in] Im_in pointer to input tensor * @param[in] dim_im_in input tensor dimention * @param[in] ch_im_in number of input tensor channels @@ -86,15 +129,18 @@ void csky_vdsp2_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_i * @param[in,out] bufferA pointer to buffer space for input * @return none. * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 */ -void csky_vdsp2_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, - const uint16_t ch_im_in, const q15_t *wt, - const uint16_t ch_im_out, const uint16_t dim_kernel, - const uint16_t padding, const uint16_t stride, - const q15_t *bias, const uint16_t bias_shift, - const uint16_t out_shift, q15_t *Im_out, - const uint16_t dim_im_out, q15_t *bufferA); +void csky_dsp2_convolve_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); /** * @brief Fast Q7 convolution function (non-sqaure shape) @@ -125,7 +171,7 @@ void csky_vdsp2_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im * ch_im_out is multiple of 2 */ -void 
csky_vdsp2_convolve_HWC_q7_fast_nonsquare( +void csky_dsp2_convolve_HWC_q7_fast_nonsquare( const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, @@ -165,12 +211,12 @@ void csky_vdsp2_convolve_HWC_q7_fast_nonsquare( * ch_im_in is multiple of 4 * ch_im_out is multiple of 2 */ -void csky_vdsp2_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, const uint16_t ch_im_in, - const q7_t *wt, const uint16_t ch_im_out, const q7_t *bias, - const uint16_t bias_shift, const uint16_t out_shift, - q7_t *Im_out, const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, q15_t *bufferA); +void csky_dsp2_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, const uint16_t ch_im_in, + const q7_t *wt, const uint16_t ch_im_out, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t *bufferA); /** * @brief Q7 version of convolution for RGB image @@ -195,11 +241,43 @@ void csky_vdsp2_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_i * image with RGB format. 
*/ -void csky_vdsp2_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, - const uint16_t ch_im_out, const uint16_t dim_kernel, - const uint16_t padding, const uint16_t stride, const q7_t *bias, - const uint16_t bias_shift, const uint16_t out_shift, - q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); +void csky_dsp2_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Fast Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none. 
+ * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + +void csky_dsp2_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q15_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); /** * @brief Q7 depthwise separable convolution function @@ -225,13 +303,13 @@ void csky_vdsp2_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, * ch_im_out is multiple of 2 */ -void csky_vdsp2_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, - const uint16_t ch_im_in, const q7_t *wt, - const uint16_t ch_im_out, const uint16_t dim_kernel, - const uint16_t padding, const uint16_t stride, - const q7_t *bias, const uint16_t bias_shift, - const uint16_t out_shift, q7_t *Im_out, - const uint16_t dim_im_out, q15_t *bufferA); +void csky_dsp2_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); /** * @brief Q7 depthwise separable convolution function (non-square shape) @@ -261,7 +339,7 @@ void csky_vdsp2_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_ * ch_im_in is multiple of 2 * ch_im_out is multiple of 2 */ -void csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare( +void csky_dsp2_depthwise_separable_conv_HWC_q7_nonsquare( const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t 
dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, @@ -282,9 +360,29 @@ void csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare( * @return none. */ -void csky_vdsp2_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, - const uint16_t num_of_rows, const uint16_t bias_shift, - const uint16_t out_shift, const q7_t *bias, q7_t *pOut); +void csky_dsp2_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q7_t *pOut); + +/** + * @brief Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return none. 
+ * + */ + +void csky_dsp2_fully_connected_q7_opt(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q7_t *pOut, + q15_t *vec_buffer); /** * @brief Q15 basic fully-connected layer function @@ -300,9 +398,27 @@ void csky_vdsp2_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_ * */ -void csky_vdsp2_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, - const uint16_t num_of_rows, const uint16_t bias_shift, - const uint16_t out_shift, const q15_t *bias, q15_t *pOut); +void csky_dsp2_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t *bias, q15_t *pOut); + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none. 
+ * + */ + +void csky_dsp2_fully_connected_q15_opt(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t *bias, q15_t *pOut); /** * @brief Mixed Q15-Q7 fully-connected layer function @@ -318,10 +434,75 @@ void csky_vdsp2_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint * */ -void csky_vdsp2_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, - const uint16_t dim_vec, const uint16_t num_of_rows, - const uint16_t bias_shift, const uint16_t out_shift, - const q7_t *bias, q15_t *pOut); +void csky_dsp2_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, + const uint16_t dim_vec, const uint16_t num_of_rows, + const uint16_t bias_shift, const uint16_t out_shift, + const q7_t *bias, q15_t *pOut); + +/** + * @brief Mixed Q15-Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none. + * + */ + +void csky_dsp2_fully_connected_mat_q7_vec_q15_opt( + const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, + const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q15_t *pOut); + +/** + * @brief Matrix-Multiplication Kernels for Convolution + * + * These functions are used within convolution layer functions for + * matrix multiplication. + * + * The implementation is similar to CSI-DSP csky_dsp2_mat_mult functions + * with one Q7 and one Q15 operands. The Q15 operand is the im2col + * output which is always with 2 columns. 
+ * + */ + +/** + * @brief Matrix-multiplication function for convolution + * @param[in] pA pointer to operand A + * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors + * @param[in] ch_im_out numRow of A + * @param[in] numCol_A numCol of A + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias the bias + * @param[in,out] pOut pointer to output + * @return The function returns the incremented output pointer + */ + +q7_t *csky_dsp2_nn_mat_mult_kernel_q7_q15(const q7_t *pA, const q15_t *pInBuffer, + const uint16_t ch_im_out, const uint16_t numCol_A, + const uint16_t bias_shift, const uint16_t out_shift, + const q7_t *bias, q7_t *pOut); + +/** + * @brief Matrix-multiplication function for convolution with reordered columns + * @param[in] pA pointer to operand A + * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors + * @param[in] ch_im_out numRow of A + * @param[in] numCol_A numCol of A + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias the bias + * @param[in,out] pOut pointer to output + * @return The function returns the incremented output pointer + */ + +q7_t *csky_dsp2_nn_mat_mult_kernel_q7_q15_reordered( + const q7_t *pA, const q15_t *pInBuffer, const uint16_t ch_im_out, const uint16_t numCol_A, + const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q7_t *pOut); /** * @brief Q7 RELU function @@ -330,7 +511,7 @@ void csky_vdsp2_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, * @return none. */ -void csky_vdsp2_relu_q7(q7_t *data, uint16_t size); +void csky_dsp2_relu_q7(q7_t *data, uint16_t size); /** * @brief Q15 RELU function @@ -339,7 +520,7 @@ void csky_vdsp2_relu_q7(q7_t *data, uint16_t size); * @return none. 
*/ -void csky_vdsp2_relu_q15(q15_t *data, uint16_t size); +void csky_dsp2_relu_q15(q15_t *data, uint16_t size); /** * @brief Q7 neural network activation function using direct table look-up @@ -350,8 +531,8 @@ void csky_vdsp2_relu_q15(q15_t *data, uint16_t size); * @return none. */ -void csky_vdsp2_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, - csky_vdsp2_nn_activation_type type); +void csky_dsp2_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, + csky_dsp2_nn_activation_type type); /** * @brief Q15 neural network activation function using direct table look-up @@ -362,8 +543,8 @@ void csky_vdsp2_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int * @return none. */ -void csky_vdsp2_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, - csky_vdsp2_nn_activation_type type); +void csky_dsp2_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, + csky_dsp2_nn_activation_type type); /** * @brief Q7 max pooling function @@ -380,10 +561,10 @@ void csky_vdsp2_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t i * */ -void csky_vdsp2_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, - const uint16_t dim_kernel, const uint16_t padding, - const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, - q7_t *Im_out); +void csky_dsp2_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, + q7_t *Im_out); /** * @brief Q7 average pooling function @@ -400,26 +581,10 @@ void csky_vdsp2_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const ui * */ -void csky_vdsp2_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, - const uint16_t dim_kernel, const uint16_t padding, - const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, - q7_t *Im_out); - -void 
csky_vdsp2_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image - const uint16_t dim_im_in_x, // input image dimension - const uint16_t dim_im_in_y, // input image dimension - const uint16_t ch_im_in, // number of input image channels - const uint16_t dim_kernel_x, // window kernel size - const uint16_t dim_kernel_y, // window kernel size - const uint16_t padding_x, // padding sizes - const uint16_t padding_y, // padding sizes - const uint16_t stride_x, // stride - const uint16_t stride_y, // stride - const uint16_t dim_im_out_x, // output image dimension - const uint16_t dim_im_out_y, // output image dimension - q7_t *bufferA, // a buffer for local storage - q7_t *Im_out, // output feature - const uint16_t out_lshift); // output left shift (scaling) +void csky_dsp2_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, + q7_t *Im_out); /** * @brief Q7 softmax function @@ -430,7 +595,7 @@ void csky_vdsp2_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input i * */ -void csky_vdsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); +void csky_dsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); /** * @brief Q15 softmax function @@ -441,10 +606,10 @@ void csky_vdsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_o * */ -void csky_vdsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); +void csky_dsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); #ifdef __cplusplus } #endif -#endif // INCLUDE_INCLUDE_XT800_CSKY_VDSP2_NNFUNCTIONS_H_ +#endif // SOURCE_E804_OPT_E804_FUNCTION_H_ diff --git a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_mat_q7_vec_q15.S b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_mat_q7_vec_q15.S similarity index 88% rename from 
source/e804_opt/fully-connect/csi_xt800p_fully_connected_mat_q7_vec_q15.S rename to source/e804_opt/fully-connect/shl_xt800p_fully_connected_mat_q7_vec_q15.S index 0498f87c..e5b223c9 100644 --- a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_mat_q7_vec_q15.S +++ b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_mat_q7_vec_q15.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_fully_connected_mat_q7_vec_q15.S + * @file shl_xt800p_fully_connected_mat_q7_vec_q15.S * @brief Mixed Q15-Q7 fully-connected layer function. * @version V1.0 * @date 31. May 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_fully_connected_mat_q7_vec_q15(const q15_t * pV, + * shl_xt800p_status + * shl_xt800p_fully_connected_mat_q7_vec_q15(const q15_t * pV, * const q7_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q15_t * pOut) */ - .file "csi_xt800p_fully_connected_mat_q7_vec_q15.S" - .section .text.csi_xt800p_fully_connected_mat_q7_vec_q15,"ax",@progbits + .file "shl_xt800p_fully_connected_mat_q7_vec_q15.S" + .section .text.shl_xt800p_fully_connected_mat_q7_vec_q15,"ax",@progbits .align 2 - .global csi_xt800p_fully_connected_mat_q7_vec_q15 - .type csi_xt800p_fully_connected_mat_q7_vec_q15, @function + .global shl_xt800p_fully_connected_mat_q7_vec_q15 + .type shl_xt800p_fully_connected_mat_q7_vec_q15, @function -csi_xt800p_fully_connected_mat_q7_vec_q15: +shl_xt800p_fully_connected_mat_q7_vec_q15: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.h l0, (sp, 0x2c) // bias_shift ld.h l1, (sp, 0x30) // out_shift @@ -188,8 +188,7 @@ csi_xt800p_fully_connected_mat_q7_vec_q15: .L10: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_fully_connected_mat_q7_vec_q15, .-csi_xt800p_fully_connected_mat_q7_vec_q15 -.weak csi_fully_connected_mat_q7_vec_q15 -.set 
csi_fully_connected_mat_q7_vec_q15, csi_xt800p_fully_connected_mat_q7_vec_q15 + .size shl_xt800p_fully_connected_mat_q7_vec_q15, .-shl_xt800p_fully_connected_mat_q7_vec_q15 + .weak csky_dsp2_fully_connected_mat_q7_vec_q15 -.set csky_dsp2_fully_connected_mat_q7_vec_q15, csi_xt800p_fully_connected_mat_q7_vec_q15 +.set csky_dsp2_fully_connected_mat_q7_vec_q15, shl_xt800p_fully_connected_mat_q7_vec_q15 diff --git a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_q15.S b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_q15.S similarity index 90% rename from source/e804_opt/fully-connect/csi_xt800p_fully_connected_q15.S rename to source/e804_opt/fully-connect/shl_xt800p_fully_connected_q15.S index 5919bbff..0ca0a8b8 100644 --- a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_q15.S +++ b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_q15.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_fully_connected_q15.S + * @file shl_xt800p_fully_connected_q15.S * @brief Q15 basic fully-connected layer function. * @version V1.0 * @date 31. 
May 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_fully_connected_q15(const q15_t * pV, + * shl_xt800p_status + * shl_xt800p_fully_connected_q15(const q15_t * pV, * const q15_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q15_t * pOut) */ - .file "csi_xt800p_fully_connected_q15.S" - .section .text.csi_xt800p_fully_connected_q15,"ax",@progbits + .file "shl_xt800p_fully_connected_q15.S" + .section .text.shl_xt800p_fully_connected_q15,"ax",@progbits .align 2 - .global csi_xt800p_fully_connected_q15 - .type csi_xt800p_fully_connected_q15, @function + .global shl_xt800p_fully_connected_q15 + .type shl_xt800p_fully_connected_q15, @function -csi_xt800p_fully_connected_q15: +shl_xt800p_fully_connected_q15: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.h l0, (sp, 0x2c) // bias_shift ld.h l1, (sp, 0x30) // out_shift @@ -186,8 +186,7 @@ csi_xt800p_fully_connected_q15: .L10: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_fully_connected_q15, .-csi_xt800p_fully_connected_q15 -.weak csi_fully_connected_q15 -.set csi_fully_connected_q15, csi_xt800p_fully_connected_q15 + .size shl_xt800p_fully_connected_q15, .-shl_xt800p_fully_connected_q15 + .weak csky_dsp2_fully_connected_q15 -.set csky_dsp2_fully_connected_q15, csi_xt800p_fully_connected_q15 +.set csky_dsp2_fully_connected_q15, shl_xt800p_fully_connected_q15 diff --git a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_q7.S b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_q7.S similarity index 90% rename from source/e804_opt/fully-connect/csi_xt800p_fully_connected_q7.S rename to source/e804_opt/fully-connect/shl_xt800p_fully_connected_q7.S index 8c755e31..8e4a6f93 100644 --- a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_q7.S +++ b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_q7.S @@ -17,15 +17,15 @@ */ 
/****************************************************************************** - * @file csi_xt800p_fully_connected_q7.S + * @file shl_xt800p_fully_connected_q7.S * @brief Q7 basic fully-connected layer function. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_fully_connected_q7(const q7_t * pV, + * shl_xt800p_status + * shl_xt800p_fully_connected_q7(const q7_t * pV, * const q7_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q7_t * pOut) */ - .file "csi_xt800p_fully_connected_q7.S" - .section .text.csi_xt800p_fully_connected_q7,"ax",@progbits + .file "shl_xt800p_fully_connected_q7.S" + .section .text.shl_xt800p_fully_connected_q7,"ax",@progbits .align 2 - .global csi_xt800p_fully_connected_q7 - .type csi_xt800p_fully_connected_q7, @function + .global shl_xt800p_fully_connected_q7 + .type shl_xt800p_fully_connected_q7, @function -csi_xt800p_fully_connected_q7: +shl_xt800p_fully_connected_q7: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.h l0, (sp, 0x2c) // bias_shift ld.h l1, (sp, 0x30) // out_shift @@ -187,8 +187,7 @@ csi_xt800p_fully_connected_q7: .L10: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_fully_connected_q7, .-csi_xt800p_fully_connected_q7 -.weak csi_fully_connected_q7 -.set csi_fully_connected_q7, csi_xt800p_fully_connected_q7 + .size shl_xt800p_fully_connected_q7, .-shl_xt800p_fully_connected_q7 + .weak csky_dsp2_fully_connected_q7 -.set csky_dsp2_fully_connected_q7, csi_xt800p_fully_connected_q7 +.set csky_dsp2_fully_connected_q7, shl_xt800p_fully_connected_q7 diff --git a/source/e804_opt/fullyconnected.c b/source/e804_opt/fullyconnected.c index 46d0228f..b0cdcd76 100644 --- a/source/e804_opt/fullyconnected.c +++ b/source/e804_opt/fullyconnected.c @@ -16,16 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_fullyconnected_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_e804_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *weight_data = (q7_t *)weights->data; @@ -37,11 +35,9 @@ int csi_e804_fullyconnected_q7(struct csi_tensor *input, return CSINN_TRUE; } -int csi_e804_fullyconnected_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_e804_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *weight_data = (q15_t *)weights->data; @@ -52,4 +48,3 @@ int csi_e804_fullyconnected_q15(struct csi_tensor *input, bias->qinfo->shift, output->qinfo->shift, bias_data, output_data); return CSINN_TRUE; } - \ No newline at end of file diff --git a/source/e804_opt/maxpool.c b/source/e804_opt/maxpool.c index 8aaba6f1..658022ff 100644 --- a/source/e804_opt/maxpool.c +++ b/source/e804_opt/maxpool.c @@ -16,39 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -static int csi_e804_maxpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_e804_maxpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - q7_t *input_data = (q7_t *)input->data; + q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] q7_t buffer_tmp[out_hw * out_hw * in_c]; // buffer_size = out_h * out_w * channel - csky_dsp2_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, - params->stride_height, out_hw, buffer_tmp, output_data); + csky_dsp2_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, + params->stride_height, out_hw, buffer_tmp, output_data); return CSINN_TRUE; } -int csi_e804_maxpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_e804_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -61,10 +60,12 @@ int csi_e804_maxpool2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - 
csi_debug_warning("maxpool q7 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); - params->base.bc = csi_ref_maxpool2d_quant; + shl_debug_warning( + "maxpool q7 is not optimized to achieve under this condition on e804, call reference " + "func replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; } else { - params->base.bc = csi_e804_maxpool2d_q7; + cb->exec = shl_e804_maxpool2d_q7; } return CSINN_TRUE; } \ No newline at end of file diff --git a/source/e804_opt/nn-support/csi_xt800p_nntables.c b/source/e804_opt/nn-support/csi_xt800p_nntables.c deleted file mode 100644 index b25db41f..00000000 --- a/source/e804_opt/nn-support/csi_xt800p_nntables.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csky_vdsp2_nntables.c - * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ -#include "csi_instance.h" -/** - * @brief tables for various activation functions - * - * This file include the declaration of common tables. 
- * Most of them are used for activation functions - * - * Assumption: - * Unified table: input is 3.x format, i.e, range of [-8, 8) - * sigmoid(8) = 0.9996646498695336 - * tanh(8) = 0.9999997749296758 - * The accuracy here should be good enough - * - * 2-stage HL table: - * - * The entire input range is divided into two parts: - * - * Low range table: 0x000x xxxx or 0x111x xxxx - * table entry will be the binary number excluding the first - * two digits, i.e., 0x0x xxxx or 0x1x xxxx - * - * - * - * High range table 0x0010 0000 -- 0x0111 1111 - * 0x1000 0000 -- 0x1101 1111 - * - * For positive numbers, table entry will be - * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 - * i.e., 0x0000 0000 - 0x0101 11111 - * - * same thing for the negative numbers, table entry will be - * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 - * i.e., 0x0110 0000 - 0x1011 1111 - */ - -const q7_t sigmoidTable_q7[256] = { - 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, - 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, - 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, - 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, - 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, - 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, - 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, - 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, - 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
0x01, 0x01, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, - 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, - 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, - 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, - 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, - 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, - 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, -}; - -const q15_t sigmoidTable_q15[256] = { - 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, - 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb, - 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, - 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 
0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, - 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, - 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, - 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, - 0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00, -}; - -const q15_t sigmoidLTable_q15[128] = { - 0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9, - 0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc, 0x4cd3, 0x4dc8, 0x4ebb, - 0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f, - 0x56ef, 0x57cd, 0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9, - 0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216, 0x62cc, - 0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c, - 0x68a6, 0x693d, 0x69d2, 0x6a63, 0x6af1, 0x6b7c, 0x6c05, 0x6c8a, - 0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051, - 0x0f42, 0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273, - 0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d, 0x162e, 0x16c3, - 0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2, - 0x1c81, 0x1d34, 0x1dea, 0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5, - 0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833, - 0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64, - 0x3053, 0x3145, 0x3238, 0x332d, 0x3424, 0x351b, 0x3615, 0x370f, - 0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00, -}; - -const q15_t sigmoidHTable_q15[192] = { - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 
0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, -}; - -const q7_t tanhTable_q7[256] = { - 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, - 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, - 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, - 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, - 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, - 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 
- 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, - 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, - 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, - 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, - 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, - 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, - 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, -}; - -const q15_t tanhTable_q15[256] = { - 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, - 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6, - 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, - 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 
0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, - 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, - 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, - 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, - 0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 0xf803, -}; - -const q15_t tanhLTable_q15[128] = { - 0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90, - 0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf, 0x3151, 0x34ae, 0x37f6, - 0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd, - 0x514d, 0x53a3, 0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4, - 0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37, 0x6b6e, - 0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e, - 0x73dc, 0x7490, 0x753a, 0x75da, 0x7672, 0x7701, 0x7788, 0x7807, - 0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b, - 0x849b, 0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710, - 0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26, 0x8ac6, 0x8b70, - 0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254, - 0x936b, 0x9492, 0x95c9, 
0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0, - 0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d, - 0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0, - 0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221, 0xd5a8, 0xd941, 0xdcec, - 0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00, -}; - -const q15_t tanhHTable_q15[192] = { - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, -}; diff --git a/source/e804_opt/nn-support/shl_xt800p_nntables.c 
b/source/e804_opt/nn-support/shl_xt800p_nntables.c new file mode 100644 index 00000000..1e21ec94 --- /dev/null +++ b/source/e804_opt/nn-support/shl_xt800p_nntables.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_xt800p_nntables.c + * Description: Common look-up tables for activation functions (Q7 and Q15 formats) + * + * -------------------------------------------------------------------- */ + +#include <stdint.h> +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. + */ +typedef int16_t q15_t; + +/** + * @brief tables for various activation functions + * + * This file includes the declaration of common tables.
+ * Most of them are used for activation functions + * + * Assumption: + * Unified table: input is 3.x format, i.e., range of [-8, 8) + * sigmoid(8) = 0.9996646498695336 + * tanh(8) = 0.9999997749296758 + * The accuracy here should be good enough + * + * 2-stage HL table: + * + * The entire input range is divided into two parts: + * + * Low range table: 0x000x xxxx or 0x111x xxxx + * table entry will be the binary number excluding the first + * two digits, i.e., 0x0x xxxx or 0x1x xxxx + * + * + * + * High range table 0x0010 0000 -- 0x0111 1111 + * 0x1000 0000 -- 0x1101 1111 + * + * For positive numbers, table entry will be + * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 + * i.e., 0x0000 0000 - 0x0101 1111 + * + * same thing for the negative numbers, table entry will be + * 0x1000 0000 -- 0x1101 1111 minus 0x0010 0000 + * i.e., 0x0110 0000 - 0x1011 1111 + */ + +const q7_t sigmoidTable_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, + 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, + 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, + 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, + 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, + 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + +const q15_t sigmoidTable_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, 0x4fad, 0x518a, 0x5360, 0x552c, + 0x56ef, 0x58a8, 0x5a57, 0x5bfb, 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, + 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, 0x70be, 0x7190, 0x7258, 0x7316, + 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, + 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, + 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, + 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, + 0x7f55, 0x7f5f, 0x7f69, 0x7f72, 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, + 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, + 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, + 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, 0x000b, 0x000c, 0x000c, 0x000d, + 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, 0x0038, 0x003b, + 0x003f, 0x0043, 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, + 
0x0119, 0x012b, 0x013e, 0x0152, 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, + 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, 0x03c1, 0x03fd, 0x043c, 0x0480, + 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, + 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, 0x0f42, 0x101e, 0x1105, 0x11f7, + 0x12f3, 0x13fb, 0x150f, 0x162e, 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, + 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, 0x3053, 0x3238, 0x3424, 0x3615, + 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + +const q7_t tanhTable_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, + 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, + 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, 0x82, 0x82, 
0x82, 0x82, 0x83, 0x83, 0x84, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, + 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + +const q15_t tanhTable_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, 0x3b27, 0x4142, 0x46fd, 0x4c56, + 0x514d, 0x55e2, 0x5a1a, 0x5df6, 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, + 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, 0x7b65, 0x7bee, 0x7c66, 0x7cd1, + 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, + 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, + 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, + 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, + 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, + 0x8005, 0x8006, 0x8006, 0x8007, 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, + 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, 0x803c, 0x8044, 0x804d, 0x8057, + 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, 0x80d0, 
0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, + 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, 0x849b, 0x8535, 0x85e2, 0x86a5, + 0x8781, 0x8878, 0x898e, 0x8ac6, 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, + 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, 0xc4d9, 0xcb52, 0xd221, 0xd941, + 0xe0a7, 0xe847, 0xf015, 0xf803, +}; diff --git a/source/e804_opt/pooling/csi_xt800p_pool_q7_HWC.S b/source/e804_opt/pooling/shl_xt800p_pool_q7_HWC.S similarity index 93% rename from source/e804_opt/pooling/csi_xt800p_pool_q7_HWC.S rename to source/e804_opt/pooling/shl_xt800p_pool_q7_HWC.S index 6de52ba4..5bd3daee 100644 --- a/source/e804_opt/pooling/csi_xt800p_pool_q7_HWC.S +++ b/source/e804_opt/pooling/shl_xt800p_pool_q7_HWC.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file csi_xt800p_pool_q7_HWC.S + * @file shl_xt800p_pool_q7_HWC.S * @brief Pooling functions implementations. * @version V1.0 * @date 31. 
May 2018 @@ -25,7 +25,7 @@ /* * void - * csi_xt800p_maxpool2d_q7_HWC(q7_t * Im_in, + * shl_xt800p_maxpool2d_q7_HWC(q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const uint16_t dim_kernel, @@ -36,13 +36,13 @@ * q7_t * Im_out) */ - .file "csi_xt800p_pool_HWC_q7.S" - .section .text.csi_xt800p_maxpool2d_q7_HWC,"ax",@progbits + .file "shl_xt800p_pool_HWC_q7.S" + .section .text.shl_xt800p_maxpool2d_q7_HWC,"ax",@progbits .align 2 - .global csi_xt800p_maxpool2d_q7_HWC - .type csi_xt800p_maxpool2d_q7_HWC, @function + .global shl_xt800p_maxpool2d_q7_HWC + .type shl_xt800p_maxpool2d_q7_HWC, @function -csi_xt800p_maxpool2d_q7_HWC: +shl_xt800p_maxpool2d_q7_HWC: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.w l0, (sp, 0x3c) // im_out ld.hs l1, (sp, 0x34) // dim_im_out @@ -265,16 +265,14 @@ csi_xt800p_maxpool2d_q7_HWC: .L28: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_maxpool2d_q7_HWC, .-csi_xt800p_maxpool2d_q7_HWC + .size shl_xt800p_maxpool2d_q7_HWC, .-shl_xt800p_maxpool2d_q7_HWC -.weak csi_maxpool2d_q7_HWC -.set csi_maxpool2d_q7_HWC, csi_xt800p_maxpool2d_q7_HWC .weak csky_dsp2_maxpool2d_q7_HWC -.set csky_dsp2_maxpool2d_q7_HWC, csi_xt800p_maxpool2d_q7_HWC +.set csky_dsp2_maxpool2d_q7_HWC, shl_xt800p_maxpool2d_q7_HWC /* * void - * csi_xt800p_avepool_q7_HWC(q7_t * Im_in, + * shl_xt800p_avepool_q7_HWC(q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const uint16_t dim_kernel, @@ -285,12 +283,12 @@ csi_xt800p_maxpool2d_q7_HWC: * q7_t * Im_out) */ - .section .text.csi_xt800p_avepool_q7_HWC,"ax",@progbits + .section .text.shl_xt800p_avepool_q7_HWC,"ax",@progbits .align 2 - .global csi_xt800p_avepool_q7_HWC - .type csi_xt800p_avepool_q7_HWC, @function + .global shl_xt800p_avepool_q7_HWC + .type shl_xt800p_avepool_q7_HWC, @function -csi_xt800p_avepool_q7_HWC: +shl_xt800p_avepool_q7_HWC: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.w l0, (sp, 0x3c) // im_out ld.w lr, (sp, 0x38) // bufferA @@ -584,9 +582,7 @@ 
csi_xt800p_avepool_q7_HWC: .L67: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_avepool_q7_HWC, .-csi_xt800p_avepool_q7_HWC + .size shl_xt800p_avepool_q7_HWC, .-shl_xt800p_avepool_q7_HWC -.weak csi_avepool_q7_HWC -.set csi_avepool_q7_HWC, csi_xt800p_avepool_q7_HWC .weak csky_dsp2_avepool_q7_HWC -.set csky_dsp2_avepool_q7_HWC, csi_xt800p_avepool_q7_HWC +.set csky_dsp2_avepool_q7_HWC, shl_xt800p_avepool_q7_HWC diff --git a/source/e804_opt/relu.c b/source/e804_opt/relu.c index 5f5015a8..97c4187f 100644 --- a/source/e804_opt/relu.c +++ b/source/e804_opt/relu.c @@ -16,28 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_relu_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_e804_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_relu_q7(input_data, size); output->data = input->data; return CSINN_TRUE; } -int csi_e804_relu_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_e804_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_relu_q15(input_data, size); output->data = input->data; return CSINN_TRUE; diff --git a/source/e804_opt/setup.c b/source/e804_opt/setup.c index b78e83e5..9cf249c5 100644 --- a/source/e804_opt/setup.c +++ b/source/e804_opt/setup.c @@ -16,93 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "shl_e804.h" -static void *setup_init_map() +static void *setup_cb_map() { - static void* init_map[CSINN_OP_AND_UTILS_SIZE][2]; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE][2]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE * 2); + /* q7 dtype */ - init_map[CSINN_OP_AVGPOOL2D][0] = csi_e804_avgpool2d_init_q7; - init_map[CSINN_OP_CONV2D][0] = csi_e804_conv2d_init_q7; - init_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_e804_depthwise_conv2d_init_q7; - init_map[CSINN_OP_MAXPOOL2D][0] = csi_e804_maxpool2d_init_q7; - + cb_map[CSINN_OP_AVGPOOL2D][0].init = shl_e804_avgpool2d_init_q7; + cb_map[CSINN_OP_CONV2D][0].init = shl_e804_conv2d_init_q7; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][0].init = shl_e804_depthwise_conv2d_init_q7; + cb_map[CSINN_OP_MAXPOOL2D][0].init = shl_e804_maxpool2d_init_q7; + cb_map[CSINN_OP_FULLYCONNECTED][0].exec = shl_e804_fullyconnected_q7; + cb_map[CSINN_OP_RELU][0].exec = shl_e804_relu_q7; + cb_map[CSINN_OP_SIGMOID][0].exec = shl_e804_sigmoid_q7; + cb_map[CSINN_OP_SOFTMAX][0].exec = shl_e804_softmax_q7; + cb_map[CSINN_OP_TANH][0].exec = shl_e804_tanh_q7; + /* q15 dtype */ - init_map[CSINN_OP_CONV2D][1] = csi_e804_conv2d_init_q15; + cb_map[CSINN_OP_CONV2D][1].init = shl_e804_conv2d_init_q15; + cb_map[CSINN_OP_FULLYCONNECTED][1].exec = shl_e804_fullyconnected_q15; + cb_map[CSINN_OP_RELU][1].exec = shl_e804_relu_q15; + cb_map[CSINN_OP_SIGMOID][1].exec = shl_e804_sigmoid_q15; + cb_map[CSINN_OP_SOFTMAX][1].exec = shl_e804_softmax_q15; + cb_map[CSINN_OP_TANH][1].exec = shl_e804_tanh_q15; - return init_map; + return cb_map; } -static int get_init_map_index(int op, int dtype) +static int get_cb_map_index(int op, int dtype) { switch (dtype) { - case CSINN_DTYPE_INT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; + case CSINN_DTYPE_INT8: + return 
op * 2; + break; + case CSINN_DTYPE_INT16: + return op * 2 + 1; + break; + default: + return CSINN_UNSUPPORT_DTYPE; } } -void *csi_init_map_e804(int op, int dtype) +static struct csinn_callback *__cb_map_table_e804; +struct csinn_callback *shl_cb_map_e804(int op, int dtype) { - void **init_map_table = setup_init_map(); - return init_map_table[get_init_map_index(op, dtype)]; + return &__cb_map_table_e804[get_cb_map_index(op, dtype)]; } - -static void *setup_bc_map() +void shl_target_init_e804() { - static void* bc_map[CSINN_OP_AND_UTILS_SIZE][2]; - - /* q7 dtype */ - bc_map[CSINN_OP_AVGPOOL2D][0] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_CONV2D][0] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_ref_depthwise_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][0] = csi_e804_fullyconnected_q7; - bc_map[CSINN_OP_MAXPOOL2D][0] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_RELU][0] = csi_e804_relu_q7; - bc_map[CSINN_OP_SIGMOID][0] = csi_e804_sigmoid_q7; - bc_map[CSINN_OP_SOFTMAX][0] = csi_e804_softmax_q7; - bc_map[CSINN_OP_TANH][0] = csi_e804_tanh_q7; - - /* q15 dtype */ - bc_map[CSINN_OP_CONV2D][1] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][1] = csi_e804_fullyconnected_q15; - bc_map[CSINN_OP_RELU][1] = csi_e804_relu_q15; - bc_map[CSINN_OP_SIGMOID][1] = csi_e804_sigmoid_q15; - bc_map[CSINN_OP_SOFTMAX][1] = csi_e804_softmax_q15; - bc_map[CSINN_OP_TANH][1] = csi_e804_tanh_q15; - - return bc_map; -} - -static int get_bc_map_index(int op, int dtype) -{ - switch (dtype) { - case CSINN_DTYPE_INT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; - } -} - -void *csi_bc_map_e804(int op, int dtype) -{ - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + __cb_map_table_e804 = setup_cb_map(); + shl_register_runtime_callback(CSINN_E804, NULL); + 
shl_register_op_callback(CSINN_E804, shl_cb_map_e804); } diff --git a/source/e804_opt/sigmoid.c b/source/e804_opt/sigmoid.c index f5eeb581..cf5989c8 100644 --- a/source/e804_opt/sigmoid.c +++ b/source/e804_opt/sigmoid.c @@ -16,36 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_sigmoid_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_e804_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_nn_activations_direct_q7(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; } -int csi_e804_sigmoid_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_e804_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_nn_activations_direct_q15(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; diff --git a/source/e804_opt/softmax.c b/source/e804_opt/softmax.c index 79a033f5..e3b49161 100644 --- a/source/e804_opt/softmax.c +++ b/source/e804_opt/softmax.c @@ -16,29 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_softmax_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_e804_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_softmax_q7(input_data, size, output_data); return CSINN_TRUE; } -int csi_e804_softmax_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_e804_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *output_data = (q15_t *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_softmax_q15(input_data, size, output_data); return CSINN_TRUE; } diff --git a/source/e804_opt/softmax/csi_xt800p_softmax_q15.S b/source/e804_opt/softmax/shl_xt800p_softmax_q15.S similarity index 91% rename from source/e804_opt/softmax/csi_xt800p_softmax_q15.S rename to source/e804_opt/softmax/shl_xt800p_softmax_q15.S index f57f9410..2354501d 100644 --- a/source/e804_opt/softmax/csi_xt800p_softmax_q15.S +++ b/source/e804_opt/softmax/shl_xt800p_softmax_q15.S @@ -17,25 +17,25 @@ */ /****************************************************************************** - * @file csi_xt800p_softmax_q15.S + * @file shl_xt800p_softmax_q15.S * @brief Pooling functions implementations. * @version V1.0 * @date 01. 
June 20116 ******************************************************************************/ /* - * void csi_xt800p_softmax_q15(const q15_t * vec_in, + * void shl_xt800p_softmax_q15(const q15_t * vec_in, * const uint8_t dim_vec, * q15_t * p_out) */ - .file "csi_xt800p_softmax_q15.S" - .section .text.csi_xt800p_softmax_q15,"ax",@progbits + .file "shl_xt800p_softmax_q15.S" + .section .text.shl_xt800p_softmax_q15,"ax",@progbits .align 2 - .global csi_xt800p_softmax_q15 - .type csi_xt800p_softmax_q15, @function + .global shl_xt800p_softmax_q15 + .type shl_xt800p_softmax_q15, @function -csi_xt800p_softmax_q15: +shl_xt800p_softmax_q15: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 lrw t9, 0x80008000 // init max value mov l0, a0 @@ -221,8 +221,7 @@ csi_xt800p_softmax_q15: .L11: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 - .size csi_xt800p_softmax_q15, .-csi_xt800p_softmax_q15 -.weak csi_softmax_q15 -.set csi_softmax_q15, csi_xt800p_softmax_q15 + .size shl_xt800p_softmax_q15, .-shl_xt800p_softmax_q15 + .weak csky_dsp2_softmax_q15 -.set csky_dsp2_softmax_q15, csi_xt800p_softmax_q15 +.set csky_dsp2_softmax_q15, shl_xt800p_softmax_q15 diff --git a/source/e804_opt/softmax/csi_xt800p_softmax_q7.S b/source/e804_opt/softmax/shl_xt800p_softmax_q7.S similarity index 91% rename from source/e804_opt/softmax/csi_xt800p_softmax_q7.S rename to source/e804_opt/softmax/shl_xt800p_softmax_q7.S index 04df43e6..70484467 100644 --- a/source/e804_opt/softmax/csi_xt800p_softmax_q7.S +++ b/source/e804_opt/softmax/shl_xt800p_softmax_q7.S @@ -17,25 +17,25 @@ */ /****************************************************************************** - * @file csi_xt800p_softmax_q7.S + * @file shl_xt800p_softmax_q7.S * @brief Pooling functions implementations. * @version V1.0 * @date 04. 
June 2018 ******************************************************************************/ /* - * void csi_xt800p_softmax_q7(const q7_t * vec_in, + * void shl_xt800p_softmax_q7(const q7_t * vec_in, * const uint16_t dim_vec, * q7_t * p_out) */ - .file "csi_xt800p_softmax_q7.S" - .section .text.csi_xt800p_softmax_q7,"ax",@progbits + .file "shl_xt800p_softmax_q7.S" + .section .text.shl_xt800p_softmax_q7,"ax",@progbits .align 2 - .global csi_xt800p_softmax_q7 - .type csi_xt800p_softmax_q7, @function + .global shl_xt800p_softmax_q7 + .type shl_xt800p_softmax_q7, @function -csi_xt800p_softmax_q7: +shl_xt800p_softmax_q7: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 lrw t9, 0x80808080 // init max value mov l0, a0 @@ -224,8 +224,7 @@ csi_xt800p_softmax_q7: .L11: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 - .size csi_xt800p_softmax_q7, .-csi_xt800p_softmax_q7 -.weak csi_softmax_q7 -.set csi_softmax_q7, csi_xt800p_softmax_q7 + .size shl_xt800p_softmax_q7, .-shl_xt800p_softmax_q7 + .weak csky_dsp2_softmax_q7 -.set csky_dsp2_softmax_q7, csi_xt800p_softmax_q7 +.set csky_dsp2_softmax_q7, shl_xt800p_softmax_q7 diff --git a/source/e804_opt/tanh.c b/source/e804_opt/tanh.c index a9343c3e..dc97fc96 100644 --- a/source/e804_opt/tanh.c +++ b/source/e804_opt/tanh.c @@ -16,36 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_tanh_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_e804_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 
3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_nn_activations_direct_q7(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; } -int csi_e804_tanh_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_e804_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_nn_activations_direct_q15(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; diff --git a/source/graph_ref/abs.c b/source/graph_ref/abs.c index 2d7d2476..7a433d7f 100644 --- a/source/graph_ref/abs.c +++ b/source/graph_ref/abs.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_abs(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_abs(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ABS, params); + shl_gref_siso_op(input, output, CSINN_OP_ABS, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/acos.c b/source/graph_ref/acos.c index f8ff2efe..7e78e425 100644 --- a/source/graph_ref/acos.c +++ b/source/graph_ref/acos.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_acos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_acos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ACOS, params); + shl_gref_siso_op(input, output, CSINN_OP_ACOS, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/acosh.c b/source/graph_ref/acosh.c index 9969d232..78a90f90 100644 --- a/source/graph_ref/acosh.c +++ b/source/graph_ref/acosh.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_acosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_acosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ACOSH, params); + shl_gref_siso_op(input, output, CSINN_OP_ACOSH, params); return CSINN_TRUE; } diff --git a/source/graph_ref/add.c b/source/graph_ref/add.c index a90f17fd..82783c0f 100644 --- a/source/graph_ref/add.c +++ b/source/graph_ref/add.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_add(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_add(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_ADD, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_ADD, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/all.c b/source/graph_ref/all.c index 3ab40d26..9d3b3f4b 100644 --- a/source/graph_ref/all.c +++ b/source/graph_ref/all.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_all(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_all(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_debug_error("csi_gref_all unsupport\n"); + shl_debug_error("shl_gref_all unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/and.c b/source/graph_ref/and.c index f883ae75..d939d381 100644 --- a/source/graph_ref/and.c +++ b/source/graph_ref/and.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_AND, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_AND, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/any.c b/source/graph_ref/any.c index 49d6b5db..3c7e1e9a 100644 --- a/source/graph_ref/any.c +++ b/source/graph_ref/any.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_any(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_any(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ANY, params); + shl_gref_siso_op(input, output, CSINN_OP_ANY, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/arange.c b/source/graph_ref/arange.c index 5d044223..0cbf3af9 100644 --- a/source/graph_ref/arange.c +++ b/source/graph_ref/arange.c @@ -16,14 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_arange(struct csi_tensor *output, - struct arange_params *params) +int shl_gref_arange(struct csinn_tensor *output, struct csinn_arange_params *params) { - csi_debug_error("csi_gref_arange unsupport\n"); + shl_debug_error("shl_gref_arange unsupport\n"); return CSINN_FALSE; } - diff --git a/source/graph_ref/argmax.c b/source/graph_ref/argmax.c index 81c72e2c..7f1a5bba 100644 --- a/source/graph_ref/argmax.c +++ b/source/graph_ref/argmax.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_argmax(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ARGMAX, params); + shl_gref_siso_op(input, output, CSINN_OP_ARGMAX, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/argmin.c b/source/graph_ref/argmin.c index 1e2abc0d..856825f4 100644 --- a/source/graph_ref/argmin.c +++ b/source/graph_ref/argmin.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_argmin(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_argmin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ARGMIN, params); + shl_gref_siso_op(input, output, CSINN_OP_ARGMIN, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/asin.c b/source/graph_ref/asin.c index 21d6f356..0c80c7ca 100644 --- a/source/graph_ref/asin.c +++ b/source/graph_ref/asin.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_asin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_asin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ASIN, params); + shl_gref_siso_op(input, output, CSINN_OP_ASIN, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/asinh.c b/source/graph_ref/asinh.c index c9c817e2..5872980d 100644 --- a/source/graph_ref/asinh.c +++ b/source/graph_ref/asinh.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_asinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_asinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ASINH, params); + shl_gref_siso_op(input, output, CSINN_OP_ASINH, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/atan.c b/source/graph_ref/atan.c index 68a82797..07fe7525 100644 --- a/source/graph_ref/atan.c +++ b/source/graph_ref/atan.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_atan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_atan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ATAN, params); + shl_gref_siso_op(input, output, CSINN_OP_ATAN, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/atanh.c b/source/graph_ref/atanh.c index f8ce38e4..4b82c8bf 100644 --- a/source/graph_ref/atanh.c +++ b/source/graph_ref/atanh.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_atanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_atanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ATANH, params); + shl_gref_siso_op(input, output, CSINN_OP_ATANH, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/avgpool.c b/source/graph_ref/avgpool.c index 6fa6f630..50fd5afd 100644 --- a/source/graph_ref/avgpool.c +++ b/source/graph_ref/avgpool.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ +#include "shl_gref.h" -#include "csi_gref.h" - -int csi_gref_avgpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_AVGPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_AVGPOOL2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/avgpool3d.c b/source/graph_ref/avgpool3d.c index d08775aa..cc574c0b 100644 --- a/source/graph_ref/avgpool3d.c +++ b/source/graph_ref/avgpool3d.c @@ -16,23 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ +#include "shl_gref.h" -#include "csi_gref.h" - -int csi_gref_avgpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_AVGPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_AVGPOOL2D, params); return CSINN_TRUE; } -int csi_gref_global_avgpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_global_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_GLOBAL_AVGPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_GLOBAL_AVGPOOL2D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/batch_to_space.c b/source/graph_ref/batch_to_space.c index 6c6d8976..f1d5f526 100644 --- a/source/graph_ref/batch_to_space.c +++ b/source/graph_ref/batch_to_space.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_batch_to_space(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params) +int shl_gref_batch_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_BATCH_TO_SPACE, params); + shl_gref_siso_op(input, output, CSINN_OP_BATCH_TO_SPACE, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/batch_to_space_nd.c b/source/graph_ref/batch_to_space_nd.c index 54568832..9b1ef988 100644 --- a/source/graph_ref/batch_to_space_nd.c +++ b/source/graph_ref/batch_to_space_nd.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_batch_to_space_nd(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_nd_params *params) +int shl_gref_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_BATCH_TO_SPACE_ND, params); + shl_gref_siso_op(input, output, CSINN_OP_BATCH_TO_SPACE_ND, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/bn.c b/source/graph_ref/bn.c index bb2d186c..a8a98b62 100644 --- a/source/graph_ref/bn.c +++ b/source/graph_ref/bn.c @@ -16,19 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_batch_normalization(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params) +int shl_gref_batch_normalization(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { - csi_debug_error("csi_gref_batch_normalization unsupport\n"); + shl_debug_error("shl_gref_batch_normalization unsupport\n"); return CSINN_TRUE; } - diff --git a/source/graph_ref/broadcast_to.c b/source/graph_ref/broadcast_to.c index ed00208c..eb5f1bf7 100644 --- a/source/graph_ref/broadcast_to.c +++ b/source/graph_ref/broadcast_to.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_broadcast_to(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int shl_gref_broadcast_to(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_BROADCOST, params); + shl_gref_siso_op(input, output, CSINN_OP_BROADCOST, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/cache_conv1d.c b/source/graph_ref/cache_conv1d.c index 0aaa7630..1f0f39e9 100644 --- a/source/graph_ref/cache_conv1d.c +++ b/source/graph_ref/cache_conv1d.c @@ -16,14 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cache_conv1d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_gref_cache_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { - csi_gref_sidcso_op(input, output, weight, bias, CSINN_OP_CACHE_CONV1D, params); + shl_gref_sidcso_op(input, output, weight, bias, CSINN_OP_CACHE_CONV1D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/cache_matmul.c b/source/graph_ref/cache_matmul.c index 8d5ca4f5..a7027840 100644 --- a/source/graph_ref/cache_matmul.c +++ b/source/graph_ref/cache_matmul.c @@ -16,14 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cache_matmul(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) +int shl_gref_cache_matmul(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { - csi_gref_sidcso_op(input, output, weight, bias, CSINN_OP_CACHE_MATMUL, params); + shl_gref_sidcso_op(input, output, weight, bias, CSINN_OP_CACHE_MATMUL, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/ceil.c b/source/graph_ref/ceil.c index 1a23c4ba..52833676 100644 --- a/source/graph_ref/ceil.c +++ b/source/graph_ref/ceil.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_ceil(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_ceil(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CEIL, params); + shl_gref_siso_op(input, output, CSINN_OP_CEIL, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/clip.c b/source/graph_ref/clip.c index 9cce441c..551e3c2e 100644 --- a/source/graph_ref/clip.c +++ b/source/graph_ref/clip.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_clip(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_clip(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CLIP, params); + shl_gref_siso_op(input, output, CSINN_OP_CLIP, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/col2im.c b/source/graph_ref/col2im.c index 7956da71..25400f22 100644 --- a/source/graph_ref/col2im.c +++ b/source/graph_ref/col2im.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_col2im(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct col2im_params *params) +int shl_gref_col2im(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params) { - csi_debug_error("csi_gref_col2im unsupport\n"); + shl_debug_error("shl_gref_col2im unsupport\n"); return CSINN_TRUE; } - diff --git a/source/graph_ref/concat.c b/source/graph_ref/concat.c index aa376940..64ac1928 100644 --- a/source/graph_ref/concat.c +++ b/source/graph_ref/concat.c @@ -16,32 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_concat(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int shl_gref_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { - struct csi_node *layer = csi_node_alloc(CSINN_OP_CONCAT, params->base.name, params->inputs_count, 1, params); + struct shl_node *layer = + shl_node_alloc(CSINN_OP_CONCAT, params->base.name, params->inputs_count, 1, params); - for (int i =0; i < params->inputs_count; i++){ - struct csi_node *in_tensor = (struct csi_node *)(input[i]->data); + for (int i = 0; i < params->inputs_count; i++) { + struct shl_node *in_tensor = (struct shl_node *)(input[i]->data); if (input[i]->is_const) { - in_tensor = csi_node_const_var_alloc(input[i]->name, input[i]); + in_tensor = shl_node_const_var_alloc(input[i]->name, input[i]); } else { - in_tensor = (struct csi_node *)(input[i]->data); + in_tensor = (struct shl_node *)(input[i]->data); } - csi_node_add_in(layer, in_tensor, i); + shl_node_add_in(layer, in_tensor, i); } - struct csi_node *out = csi_node_var_alloc(output->name, output); - csi_node_add_out(layer, out, 0); + struct 
shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(input[0]->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(input[0]->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } - diff --git a/source/graph_ref/convolution.c b/source/graph_ref/convolution.c index 9e7b383e..72faacaa 100644 --- a/source/graph_ref/convolution.c +++ b/source/graph_ref/convolution.c @@ -16,77 +16,62 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_conv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D, params); return CSINN_TRUE; } -int csi_gref_conv2d_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D_RELU, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D_RELU, params); return CSINN_TRUE; } -int csi_gref_conv2d_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D_RELU6, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D_RELU6, params); return CSINN_TRUE; } -int csi_gref_depthwise_conv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D, params); return CSINN_TRUE; } -int csi_gref_depthwise_conv2d_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_depthwise_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D_RELU, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D_RELU, params); return CSINN_TRUE; } -int csi_gref_depthwise_conv2d_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_depthwise_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D_RELU6, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D_RELU6, params); return CSINN_TRUE; } -int 
csi_gref_group_conv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_GROUP_CONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_GROUP_CONV2D, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/convolution1d.c b/source/graph_ref/convolution1d.c index 5413235c..504d49cd 100644 --- a/source/graph_ref/convolution1d.c +++ b/source/graph_ref/convolution1d.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_conv1d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV1D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV1D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/convolution3d.c b/source/graph_ref/convolution3d.c index 47e3033e..a440ddd0 100644 --- a/source/graph_ref/convolution3d.c +++ b/source/graph_ref/convolution3d.c @@ -16,16 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_conv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int shl_gref_conv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV3D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV3D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/cos.c b/source/graph_ref/cos.c index b365c62f..ac3c6c7d 100644 --- a/source/graph_ref/cos.c +++ b/source/graph_ref/cos.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_cos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_COS, params); + shl_gref_siso_op(input, output, CSINN_OP_COS, params); return CSINN_TRUE; } diff --git a/source/graph_ref/cosh.c b/source/graph_ref/cosh.c index 55ead117..453efe1f 100644 --- a/source/graph_ref/cosh.c +++ b/source/graph_ref/cosh.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_cosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_COSH, params); + shl_gref_siso_op(input, output, CSINN_OP_COSH, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/crop.c b/source/graph_ref/crop.c index 8d623cbc..c3763ae1 100644 --- a/source/graph_ref/crop.c +++ b/source/graph_ref/crop.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_crop(struct csi_tensor *input, - struct csi_tensor *output, - struct crop_params *params) +int shl_gref_crop(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CROP, params); + shl_gref_siso_op(input, output, CSINN_OP_CROP, params); return CSINN_TRUE; } diff --git a/source/graph_ref/cumprod.c b/source/graph_ref/cumprod.c index ea208ddb..ffa450f6 100644 --- a/source/graph_ref/cumprod.c +++ b/source/graph_ref/cumprod.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cumprod(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params) +int shl_gref_cumprod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CUMPROD, params); + shl_gref_siso_op(input, output, CSINN_OP_CUMPROD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/cumsum.c b/source/graph_ref/cumsum.c index 08493115..6660b98d 100644 --- a/source/graph_ref/cumsum.c +++ b/source/graph_ref/cumsum.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cumsum(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params) +int shl_gref_cumsum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CUMSUM, params); + shl_gref_siso_op(input, output, CSINN_OP_CUMSUM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/data_convert.c b/source/graph_ref/data_convert.c new file mode 100644 index 00000000..680072be --- /dev/null +++ b/source/graph_ref/data_convert.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_gref.h" + +int shl_gref_data_convert(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + shl_gref_siso_op(input, output, CSINN_OP_DATA_CONVERT, params); + return CSINN_TRUE; +} diff --git a/source/graph_ref/deconvolution.c b/source/graph_ref/deconvolution.c index 695dc48c..7e29619d 100644 --- a/source/graph_ref/deconvolution.c +++ b/source/graph_ref/deconvolution.c @@ -16,26 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_deconv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DECONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DECONV2D, params); return CSINN_TRUE; } -int csi_gref_depthwise_deconv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_depthwise_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_DECONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_DECONV2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/deconvolution3d.c b/source/graph_ref/deconvolution3d.c index eeaae97e..f81a0c63 100644 --- a/source/graph_ref/deconvolution3d.c +++ 
b/source/graph_ref/deconvolution3d.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_deconv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int shl_gref_deconv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DECONV3D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DECONV3D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/depth_to_space.c b/source/graph_ref/depth_to_space.c index bc07d936..69437d26 100644 --- a/source/graph_ref/depth_to_space.c +++ b/source/graph_ref/depth_to_space.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_depth_to_space(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_gref_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_DEPTH_TO_SPACE, params); + shl_gref_siso_op(input, output, CSINN_OP_DEPTH_TO_SPACE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/div.c b/source/graph_ref/div.c index 86790bb7..623cf7b2 100644 --- a/source/graph_ref/div.c +++ b/source/graph_ref/div.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_div(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_div(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_DIV, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_DIV, params); return CSINN_TRUE; } diff --git a/source/graph_ref/elu.c b/source/graph_ref/elu.c index dabcbc84..8241048e 100644 --- a/source/graph_ref/elu.c +++ b/source/graph_ref/elu.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_elu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_elu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ELU, params); + shl_gref_siso_op(input, output, CSINN_OP_ELU, params); return CSINN_TRUE; } diff --git a/source/graph_ref/equal.c b/source/graph_ref/equal.c index b92af4c9..0675b939 100644 --- a/source/graph_ref/equal.c +++ b/source/graph_ref/equal.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_EQUANL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_EQUANL, params); return CSINN_TRUE; } diff --git a/source/graph_ref/erf.c b/source/graph_ref/erf.c index 01889d04..3e17a49d 100644 --- a/source/graph_ref/erf.c +++ b/source/graph_ref/erf.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_erf(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_erf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ERF, params); + shl_gref_siso_op(input, output, CSINN_OP_ERF, params); return CSINN_TRUE; } diff --git a/source/graph_ref/exp.c b/source/graph_ref/exp.c index d31b4b34..9d8829e0 100644 --- a/source/graph_ref/exp.c +++ b/source/graph_ref/exp.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_exp(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_exp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_EXP, params); + shl_gref_siso_op(input, output, CSINN_OP_EXP, params); return CSINN_TRUE; } diff --git a/source/graph_ref/expand_dims.c b/source/graph_ref/expand_dims.c index 05537189..5b5d05cc 100644 --- a/source/graph_ref/expand_dims.c +++ b/source/graph_ref/expand_dims.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_expand_dims(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params) +int shl_gref_expand_dims(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_EXPAND_DIMS, params); + shl_gref_siso_op(input, output, CSINN_OP_EXPAND_DIMS, params); return CSINN_TRUE; } diff --git a/source/graph_ref/expm1.c b/source/graph_ref/expm1.c index 7a79aebb..056d815c 100644 --- a/source/graph_ref/expm1.c +++ b/source/graph_ref/expm1.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_expm1(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_expm1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_EXPM1, params); + shl_gref_siso_op(input, output, CSINN_OP_EXPM1, params); return CSINN_TRUE; } diff --git a/source/graph_ref/flatten.c b/source/graph_ref/flatten.c index bf2fed9b..6312d016 100644 --- a/source/graph_ref/flatten.c +++ b/source/graph_ref/flatten.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_flatten(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params) +int shl_gref_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_FLATTEN, params); + shl_gref_siso_op(input, output, CSINN_OP_FLATTEN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/floor.c b/source/graph_ref/floor.c index 617a5a6f..fa78d6b2 100644 --- a/source/graph_ref/floor.c +++ b/source/graph_ref/floor.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_floor(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_floor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_FLOOR, params); + shl_gref_siso_op(input, output, CSINN_OP_FLOOR, params); return CSINN_TRUE; } diff --git a/source/graph_ref/floor_divide.c b/source/graph_ref/floor_divide.c index 25dc7ab8..398ddc9a 100644 --- a/source/graph_ref/floor_divide.c +++ b/source/graph_ref/floor_divide.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_floor_divide(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_floor_divide(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_FLOOR_DIVIDE, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_FLOOR_DIVIDE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/floor_mod.c b/source/graph_ref/floor_mod.c index bc1c2c51..7f7b99b3 100644 --- a/source/graph_ref/floor_mod.c +++ b/source/graph_ref/floor_mod.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_floor_mod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_floor_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_FLOOR_MOD, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_FLOOR_MOD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/fsmn.c b/source/graph_ref/fsmn.c index 61ae482d..5a85e167 100644 --- a/source/graph_ref/fsmn.c +++ b/source/graph_ref/fsmn.c @@ -16,34 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_fsmn(struct csi_tensor *frame, - struct csi_tensor *l_filter, - struct csi_tensor *r_filter, - struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, - struct csi_tensor *output, - struct fsmn_params *params) +int shl_gref_fsmn(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { - struct csi_params_base *ptr = (void *)params; - struct csi_node *layer = csi_node_alloc(CSINN_OP_FSMN, ptr->name, 5, 1, params); - struct csi_node *in0 = (struct csi_node *)frame->data; - struct csi_node *in1 = csi_node_const_var_alloc(l_filter->name, l_filter); - struct csi_node *in2 = csi_node_const_var_alloc(r_filter->name, r_filter); - struct csi_node *in3 = csi_node_const_var_alloc(frame_sequence->name, frame_sequence); - struct csi_node *in4 = csi_node_const_var_alloc(frame_counter->name, frame_counter); - struct csi_node *out = csi_node_var_alloc(output->name, output); - 
csi_node_add_in(layer, in0, 0); - csi_node_add_in(layer, in1, 1); - csi_node_add_in(layer, in2, 2); - csi_node_add_in(layer, in3, 3); - csi_node_add_in(layer, in4, 4); - csi_node_add_out(layer, out, 0); + struct csinn_params_base *ptr = (void *)params; + struct shl_node *layer = shl_node_alloc(CSINN_OP_FSMN, ptr->name, 5, 1, params); + struct shl_node *in0 = (struct shl_node *)frame->data; + struct shl_node *in1 = shl_node_const_var_alloc(l_filter->name, l_filter); + struct shl_node *in2 = shl_node_const_var_alloc(r_filter->name, r_filter); + struct shl_node *in3 = shl_node_const_var_alloc(frame_sequence->name, frame_sequence); + struct shl_node *in4 = shl_node_const_var_alloc(frame_counter->name, frame_counter); + struct shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_in(layer, in0, 0); + shl_node_add_in(layer, in1, 1); + shl_node_add_in(layer, in2, 2); + shl_node_add_in(layer, in3, 3); + shl_node_add_in(layer, in4, 4); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(frame->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(frame->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } diff --git a/source/graph_ref/fullyconnected.c b/source/graph_ref/fullyconnected.c index 3e2fd8da..2ca181da 100644 --- a/source/graph_ref/fullyconnected.c +++ b/source/graph_ref/fullyconnected.c @@ -16,16 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_fullyconnected(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weight, - struct csi_tensor *bias, - struct fc_params *params) +int shl_gref_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - csi_gref_sidcso_op(input, output, weight, bias, CSINN_OP_FULLYCONNECTED, params); + shl_gref_sidcso_op(input, output, weight, bias, CSINN_OP_FULLYCONNECTED, params); return CSINN_TRUE; } diff --git a/source/graph_ref/gather.c b/source/graph_ref/gather.c index f584e039..737ec2a4 100644 --- a/source/graph_ref/gather.c +++ b/source/graph_ref/gather.c @@ -16,16 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_gather(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params) +int shl_gref_gather(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { - csi_gref_diso_op(input, indices, output, CSINN_OP_GATHER, params); + shl_gref_diso_op(input, indices, output, CSINN_OP_GATHER, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/gather_nd.c b/source/graph_ref/gather_nd.c index 7e6fbf99..becb6ae6 100644 --- a/source/graph_ref/gather_nd.c +++ b/source/graph_ref/gather_nd.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_gather_nd(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params) +int shl_gref_gather_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { - csi_debug_error("csi_gref_gather_nd unsupport\n"); + shl_debug_error("shl_gref_gather_nd unsupport\n"); return CSINN_TRUE; } - diff --git a/source/graph_ref/global_averagepool.c b/source/graph_ref/global_averagepool.c index 7ad41eea..646c2a67 100644 --- a/source/graph_ref/global_averagepool.c +++ b/source/graph_ref/global_averagepool.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_global_avgpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_global_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_GLOBAL_AVGPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_GLOBAL_AVGPOOL2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/global_maxpool.c b/source/graph_ref/global_maxpool.c index 1b8112b9..70800a39 100644 --- a/source/graph_ref/global_maxpool.c +++ b/source/graph_ref/global_maxpool.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_global_maxpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_global_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_GLOBAL_MAXPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_GLOBAL_MAXPOOL2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/greater.c b/source/graph_ref/greater.c index d32ad682..860394a6 100644 --- a/source/graph_ref/greater.c +++ b/source/graph_ref/greater.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_greater(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_greater(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_GREATHER, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_GREATHER, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/greater_equal.c b/source/graph_ref/greater_equal.c index 2c4095d5..2dfc431a 100644 --- a/source/graph_ref/greater_equal.c +++ b/source/graph_ref/greater_equal.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_greater_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_greater_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_GREATHER_EQUAL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_GREATHER_EQUAL, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/hard_sigmoid.c b/source/graph_ref/hard_sigmoid.c index 4745b233..6c5f024f 100644 --- a/source/graph_ref/hard_sigmoid.c +++ b/source/graph_ref/hard_sigmoid.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_hard_sigmoid(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_gref_hard_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SIGMOID, params); + shl_gref_siso_op(input, output, CSINN_OP_SIGMOID, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/im2col.c b/source/graph_ref/im2col.c index cdffbb2b..e241c65e 100644 --- a/source/graph_ref/im2col.c +++ b/source/graph_ref/im2col.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_im2col(struct csi_tensor *input, - struct csi_tensor *output, - struct im2col_params *params) +int shl_gref_im2col(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_IM2COL, params); + shl_gref_siso_op(input, output, CSINN_OP_IM2COL, params); return CSINN_TRUE; } diff --git a/source/graph_ref/isnan.c b/source/graph_ref/isnan.c index b2c4906a..95c5877c 100644 --- a/source/graph_ref/isnan.c +++ b/source/graph_ref/isnan.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_isnan_bool(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_isnan_bool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ISNAN, params); + shl_gref_siso_op(input, output, CSINN_OP_ISNAN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/l2_normalization.c b/source/graph_ref/l2_normalization.c index c3dc96f3..52e020d3 100644 --- a/source/graph_ref/l2_normalization.c +++ b/source/graph_ref/l2_normalization.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_l2_normalization(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) +int shl_gref_l2_normalization(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_L2N, params); + shl_gref_siso_op(input, output, CSINN_OP_L2N, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/l2pool.c b/source/graph_ref/l2pool.c index 8553d73b..0a8c5eaf 100644 --- a/source/graph_ref/l2pool.c +++ b/source/graph_ref/l2pool.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_l2pool(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_l2pool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_L2POOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_L2POOL2D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/layer_norm.c b/source/graph_ref/layer_norm.c index 4914d346..1bea126f 100644 --- a/source/graph_ref/layer_norm.c +++ b/source/graph_ref/layer_norm.c @@ -16,16 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_layer_norm(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct layer_norm_params *params) +int shl_gref_layer_norm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { - csi_gref_sidcso_op(input, output, gamma, beta, CSINN_OP_LAYER_NORM, params); + shl_gref_sidcso_op(input, output, gamma, beta, CSINN_OP_LAYER_NORM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/leaky_relu.c b/source/graph_ref/leaky_relu.c index add038ea..6414216c 100644 --- a/source/graph_ref/leaky_relu.c +++ b/source/graph_ref/leaky_relu.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_leaky_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LEAKY_RELU, params); + shl_gref_siso_op(input, output, CSINN_OP_LEAKY_RELU, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/less.c b/source/graph_ref/less.c index e51c83df..9e6fd631 100644 --- a/source/graph_ref/less.c +++ b/source/graph_ref/less.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_less(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_less(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LESS, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LESS, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/less_equal.c b/source/graph_ref/less_equal.c index ae93f5f1..59d18453 100644 --- a/source/graph_ref/less_equal.c +++ b/source/graph_ref/less_equal.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_less_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_less_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LESS_EQUAL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LESS_EQUAL, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/log.c b/source/graph_ref/log.c index 87e2a07f..a1c595fe 100644 --- a/source/graph_ref/log.c +++ b/source/graph_ref/log.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_log(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_log(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LOG, params); + shl_gref_siso_op(input, output, CSINN_OP_LOG, params); return CSINN_TRUE; } diff --git a/source/graph_ref/log1p.c b/source/graph_ref/log1p.c index 7f225cde..6f16e6ae 100644 --- a/source/graph_ref/log1p.c +++ b/source/graph_ref/log1p.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_log1p(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_log1p(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LOG1P, params); + shl_gref_siso_op(input, output, CSINN_OP_LOG1P, params); return CSINN_TRUE; } diff --git a/source/graph_ref/log_softmax.c b/source/graph_ref/log_softmax.c index a283f6a3..6f4715b7 100644 --- a/source/graph_ref/log_softmax.c +++ b/source/graph_ref/log_softmax.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_log_softmax(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_gref_log_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LOG_SOFTMAX, params); + shl_gref_siso_op(input, output, CSINN_OP_LOG_SOFTMAX, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/logical_and.c b/source/graph_ref/logical_and.c index 2f32519b..3bfbe737 100644 --- a/source/graph_ref/logical_and.c +++ b/source/graph_ref/logical_and.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_logical_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_logical_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_AND, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_AND, params); return CSINN_TRUE; } diff --git a/source/graph_ref/logical_not.c b/source/graph_ref/logical_not.c index 9646cdaa..a1fa234e 100644 --- a/source/graph_ref/logical_not.c +++ b/source/graph_ref/logical_not.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_logical_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_logical_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LOGICAL_NOT, params); + shl_gref_siso_op(input, output, CSINN_OP_LOGICAL_NOT, params); return CSINN_TRUE; } diff --git a/source/graph_ref/logical_or.c b/source/graph_ref/logical_or.c index 0a75f7a6..68c3ab87 100644 --- a/source/graph_ref/logical_or.c +++ b/source/graph_ref/logical_or.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_logical_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_logical_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_OR, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_OR, params); return CSINN_TRUE; } diff --git a/source/graph_ref/logical_xor.c b/source/graph_ref/logical_xor.c index f506b489..634f1874 100644 --- a/source/graph_ref/logical_xor.c +++ b/source/graph_ref/logical_xor.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_logical_xor(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_logical_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_XOR, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_XOR, params); return CSINN_TRUE; } diff --git a/source/graph_ref/lrn.c b/source/graph_ref/lrn.c index 0df8e8ae..f342d08f 100644 --- a/source/graph_ref/lrn.c +++ b/source/graph_ref/lrn.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_lrn(struct csi_tensor *input, - struct csi_tensor *output, - struct lrn_params *params) +int shl_gref_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LRN, params); + shl_gref_siso_op(input, output, CSINN_OP_LRN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/matmul.c b/source/graph_ref/matmul.c index 8cfab3d1..9e68f05e 100644 --- a/source/graph_ref/matmul.c +++ b/source/graph_ref/matmul.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_matmul(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params) +int shl_gref_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { - csi_gref_diso_op(mat0, mat1, output, CSINN_OP_MATMUL, params); + shl_gref_diso_op(mat0, mat1, output, CSINN_OP_MATMUL, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/max.c b/source/graph_ref/max.c index 1422ddf6..e56b0de7 100644 --- a/source/graph_ref/max.c +++ b/source/graph_ref/max.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_max(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MAX, params); + shl_gref_siso_op(input, output, CSINN_OP_MAX, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/maximum.c b/source/graph_ref/maximum.c index 0ad0f028..846d512b 100644 --- a/source/graph_ref/maximum.c +++ b/source/graph_ref/maximum.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_maximum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_MAXIMUM, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_MAXIMUM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/maxpool.c b/source/graph_ref/maxpool.c index 46cce60a..d0c58f2d 100644 --- a/source/graph_ref/maxpool.c +++ b/source/graph_ref/maxpool.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ +#include "shl_gref.h" -#include "csi_gref.h" - -int csi_gref_maxpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MAXPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_MAXPOOL2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/maxpool2d_locat.c b/source/graph_ref/maxpool2d_locat.c index 7263fa8b..646df999 100644 --- a/source/graph_ref/maxpool2d_locat.c +++ b/source/graph_ref/maxpool2d_locat.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_maxpool2d_locat(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_maxpool2d_locat(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MAXPOOL2D_LOCAT, params); + shl_gref_siso_op(input, output, CSINN_OP_MAXPOOL2D_LOCAT, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/maxpool3d.c b/source/graph_ref/maxpool3d.c index 95860e2f..51c641c0 100644 --- a/source/graph_ref/maxpool3d.c +++ b/source/graph_ref/maxpool3d.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_maxpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_maxpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MAXPOOL3D, params); + shl_gref_siso_op(input, output, CSINN_OP_MAXPOOL3D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/mean.c b/source/graph_ref/mean.c index aa1b469e..02b336cc 100644 --- a/source/graph_ref/mean.c +++ b/source/graph_ref/mean.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_mean(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MEAN, params); + shl_gref_siso_op(input, output, CSINN_OP_MEAN, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/min.c b/source/graph_ref/min.c index 6e79bf54..34359e0b 100644 --- a/source/graph_ref/min.c +++ b/source/graph_ref/min.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MIN, params); + shl_gref_siso_op(input, output, CSINN_OP_MIN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/minimum.c b/source/graph_ref/minimum.c index af6a711d..f84c3bf0 100644 --- a/source/graph_ref/minimum.c +++ b/source/graph_ref/minimum.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_minimum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_MINIMUM, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_MINIMUM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/mod.c b/source/graph_ref/mod.c index f7f26d3b..ff711397 100644 --- a/source/graph_ref/mod.c +++ b/source/graph_ref/mod.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_mod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_ADD, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_ADD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/mul.c b/source/graph_ref/mul.c index 7ea7a30b..cbbf7012 100644 --- a/source/graph_ref/mul.c +++ b/source/graph_ref/mul.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_mul(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_MUL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_MUL, params); return CSINN_TRUE; } diff --git a/source/graph_ref/ndarray_size.c b/source/graph_ref/ndarray_size.c index 4fde9d24..cace82f1 100644 --- a/source/graph_ref/ndarray_size.c +++ b/source/graph_ref/ndarray_size.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_ndarray_size(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_gref_ndarray_size(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_NDARRAY_SIZE, params); + shl_gref_siso_op(input, output, CSINN_OP_NDARRAY_SIZE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/negative.c b/source/graph_ref/negative.c index a2280dad..06600c53 100644 --- a/source/graph_ref/negative.c +++ b/source/graph_ref/negative.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_negative(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_negative(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_NEGATIIVE, params); + shl_gref_siso_op(input, output, CSINN_OP_NEGATIIVE, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/non_max_suppression.c b/source/graph_ref/non_max_suppression.c index d6a4bbbc..73d3f80e 100644 --- a/source/graph_ref/non_max_suppression.c +++ b/source/graph_ref/non_max_suppression.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_non_max_suppression(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params) +int shl_gref_non_max_suppression(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_NON_MAX_SUPPRESSION, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_NON_MAX_SUPPRESSION, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/not.c b/source/graph_ref/not.c index 39441206..c9fb1666 100644 --- a/source/graph_ref/not.c +++ b/source/graph_ref/not.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_NOT, params); + shl_gref_siso_op(input, output, CSINN_OP_NOT, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/not_equal.c b/source/graph_ref/not_equal.c index aec880be..87a5bd2e 100644 --- a/source/graph_ref/not_equal.c +++ b/source/graph_ref/not_equal.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_not_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_not_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_NOT_EQUAL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_NOT_EQUAL, params); return CSINN_TRUE; } diff --git a/source/graph_ref/or.c b/source/graph_ref/or.c index 556e8e11..163ebb50 100644 --- a/source/graph_ref/or.c +++ b/source/graph_ref/or.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_OR, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_OR, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/pad.c b/source/graph_ref/pad.c index a026b757..304ebd6d 100644 --- a/source/graph_ref/pad.c +++ b/source/graph_ref/pad.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_pad(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int shl_gref_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_PAD, params); + shl_gref_siso_op(input, output, CSINN_OP_PAD, params); return CSINN_TRUE; } diff --git a/source/graph_ref/power.c b/source/graph_ref/power.c index 652fb26e..9894cd7d 100644 --- a/source/graph_ref/power.c +++ b/source/graph_ref/power.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_power(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_power(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_POWER, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_POWER, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/prelu.c b/source/graph_ref/prelu.c index b0f4cf4e..d5825053 100644 --- a/source/graph_ref/prelu.c +++ b/source/graph_ref/prelu.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_prelu(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct prelu_params *params) +int shl_gref_prelu(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_PRELU, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_PRELU, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/prod.c b/source/graph_ref/prod.c index c19a5eb3..a3c31725 100644 --- a/source/graph_ref/prod.c +++ b/source/graph_ref/prod.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_prod(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_PROD, params); + shl_gref_siso_op(input, output, CSINN_OP_PROD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/proposal.c b/source/graph_ref/proposal.c index a3e496df..60a3b358 100644 --- a/source/graph_ref/proposal.c +++ b/source/graph_ref/proposal.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_proposal(struct csi_tensor *cls_prob, - struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, - struct csi_tensor *output, - struct proposal_params *params) +int shl_gref_proposal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { - csi_debug_error("csi_gref_proposal unsupport\n"); + shl_debug_error("shl_gref_proposal unsupport\n"); return CSINN_FALSE; } diff --git a/source/graph_ref/psroipooling.c b/source/graph_ref/psroipooling.c index 6b043c4b..9a444585 100644 --- a/source/graph_ref/psroipooling.c +++ b/source/graph_ref/psroipooling.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_psroipooling(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params) +int shl_gref_psroipooling(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params) { - csi_debug_error("csi_gref_psroipooling unsupport\n"); + shl_debug_error("shl_gref_psroipooling unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_logsumexp.c b/source/graph_ref/reduce_logsumexp.c index db47597f..d1e81dca 100644 --- a/source/graph_ref/reduce_logsumexp.c +++ b/source/graph_ref/reduce_logsumexp.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_logsumexp(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_logsumexp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_LOGSUMEXP, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_LOGSUMEXP, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_max.c b/source/graph_ref/reduce_max.c index 3e018c99..a403e17a 100644 --- a/source/graph_ref/reduce_max.c +++ b/source/graph_ref/reduce_max.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_max(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_MAX, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_MAX, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_mean.c b/source/graph_ref/reduce_mean.c index 55b63a31..1fa30d77 100644 --- a/source/graph_ref/reduce_mean.c +++ b/source/graph_ref/reduce_mean.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_mean(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_MEAN, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_MEAN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_min.c b/source/graph_ref/reduce_min.c index 952cd293..8dea218b 100644 --- a/source/graph_ref/reduce_min.c +++ b/source/graph_ref/reduce_min.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_MIN, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_MIN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_prod.c b/source/graph_ref/reduce_prod.c index 7c91c5c8..b5345075 100644 --- a/source/graph_ref/reduce_prod.c +++ b/source/graph_ref/reduce_prod.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_prod(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_PROD, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_PROD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_sum.c b/source/graph_ref/reduce_sum.c index 13e00e65..d82c7e7c 100644 --- a/source/graph_ref/reduce_sum.c +++ b/source/graph_ref/reduce_sum.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_sum(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_SUM, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_SUM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/relu.c b/source/graph_ref/relu.c index 3bf0a5a6..1d49216c 100644 --- a/source/graph_ref/relu.c +++ b/source/graph_ref/relu.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RELU, params); + shl_gref_siso_op(input, output, CSINN_OP_RELU, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/relu1.c b/source/graph_ref/relu1.c index 809716cf..d379aa69 100644 --- a/source/graph_ref/relu1.c +++ b/source/graph_ref/relu1.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_relu1(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RELU1, params); + shl_gref_siso_op(input, output, CSINN_OP_RELU1, params); return CSINN_TRUE; } diff --git a/source/graph_ref/relu6.c b/source/graph_ref/relu6.c index 1b830603..2b52e014 100644 --- a/source/graph_ref/relu6.c +++ b/source/graph_ref/relu6.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RELU6, params); + shl_gref_siso_op(input, output, CSINN_OP_RELU6, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/relun.c b/source/graph_ref/relun.c index 69ca4b79..02dd26f3 100644 --- a/source/graph_ref/relun.c +++ b/source/graph_ref/relun.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_relun(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_relun(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RELUN, params); + shl_gref_siso_op(input, output, CSINN_OP_RELUN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/reorg.c b/source/graph_ref/reorg.c index b945fd71..8d2800a3 100644 --- a/source/graph_ref/reorg.c +++ b/source/graph_ref/reorg.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reorg(struct csi_tensor *input, - struct csi_tensor *output, - struct reorg_params *params) +int shl_gref_reorg(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REORG, params); + shl_gref_siso_op(input, output, CSINN_OP_REORG, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reshape.c b/source/graph_ref/reshape.c index 54106616..fdfc970d 100644 --- a/source/graph_ref/reshape.c +++ b/source/graph_ref/reshape.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reshape(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int shl_gref_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RESHAPE, params); + shl_gref_siso_op(input, output, CSINN_OP_RESHAPE, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/resize.c b/source/graph_ref/resize.c index 7b8fa1d1..e717d888 100644 --- a/source/graph_ref/resize.c +++ b/source/graph_ref/resize.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_resize(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params) +int shl_gref_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RESIZE, params); + shl_gref_siso_op(input, output, CSINN_OP_RESIZE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reverse.c b/source/graph_ref/reverse.c index 52f18f9d..ad61496a 100644 --- a/source/graph_ref/reverse.c +++ b/source/graph_ref/reverse.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reverse(struct csi_tensor *input, - struct csi_tensor *output, - struct reverse_params *params) +int shl_gref_reverse(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REVERSE, params); + shl_gref_siso_op(input, output, CSINN_OP_REVERSE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/roialign.c b/source/graph_ref/roialign.c index e9e26127..0adf6977 100644 --- a/source/graph_ref/roialign.c +++ b/source/graph_ref/roialign.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_roi_align(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_align_params *params) +int shl_gref_roi_align(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params) { - csi_debug_error("csi_gref_roi_align unsupport\n"); + shl_debug_error("shl_gref_roi_align unsupport\n"); return CSINN_FALSE; } diff --git a/source/graph_ref/roipool.c b/source/graph_ref/roipool.c index cbdae26b..67ff8aae 100644 --- a/source/graph_ref/roipool.c +++ b/source/graph_ref/roipool.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_roipool(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params) +int shl_gref_roipool(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params) { - csi_debug_error("csi_gref_roipool unsupport\n"); + shl_debug_error("shl_gref_roipool unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/round.c b/source/graph_ref/round.c index 63e4da48..fbe466b7 100644 --- a/source/graph_ref/round.c +++ b/source/graph_ref/round.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_round(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_round(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ROUND, params); + shl_gref_siso_op(input, output, CSINN_OP_ROUND, params); return CSINN_TRUE; } diff --git a/source/graph_ref/rsqrt.c b/source/graph_ref/rsqrt.c index 74f1ce8a..1d0ae937 100644 --- a/source/graph_ref/rsqrt.c +++ b/source/graph_ref/rsqrt.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_rsqrt(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_rsqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RSQRT, params); + shl_gref_siso_op(input, output, CSINN_OP_RSQRT, params); return CSINN_TRUE; } diff --git a/source/graph_ref/scatter.c b/source/graph_ref/scatter.c index 5fc59448..8b568cd5 100644 --- a/source/graph_ref/scatter.c +++ b/source/graph_ref/scatter.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_scatter_nd(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *updates, - struct csi_tensor *output, - struct scatter_nd_params *params) +int shl_gref_scatter_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { - csi_debug_error("csi_gref_scatter_nd unsupport\n"); + shl_debug_error("shl_gref_scatter_nd unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/segment_max.c b/source/graph_ref/segment_max.c index 891b5e51..e5e0d281 100644 --- a/source/graph_ref/segment_max.c +++ b/source/graph_ref/segment_max.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_max(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_max(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MAX, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MAX, params); return CSINN_TRUE; } diff --git a/source/graph_ref/segment_mean.c b/source/graph_ref/segment_mean.c index 70d9304b..e1642466 100644 --- a/source/graph_ref/segment_mean.c +++ b/source/graph_ref/segment_mean.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_mean(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_mean(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MEAN, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MEAN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/segment_min.c b/source/graph_ref/segment_min.c index 788207c8..a10b20f1 100644 --- a/source/graph_ref/segment_min.c +++ b/source/graph_ref/segment_min.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_min(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_min(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MIN, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MIN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/segment_prod.c b/source/graph_ref/segment_prod.c index d57f0277..79ad2bfe 100644 --- a/source/graph_ref/segment_prod.c +++ b/source/graph_ref/segment_prod.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_prod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_prod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_PROD, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_PROD, params); return CSINN_TRUE; } diff --git a/source/graph_ref/segment_sum.c b/source/graph_ref/segment_sum.c index 2e94c56c..d0bb76bb 100644 --- a/source/graph_ref/segment_sum.c +++ b/source/graph_ref/segment_sum.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_sum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_sum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_SUM, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_SUM, params); return CSINN_TRUE; } diff --git a/source/graph_ref/select.c b/source/graph_ref/select.c index 18651d96..57e6add7 100644 --- a/source/graph_ref/select.c +++ b/source/graph_ref/select.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_select(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params) +int shl_gref_select(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { - csi_debug_error("csi_gref_select unsupport\n"); + shl_debug_error("shl_gref_select unsupport\n"); return CSINN_FALSE; } diff --git a/source/graph_ref/sequence_mask.c b/source/graph_ref/sequence_mask.c index ba30de23..d72c7a47 100644 --- a/source/graph_ref/sequence_mask.c +++ b/source/graph_ref/sequence_mask.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sequence_mask(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct sequence_mask_params *params) +int shl_gref_sequence_mask(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEQUENCE_MASK, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEQUENCE_MASK, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/setup.c b/source/graph_ref/setup.c index 59a3e654..a51e02ae 100644 --- a/source/graph_ref/setup.c +++ b/source/graph_ref/setup.c @@ -16,71 +16,70 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" -#include "csi_utils.h" +#include "shl_gref.h" -void csi_gref_set_output_number(int number, struct csi_session *sess) +void shl_gref_set_output_number(int number, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); graph->output_num = number; - graph->output = csi_mem_alloc(sizeof(struct csi_node *) * number); + graph->output = shl_mem_alloc(sizeof(struct shl_node *) * number); } -void csi_gref_set_input_number(int number, struct csi_session *sess) +void shl_gref_set_input_number(int number, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); graph->input_num = number; - graph->input = csi_mem_alloc(sizeof(struct csi_node *) * number); + graph->input = shl_mem_alloc(sizeof(struct shl_node *) * number); } -int csi_gref_get_output(int index, struct csi_tensor *output, struct csi_session *sess) +int shl_gref_get_output(int index, struct csinn_tensor *output, struct csinn_session 
*sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - csi_tensor_copy(output, graph->output[index]->data); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + csinn_tensor_copy(output, graph->output[index]->data); return CSINN_TRUE; } -int csi_gref_get_input(int index, struct csi_tensor *input, struct csi_session *sess) +int shl_gref_get_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - csi_tensor_copy(input, graph->input[index]->data); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + csinn_tensor_copy(input, graph->input[index]->data); return CSINN_TRUE; } -void csi_gref_update_input(int index, struct csi_tensor *input, struct csi_session *sess) +void shl_gref_update_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - struct csi_tensor *t = graph->input[index]->data; + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + struct csinn_tensor *t = graph->input[index]->data; t->data = input->data; } -void csi_gref_update_output(int index, struct csi_tensor *output, struct csi_session *sess) +void shl_gref_update_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - struct csi_tensor *t = graph->output[index]->data; + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + struct csinn_tensor *t = graph->output[index]->data; t->data = output->data; } -void csi_gref_session_init(struct csi_session *sess) +void shl_gref_session_init(struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_mem_alloc(sizeof(struct csi_ref_graph)); - struct csi_gref_target_data *target_data = csi_mem_alloc(sizeof(struct csi_gref_target_data)); + struct shl_ref_graph *graph = shl_mem_alloc(sizeof(struct shl_ref_graph)); + struct shl_gref_target_data *target_data = shl_mem_alloc(sizeof(struct 
shl_gref_target_data)); target_data->graph = graph; sess->td = target_data; sess->base_layout = CSINN_LAYOUT_NCHW; } -static int call_layer_func(void *fn, struct csi_node *node) +static int call_layer_func(void *fn, struct shl_node *node) { /* base has same address with params */ - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; int (*func)(); func = fn; int ret = CSINN_TRUE; - struct csi_tensor **inputs; - struct csi_tensor **outputs; + struct csinn_tensor **inputs; + struct csinn_tensor **outputs; switch (node->type) { case CSINN_OP_ABS: @@ -106,6 +105,7 @@ static int call_layer_func(void *fn, struct csi_node *node) case CSINN_OP_CROP: case CSINN_OP_CUMPROD: case CSINN_OP_CUMSUM: + case CSINN_OP_DATA_CONVERT: case CSINN_OP_DEPTH_TO_SPACE: case CSINN_OP_ELU: case CSINN_OP_ERF: @@ -260,75 +260,75 @@ static int call_layer_func(void *fn, struct csi_node *node) node->in[4]->data, node->out[0]->data, params); break; case CSINN_OP_CONCAT: - inputs = csi_mem_alloc(sizeof(struct csi_tensor *) * - ((struct concat_params *)params)->inputs_count); - for (int i = 0; i < ((struct concat_params *)params)->inputs_count; i++) { + inputs = shl_mem_alloc(sizeof(struct csinn_tensor *) * + ((struct csinn_concat_params *)params)->inputs_count); + for (int i = 0; i < ((struct csinn_concat_params *)params)->inputs_count; i++) { inputs[i] = node->in[i]->data; } ret = func(inputs, node->out[0]->data, params); - csi_mem_free(inputs); + shl_mem_free(inputs); break; case CSINN_OP_SPLIT: - outputs = csi_mem_alloc(sizeof(struct csi_tensor *) * - ((struct split_params *)params)->output_num); - for (int i = 0; i < ((struct split_params *)params)->output_num; i++) { + outputs = shl_mem_alloc(sizeof(struct csinn_tensor *) * + ((struct csinn_split_params *)params)->output_num); + for (int i = 0; i < ((struct csinn_split_params *)params)->output_num; i++) { outputs[i] = node->out[i]->data; } ret = func(node->in[0]->data, outputs, params); - 
csi_mem_free(outputs); + shl_mem_free(outputs); break; case CSINN_OP_ALL: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ALL\n")); + shl_debug_error("unsupported CSINN_OP_ALL\n"); break; case CSINN_OP_ARANGE: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ARANGE\n")); + shl_debug_error("unsupported CSINN_OP_ARANGE\n"); break; case CSINN_OP_BN: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_BN\n")); + shl_debug_error("unsupported CSINN_OP_BN\n"); break; case CSINN_OP_MIN_STRIDE: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_MIN_STRIDE\n")); + shl_debug_error("unsupported CSINN_OP_MIN_STRIDE\n"); break; case CSINN_OP_ONE_HOT: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ONE_HOT\n")); + shl_debug_error("unsupported CSINN_OP_ONE_HOT\n"); break; case CSINN_OP_PROPOSAL: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_PROPOSAL\n")); + shl_debug_error("unsupported CSINN_OP_PROPOSAL\n"); break; case CSINN_OP_PSROIPOOLING: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_PSROIPOOLING\n")); + shl_debug_error("unsupported CSINN_OP_PSROIPOOLING\n"); break; case CSINN_OP_ROIALIGN: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ROIALIGN\n")); + shl_debug_error("unsupported CSINN_OP_ROIALIGN\n"); break; case CSINN_OP_ROIPOOL: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ROIPOOL\n")); + shl_debug_error("unsupported CSINN_OP_ROIPOOL\n"); break; case CSINN_OP_SCATTER_ND: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_SCATTER_ND\n")); + shl_debug_error("unsupported CSINN_OP_SCATTER_ND\n"); break; case CSINN_OP_SELECT: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_SELECT\n")); + shl_debug_error("unsupported CSINN_OP_SELECT\n"); break; case CSINN_OP_TOPK: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_TOPK\n")); + shl_debug_error("unsupported CSINN_OP_TOPK\n"); break; case CSINN_OP_WHERE: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_WHERE\n")); + shl_debug_error("unsupported CSINN_OP_WHERE\n"); break; default: - CSI_DEBUG_CALL(printf("unknown op\n")); + shl_debug_error("unknown op\n"); return 
CSINN_FALSE; } return ret; } -void csi_gref_reset_graph_visit(struct csi_ref_graph *graph) +void shl_gref_reset_graph_visit(struct shl_ref_graph *graph) { for (int i = 0; i < graph->layer_index; i++) { if (graph->layer[i]->type == CSINN_SUBGRAPH) { graph->layer[i]->visited = 0; - struct csi_ref_graph *s_subgraph = graph->layer[i]->data; + struct shl_ref_graph *s_subgraph = graph->layer[i]->data; for (int j = 0; j < s_subgraph->layer_index; j++) { s_subgraph->layer[j]->visited = 0; } @@ -341,126 +341,123 @@ void csi_gref_reset_graph_visit(struct csi_ref_graph *graph) /* * transform graph as gloal graph and sub graph */ -static struct csi_ref_graph *transform_graph(struct csi_ref_graph *ograph) +static struct shl_ref_graph *transform_graph(struct shl_ref_graph *ograph) { - struct csi_ref_graph *ggraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *ggraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); ggraph->input = ograph->input; ggraph->output = ograph->output; ggraph->input_num = ograph->input_num; ggraph->output_num = ograph->output_num; for (int i = 0; i < ograph->layer_index; i++) { - struct csi_node *n = ograph->layer[i]; - struct csi_params_base *params = n->data; + struct shl_node *n = ograph->layer[i]; + struct csinn_params_base *params = n->data; if (params->sess->base_api != params->api) { - csi_subgraph_alloc(n, ograph, ggraph); + shl_subgraph_alloc(n, ograph, ggraph); } else { - csi_gref_graph_insert(n, ggraph); + shl_gref_graph_insert(n, ggraph); } } return ggraph; } -static int init_op(struct csi_node *node) +static int init_op(struct shl_node *node) { /* base has same address with params */ - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; int (*func)(); - struct csi_tensor *input = node->in[0]->data; + struct csinn_tensor *input = node->in[0]->data; - func = csi_init_map(params->api, node->type, input->dtype); - if (func != NULL) { - if (call_layer_func(func, node) == CSINN_TRUE) 
{ - return CSINN_TRUE; - } else { - func = NULL; + int org_rm = params->sess->base_run_mode; + params->sess->base_run_mode = CSINN_RM_LAYER; + shl_op_callback_map(params, node->type, input->dtype); + struct csinn_callback *cb = params->cb; + if (cb->init != NULL) { + if (call_layer_func(cb->init, node) != CSINN_TRUE) { + return CSINN_FALSE; } } + params->sess->base_run_mode = org_rm; - if (func == NULL) { - params->bc = csi_bc_map(params->api, CSINN_RM_LAYER, node->type, params->sess->base_dtype); - return CSINN_TRUE; - } - - return CSINN_FALSE; + return CSINN_TRUE; } -void csi_subgraph_fvisit_create(struct csi_ref_graph *graph, struct csi_node *node) +void shl_subgraph_fvisit_create(struct shl_ref_graph *graph, struct shl_node *node) { - csi_gref_graph_insert(node, graph); + shl_gref_graph_insert(node, graph); } /* * transform graph as gloal graph and sub graph */ -static struct csi_ref_graph *convert_graph(struct csi_ref_graph *ograph) +static struct shl_ref_graph *convert_graph(struct shl_ref_graph *ograph) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_INFO) { - csi_debug_info("\nOriginal graph:\n"); - csi_gref_post_dfs(ograph, csi_subgraph_fvisit_print); - csi_gref_reset_graph_visit(ograph); + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_INFO) { + shl_debug_info("\nOriginal graph:\n"); + shl_gref_post_dfs(ograph, shl_subgraph_fvisit_print); + shl_gref_reset_graph_visit(ograph); } - struct csi_ref_graph *subgraph = csi_subgraph_generate(ograph); - csi_gref_reset_graph_visit(subgraph); + struct shl_ref_graph *subgraph = shl_subgraph_generate(ograph); + shl_gref_reset_graph_visit(subgraph); - csi_debug_info("\nGenerated subgraph:\n"); + shl_debug_info("\nGenerated subgraph:\n"); for (int i = 0; i < subgraph->layer_index; i++) { if (subgraph->layer[i]->type == CSINN_SUBGRAPH) { - struct csi_ref_graph *s_subgraph = subgraph->layer[i]->data; + struct shl_ref_graph *s_subgraph = subgraph->layer[i]->data; if (s_subgraph->layer_size == 0) continue; - 
csi_gref_update_input_output(subgraph, i); - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_INFO) { - csi_debug_info("---- subgraph_%d: ----\n", i); - csi_gref_reset_graph_visit(s_subgraph); - csi_gref_post_dfs(s_subgraph, csi_subgraph_fvisit_print); - csi_gref_reset_graph_visit(s_subgraph); - csi_debug_info("----subgraph_%d end.----\n", i); + shl_gref_update_input_output(subgraph, i); + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_INFO) { + shl_debug_info("---- subgraph_%d: ----\n", i); + shl_gref_reset_graph_visit(s_subgraph); + shl_gref_post_dfs(s_subgraph, shl_subgraph_fvisit_print); + shl_gref_reset_graph_visit(s_subgraph); + shl_debug_info("----subgraph_%d end.----\n", i); } - struct csi_ref_graph *new_sgraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *new_sgraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); new_sgraph->input = s_subgraph->input; new_sgraph->output = s_subgraph->output; new_sgraph->input_num = s_subgraph->input_num; new_sgraph->output_num = s_subgraph->output_num; - csi_gref_post_dfs(new_sgraph, csi_subgraph_fvisit_create); + shl_gref_post_dfs(new_sgraph, shl_subgraph_fvisit_create); subgraph->layer[i]->data = new_sgraph; - csi_gref_reset_graph_visit(s_subgraph); + shl_gref_reset_graph_visit(s_subgraph); } else { - csi_debug_info("%s\n", subgraph->layer[i]->name); + shl_debug_info("%s\n", subgraph->layer[i]->name); } } - csi_gref_reset_graph_visit(subgraph); - struct csi_ref_graph *ggraph = csi_subgraph_rebuild(subgraph); + shl_gref_reset_graph_visit(subgraph); + struct shl_ref_graph *ggraph = shl_subgraph_rebuild(subgraph); - struct csi_ref_graph *sorted_graph = csi_subgraph_topology_sort(ggraph); - csi_debug_info("\nsorted subgraph:\n"); + struct shl_ref_graph *sorted_graph = shl_subgraph_topology_sort(ggraph); + shl_debug_info("\nsorted subgraph:\n"); for (int i = 0; i < sorted_graph->layer_index; i++) { if (sorted_graph->layer[i]->type == CSINN_SUBGRAPH) { - struct csi_ref_graph *s_subgraph = 
sorted_graph->layer[i]->data; + struct shl_ref_graph *s_subgraph = sorted_graph->layer[i]->data; if (s_subgraph->layer_size == 0) continue; - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_INFO) { - csi_debug_info("---- subgraph_%d: ----\n", i); - csi_gref_reset_graph_visit(s_subgraph); - csi_gref_post_dfs(s_subgraph, csi_subgraph_fvisit_print); - csi_gref_reset_graph_visit(s_subgraph); - csi_debug_info("----subgraph_%d end.----\n", i); + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_INFO) { + shl_debug_info("---- subgraph_%d: ----\n", i); + shl_gref_reset_graph_visit(s_subgraph); + shl_gref_post_dfs(s_subgraph, shl_subgraph_fvisit_print); + shl_gref_reset_graph_visit(s_subgraph); + shl_debug_info("----subgraph_%d end.----\n", i); } - csi_gref_reset_graph_visit(s_subgraph); + shl_gref_reset_graph_visit(s_subgraph); } else { - csi_debug_info("%s\n", sorted_graph->layer[i]->name); + shl_debug_info("%s\n", sorted_graph->layer[i]->name); } } return sorted_graph; } -void csi_gref_session_setup(struct csi_session *sess) +void shl_gref_session_setup(struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - struct csi_node *n; + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + struct shl_node *n; for (int i = 0; i < graph->layer_index; i++) { n = graph->layer[i]; @@ -478,27 +475,27 @@ void csi_gref_session_setup(struct csi_session *sess) graph->output[i]->ref_count_init++; } - struct csi_ref_graph *ggraph = convert_graph(graph); + struct shl_ref_graph *ggraph = convert_graph(graph); for (int i = 0; i < ggraph->layer_index; i++) { - struct csi_node *n = ggraph->layer[i]; + struct shl_node *n = ggraph->layer[i]; if (n->type == CSINN_SUBGRAPH) { - csi_subgraph_init(n); - } else if (n->type >= 0 && n->type < CSINN_SESSION_INIT) { + shl_subgraph_setup(n); + } else if (n->type >= 0 && n->type < CSINN_OP_SIZE) { init_op(n); } else { - csi_debug_error("Unknown layer\n"); + shl_debug_error("Unknown layer\n"); return; } } - struct 
csi_gref_target_data *td = sess->td; + struct shl_gref_target_data *td = sess->td; td->graph = ggraph; } -static void node_ref_reset(struct csi_session *sess) +static void node_ref_reset(struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - struct csi_node *n; + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + struct shl_node *n; for (int i = 0; i < graph->layer_index; i++) { n = graph->layer[i]; @@ -510,23 +507,23 @@ static void node_ref_reset(struct csi_session *sess) } } -static int op_run_init(struct csi_node *node) +static int op_run_init(struct shl_node *node) { for (int i = 0; i < node->out_num; i++) { - struct csi_tensor *t = node->out[i]->data; - t->data = csi_mem_alloc(csi_tensor_byte_size(t)); + struct csinn_tensor *t = node->out[i]->data; + t->data = shl_mem_alloc(csinn_tensor_byte_size(t)); } return CSINN_TRUE; } -static int op_run_deinit(struct csi_node *node) +static int op_run_deinit(struct shl_node *node) { for (int i = 0; i < node->in_num; i++) { if (node->in[i]->ref_count > 0) { node->in[i]->ref_count--; if (node->in[i]->ref_count == 0) { - struct csi_tensor *t = node->in[i]->data; - csi_mem_free(t->data); + struct csinn_tensor *t = node->in[i]->data; + shl_mem_free(t->data); } } } @@ -536,34 +533,34 @@ static int op_run_deinit(struct csi_node *node) return CSINN_TRUE; } -static int op_run(struct csi_node *node) +static int op_run(struct shl_node *node) { /* base has same address with params */ - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; int (*func)(); - - func = params->bc; + struct csinn_callback *cb = params->cb; + func = cb->exec; return call_layer_func(func, node); } -int csi_gref_session_run(struct csi_session *sess) +int shl_gref_session_run(struct csinn_session *sess) { - struct csi_ref_graph *g = csi_gref_get_graph(sess); + struct shl_ref_graph *g = shl_gref_get_graph(sess); uint64_t time_acc = 0; node_ref_reset(sess); for (int i = 0; i < 
g->layer_index; i++) { - struct csi_node *n = g->layer[i]; + struct shl_node *n = g->layer[i]; if (n->type == CSINN_SUBGRAPH) { - csi_subgraph_run_init(n); - csi_subgraph_run(n); - csi_subgraph_run_deinit(n); - } else if (n->type >= 0 && n->type < CSINN_SESSION_INIT) { + shl_subgraph_run_init(n); + shl_subgraph_run(n); + shl_subgraph_run_deinit(n); + } else if (n->type >= 0 && n->type < CSINN_OP_SIZE) { op_run_init(n); -#ifdef CSINN_LAYER_BENCHMARK - uint64_t start_time = csi_get_timespec(); +#ifdef SHL_LAYER_BENCHMARK + uint64_t start_time = shl_get_timespec(); op_run(n); - uint64_t end_time = csi_get_timespec(); - csi_benchmark_layer(n, start_time, end_time, i); + uint64_t end_time = shl_get_timespec(); + shl_benchmark_layer(n, start_time, end_time, i); time_acc += end_time - start_time; #else op_run(n); @@ -573,62 +570,62 @@ int csi_gref_session_run(struct csi_session *sess) return CSINN_FALSE; } } -#ifdef CSINN_LAYER_BENCHMARK - csi_debug_info("[layer-benchmark]: network exec time = %f\n", time_acc / 1000000.0f); +#ifdef SHL_LAYER_BENCHMARK + shl_debug_info("[layer-benchmark]: network exec time = %f\n", time_acc / 1000000.0f); #endif return CSINN_TRUE; } -void csi_gref_set_tensor(struct csi_tensor *input, struct csi_session *sess) +void shl_gref_set_tensor(struct csinn_tensor *input, struct csinn_session *sess) { - struct csi_node *in = csi_node_var_alloc(input->name, input); + struct shl_node *in = shl_node_var_alloc(input->name, input); input->data = in; } -void csi_gref_set_input(int index, struct csi_tensor *input, struct csi_session *sess) +void shl_gref_set_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); graph->input[index] = input->data; } -void csi_gref_set_output(int index, struct csi_tensor *output, struct csi_session *sess) +void shl_gref_set_output(int index, struct csinn_tensor *output, struct csinn_session 
*sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); /* FIXME: const output's data is real value, not node */ if (output->is_const) { - struct csi_node *const_output_node = csi_node_const_var_alloc(output->name, output); + struct shl_node *const_output_node = shl_node_const_var_alloc(output->name, output); graph->output[index] = const_output_node; } else { graph->output[index] = output->data; } } -void csi_gref_session_deinit(struct csi_session *sess) +void shl_gref_session_deinit(struct csinn_session *sess) { - struct csi_ref_graph *g = csi_gref_get_graph(sess); + struct shl_ref_graph *g = shl_gref_get_graph(sess); for (int i = 0; i < g->layer_index; i++) { - struct csi_node *n = g->layer[i]; + struct shl_node *n = g->layer[i]; if (n->type == CSINN_SUBGRAPH) { - csi_subgraph_deinit(n); + shl_subgraph_deinit(n); } } - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - csi_mem_free(graph->input); - csi_mem_free(graph->output); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + shl_mem_free(graph->input); + shl_mem_free(graph->output); } -struct csi_ref_graph *csi_gref_get_graph(struct csi_session *sess) +struct shl_ref_graph *shl_gref_get_graph(struct csinn_session *sess) { - struct csi_gref_target_data *td = sess->td; + struct shl_gref_target_data *td = sess->td; return td->graph; } -int csi_gref_is_root_node(struct csi_ref_graph *graph, struct csi_node *node) +int shl_gref_is_root_node(struct shl_ref_graph *graph, struct shl_node *node) { int is_root = 1; for (int i = 0; i < node->in_num; i++) { - struct csi_tensor *in_tensor = node->in[i]->data; + struct csinn_tensor *in_tensor = node->in[i]->data; if (in_tensor->is_const) continue; int find_res = 0; for (int j = 0; j < graph->input_num; j++) { @@ -645,25 +642,25 @@ int csi_gref_is_root_node(struct csi_ref_graph *graph, struct csi_node *node) return is_root; } -void csi_gref_post_dfs(struct csi_ref_graph *graph, - void 
(*fvisit)(struct csi_ref_graph *, struct csi_node *)) +void shl_gref_post_dfs(struct shl_ref_graph *graph, + void (*fvisit)(struct shl_ref_graph *, struct shl_node *)) { int stack_size = 32; - struct csi_node **node_stack = csi_mem_alloc(sizeof(struct csi_node *) * stack_size); - int *input_idx_stack = csi_mem_alloc(sizeof(int) * stack_size); + struct shl_node **node_stack = shl_mem_alloc(sizeof(struct shl_node *) * stack_size); + int *input_idx_stack = shl_mem_alloc(sizeof(int) * stack_size); int stack_top = -1; - struct csi_node *curr_node; + struct shl_node *curr_node; for (int i = 0; i < graph->output_num; i++) { - struct csi_tensor *ot = graph->output[i]->data; + struct csinn_tensor *ot = graph->output[i]->data; if (ot->is_const) continue; curr_node = graph->output[i]->in[0]; if (curr_node->visited == 0) { ++stack_top; if (stack_top >= stack_size) { stack_size += 32; - node_stack = csi_mem_realloc(node_stack, sizeof(struct csi_node *) * stack_size); - input_idx_stack = csi_mem_realloc(input_idx_stack, sizeof(int) * stack_size); + node_stack = shl_mem_realloc(node_stack, sizeof(struct shl_node *) * stack_size); + input_idx_stack = shl_mem_realloc(input_idx_stack, sizeof(int) * stack_size); } node_stack[stack_top] = curr_node; input_idx_stack[stack_top] = 0; @@ -671,12 +668,12 @@ void csi_gref_post_dfs(struct csi_ref_graph *graph, } while (stack_top != -1) { curr_node = node_stack[stack_top]; - if (input_idx_stack[stack_top] == csi_node_get_non_const_in_number(curr_node)) { + if (input_idx_stack[stack_top] == shl_node_get_non_const_in_number(curr_node)) { fvisit(graph, curr_node); --stack_top; } else { - struct csi_node *next_node = NULL; - if (csi_node_find(graph->input, graph->input_num, + struct shl_node *next_node = NULL; + if (shl_node_find(graph->input, graph->input_num, curr_node->in[input_idx_stack[stack_top]]) == -1) { next_node = curr_node->in[input_idx_stack[stack_top]]->in[0]; if (next_node && next_node->type == CSINN_SUBGRAPH_RETURN) { @@ -689,9 
+686,9 @@ void csi_gref_post_dfs(struct csi_ref_graph *graph, if (stack_top >= stack_size) { stack_size += 32; node_stack = - csi_mem_realloc(node_stack, sizeof(struct csi_node *) * stack_size); + shl_mem_realloc(node_stack, sizeof(struct shl_node *) * stack_size); input_idx_stack = - csi_mem_realloc(input_idx_stack, sizeof(int) * stack_size); + shl_mem_realloc(input_idx_stack, sizeof(int) * stack_size); } node_stack[stack_top] = next_node; input_idx_stack[stack_top] = 0; @@ -701,54 +698,54 @@ void csi_gref_post_dfs(struct csi_ref_graph *graph, } } - csi_mem_free(node_stack); - csi_mem_free(input_idx_stack); + shl_mem_free(node_stack); + shl_mem_free(input_idx_stack); } -void csi_gref_update_input_output(struct csi_ref_graph *ograph, int index) +void shl_gref_update_input_output(struct shl_ref_graph *ograph, int index) { if (ograph->layer[index]->type != CSINN_SUBGRAPH) { return; } - struct csi_ref_graph *graph = ograph->layer[index]->data; + struct shl_ref_graph *graph = ograph->layer[index]->data; if (graph->layer_size == 0) return; /* update inputs */ graph->input = NULL; graph->input_num = 0; - struct csi_node **tensor_node_set = NULL; + struct shl_node **tensor_node_set = NULL; int set_num = 0; for (int i = 0; i < graph->layer_index; i++) { - for (int j = 0; j < csi_node_get_non_const_in_number(graph->layer[i]); j++) { - struct csi_node *in_tensor_node = graph->layer[i]->in[j]; - if (csi_node_find(graph->layer, graph->layer_index, in_tensor_node->in[0]) == -1 && - csi_node_find(tensor_node_set, set_num, in_tensor_node) == -1) { - graph->input = csi_mem_realloc(graph->input, - sizeof(struct csi_node *) * (graph->input_num + 1)); + for (int j = 0; j < shl_node_get_non_const_in_number(graph->layer[i]); j++) { + struct shl_node *in_tensor_node = graph->layer[i]->in[j]; + if (shl_node_find(graph->layer, graph->layer_index, in_tensor_node->in[0]) == -1 && + shl_node_find(tensor_node_set, set_num, in_tensor_node) == -1) { + graph->input = 
shl_mem_realloc(graph->input, + sizeof(struct shl_node *) * (graph->input_num + 1)); graph->input[graph->input_num] = in_tensor_node; graph->input_num++; // tensor_node_set[set_num] = in_tensor_node; tensor_node_set = - csi_mem_realloc(tensor_node_set, sizeof(struct csi_node *) * (set_num + 1)); + shl_mem_realloc(tensor_node_set, sizeof(struct shl_node *) * (set_num + 1)); tensor_node_set[set_num] = in_tensor_node; set_num++; } } } - csi_mem_free(tensor_node_set); + shl_mem_free(tensor_node_set); /* update outputs */ graph->output = NULL; graph->output_num = 0; for (int i = 0; i < graph->layer_index; i++) { for (int j = 0; j < graph->layer[i]->out_num; j++) { - struct csi_node *out_tensor_node = graph->layer[i]->out[j]; + struct shl_node *out_tensor_node = graph->layer[i]->out[j]; int find_res_inside = 0; for (int k = 0; k < graph->layer_index; k++) { if (k == i) continue; - if (csi_node_find(graph->layer[k]->in, graph->layer[k]->in_num, out_tensor_node) > + if (shl_node_find(graph->layer[k]->in, graph->layer[k]->in_num, out_tensor_node) > -1) { find_res_inside = 1; break; @@ -759,17 +756,17 @@ void csi_gref_update_input_output(struct csi_ref_graph *ograph, int index) for (int s_idx = 0; s_idx < ograph->layer_index; s_idx++) { if (s_idx == index) continue; if (ograph->layer[s_idx]->type != CSINN_SUBGRAPH) { - if (csi_node_find(ograph->layer[s_idx]->in, ograph->layer[s_idx]->in_num, + if (shl_node_find(ograph->layer[s_idx]->in, ograph->layer[s_idx]->in_num, out_tensor_node) > -1) { find_res_outside = 1; break; } } else { - struct csi_ref_graph *outside_sgraph = ograph->layer[s_idx]->data; + struct shl_ref_graph *outside_sgraph = ograph->layer[s_idx]->data; if (outside_sgraph->layer_size == 0) continue; for (int inner_idx = 0; inner_idx < outside_sgraph->layer_index; inner_idx++) { - if (csi_node_find(outside_sgraph->layer[inner_idx]->in, + if (shl_node_find(outside_sgraph->layer[inner_idx]->in, outside_sgraph->layer[inner_idx]->in_num, out_tensor_node) > -1) { 
find_res_outside = 1; @@ -783,8 +780,8 @@ void csi_gref_update_input_output(struct csi_ref_graph *ograph, int index) } if (!find_res_inside || find_res_outside) { - graph->output = csi_mem_realloc( - graph->output, sizeof(struct csi_node *) * (graph->output_num + 1)); + graph->output = shl_mem_realloc( + graph->output, sizeof(struct shl_node *) * (graph->output_num + 1)); graph->output[graph->output_num] = out_tensor_node; graph->output_num++; } @@ -792,204 +789,238 @@ void csi_gref_update_input_output(struct csi_ref_graph *ograph, int index) } } -static void *setup_bc_map() +static void *setup_cb_map() { - static void *bc_map[CSINN_OP_AND_UTILS_SIZE]; - - bc_map[CSINN_OP_ABS] = csi_gref_abs; - bc_map[CSINN_OP_ACOS] = csi_gref_acos; - bc_map[CSINN_OP_ACOSH] = csi_gref_acosh; - bc_map[CSINN_OP_ADD] = csi_gref_add; - bc_map[CSINN_OP_ALL] = csi_gref_all; - bc_map[CSINN_OP_AND] = csi_gref_and; - bc_map[CSINN_OP_ANY] = csi_gref_any; - bc_map[CSINN_OP_ARANGE] = csi_gref_arange; - bc_map[CSINN_OP_ARGMAX] = csi_gref_argmax; - bc_map[CSINN_OP_ARGMIN] = csi_gref_argmin; - bc_map[CSINN_OP_ASIN] = csi_gref_asin; - bc_map[CSINN_OP_ASINH] = csi_gref_asinh; - bc_map[CSINN_OP_ATAN] = csi_gref_atan; - bc_map[CSINN_OP_ATANH] = csi_gref_atanh; - bc_map[CSINN_OP_AVGPOOL2D] = csi_gref_avgpool2d; - bc_map[CSINN_OP_AVGPOOL3D] = csi_gref_avgpool3d; - bc_map[CSINN_OP_BN] = csi_gref_batch_normalization; - bc_map[CSINN_OP_BATCH_TO_SPACE] = csi_gref_batch_to_space; - bc_map[CSINN_OP_BATCH_TO_SPACE_ND] = csi_gref_batch_to_space_nd; - bc_map[CSINN_OP_BROADCOST] = csi_gref_broadcast_to; - bc_map[CSINN_OP_CACHE_MATMUL] = csi_gref_cache_matmul; - bc_map[CSINN_OP_CACHE_CONV1D] = csi_gref_cache_conv1d; - bc_map[CSINN_OP_CEIL] = csi_gref_ceil; - bc_map[CSINN_OP_CLIP] = csi_gref_clip; - bc_map[CSINN_OP_COL2IM] = csi_gref_col2im; - bc_map[CSINN_OP_CONCAT] = csi_gref_concat; - bc_map[CSINN_OP_CONV1D] = csi_gref_conv1d; - bc_map[CSINN_OP_CONV2D] = csi_gref_conv2d; - bc_map[CSINN_OP_CONV2D_RELU] = 
csi_gref_conv2d_relu; - bc_map[CSINN_OP_CONV2D_RELU6] = csi_gref_conv2d_relu6; - bc_map[CSINN_OP_DEPTHWISE_CONV2D] = csi_gref_depthwise_conv2d; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU] = csi_gref_depthwise_conv2d_relu; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6] = csi_gref_depthwise_conv2d_relu6; - bc_map[CSINN_OP_GROUP_CONV2D] = csi_gref_group_conv2d; - bc_map[CSINN_OP_CONV3D] = csi_gref_conv3d; - bc_map[CSINN_OP_DECONV2D] = csi_gref_deconv2d; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D] = csi_gref_depthwise_deconv2d; - bc_map[CSINN_OP_DECONV3D] = csi_gref_deconv3d; - bc_map[CSINN_OP_COS] = csi_gref_cos; - bc_map[CSINN_OP_COSH] = csi_gref_cosh; - bc_map[CSINN_OP_CUMPROD] = csi_gref_cumprod; - bc_map[CSINN_OP_CUMSUM] = csi_gref_cumsum; - bc_map[CSINN_OP_DEPTH_TO_SPACE] = csi_gref_depth_to_space; - bc_map[CSINN_OP_DIV] = csi_gref_div; - bc_map[CSINN_OP_ELU] = csi_gref_elu; - bc_map[CSINN_OP_EQUANL] = csi_gref_equal; - bc_map[CSINN_OP_ERF] = csi_gref_erf; - bc_map[CSINN_OP_EXP] = csi_gref_exp; - bc_map[CSINN_OP_EXPAND_DIMS] = csi_gref_expand_dims; - bc_map[CSINN_OP_EXPM1] = csi_gref_expm1; - bc_map[CSINN_OP_FLATTEN] = csi_gref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE] = csi_gref_floor_divide; - bc_map[CSINN_OP_FLOOR_MOD] = csi_gref_floor_mod; - bc_map[CSINN_OP_FLOOR] = csi_gref_floor; - bc_map[CSINN_OP_FSMN] = csi_gref_fsmn; - bc_map[CSINN_OP_FULLYCONNECTED] = csi_gref_fullyconnected; - bc_map[CSINN_OP_GATHER_ND] = csi_gref_gather_nd; - bc_map[CSINN_OP_GATHER] = csi_gref_gather; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D] = csi_gref_global_avgpool2d; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D] = csi_gref_global_maxpool2d; - bc_map[CSINN_OP_GREATHER_EQUAL] = csi_gref_greater_equal; - bc_map[CSINN_OP_GREATHER] = csi_gref_greater; - bc_map[CSINN_OP_HARD_SIGMOID] = csi_gref_hard_sigmoid; - bc_map[CSINN_OP_IM2COL] = csi_gref_im2col; - bc_map[CSINN_OP_ISNAN] = csi_gref_isnan_bool; - bc_map[CSINN_OP_LAYER_NORM] = csi_gref_layer_norm; - bc_map[CSINN_OP_L2N] = csi_gref_l2_normalization; - 
bc_map[CSINN_OP_L2POOL2D] = csi_gref_l2pool; - bc_map[CSINN_OP_LEAKY_RELU] = csi_gref_leaky_relu; - bc_map[CSINN_OP_LESS_EQUAL] = csi_gref_less_equal; - bc_map[CSINN_OP_LESS] = csi_gref_less; - bc_map[CSINN_OP_LOG_SOFTMAX] = csi_gref_log_softmax; - bc_map[CSINN_OP_LOG] = csi_gref_log; - bc_map[CSINN_OP_LOG1P] = csi_gref_log1p; - bc_map[CSINN_OP_LOGICAL_AND] = csi_gref_logical_and; - bc_map[CSINN_OP_LOGICAL_NOT] = csi_gref_logical_not; - bc_map[CSINN_OP_LOGICAL_OR] = csi_gref_logical_or; - bc_map[CSINN_OP_LOGICAL_XOR] = csi_gref_logical_xor; - bc_map[CSINN_OP_LRN] = csi_gref_lrn; - bc_map[CSINN_OP_MATMUL] = csi_gref_matmul; - bc_map[CSINN_OP_MAX] = csi_gref_max; - bc_map[CSINN_OP_MAXIMUM] = csi_gref_maximum; - bc_map[CSINN_OP_MAXPOOL2D] = csi_gref_maxpool2d; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT] = csi_gref_maxpool2d_locat; - bc_map[CSINN_OP_MAXPOOL3D] = csi_gref_maxpool3d; - bc_map[CSINN_OP_MEAN] = csi_gref_mean; - bc_map[CSINN_OP_MEAN_STRIDE] = csi_gref_mean; - bc_map[CSINN_OP_MIN] = csi_gref_min; - bc_map[CSINN_OP_MINIMUM] = csi_gref_minimum; - bc_map[CSINN_OP_MOD] = csi_gref_mod; - bc_map[CSINN_OP_MUL] = csi_gref_mul; - bc_map[CSINN_OP_NDARRAY_SIZE] = csi_gref_ndarray_size; - bc_map[CSINN_OP_NEGATIIVE] = csi_gref_negative; - bc_map[CSINN_OP_NON_MAX_SUPPRESSION] = csi_gref_non_max_suppression; - bc_map[CSINN_OP_NOT_EQUAL] = csi_gref_not_equal; - bc_map[CSINN_OP_NOT] = csi_gref_not; - bc_map[CSINN_OP_OR] = csi_gref_or; - bc_map[CSINN_OP_PAD] = csi_gref_pad; - bc_map[CSINN_OP_POWER] = csi_gref_power; - bc_map[CSINN_OP_PRELU] = csi_gref_prelu; - bc_map[CSINN_OP_PROD] = csi_gref_prod; - bc_map[CSINN_OP_PROPOSAL] = csi_gref_proposal; - bc_map[CSINN_OP_PSROIPOOLING] = csi_gref_psroipooling; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP] = csi_gref_reduce_logsumexp; - bc_map[CSINN_OP_REDUCE_MAX] = csi_gref_reduce_max; - bc_map[CSINN_OP_REDUCE_MEAN] = csi_gref_reduce_mean; - bc_map[CSINN_OP_REDUCE_MIN] = csi_gref_reduce_min; - bc_map[CSINN_OP_REDUCE_PROD] = csi_gref_reduce_prod; - 
bc_map[CSINN_OP_REDUCE_SUM] = csi_gref_reduce_sum; - bc_map[CSINN_OP_RELU] = csi_gref_relu; - bc_map[CSINN_OP_RELU1] = csi_gref_relu1; - bc_map[CSINN_OP_RELU6] = csi_gref_relu6; - bc_map[CSINN_OP_RELUN] = csi_gref_relun; - bc_map[CSINN_OP_RESHAPE] = csi_gref_reshape; - bc_map[CSINN_OP_RESIZE] = csi_gref_resize; - bc_map[CSINN_OP_REVERSE] = csi_gref_reverse; - bc_map[CSINN_OP_ROIALIGN] = csi_gref_roi_align; - bc_map[CSINN_OP_ROIPOOL] = csi_gref_roipool; - bc_map[CSINN_OP_ROUND] = csi_gref_round; - bc_map[CSINN_OP_RSQRT] = csi_gref_rsqrt; - bc_map[CSINN_OP_SCATTER_ND] = csi_gref_scatter_nd; - bc_map[CSINN_OP_SEGMENT_MAX] = csi_gref_segment_max; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX] = NULL; - bc_map[CSINN_OP_SEGMENT_MEAN] = csi_gref_segment_mean; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN] = NULL; - bc_map[CSINN_OP_SEGMENT_MIN] = csi_gref_segment_min; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN] = NULL; - bc_map[CSINN_OP_SEGMENT_PROD] = csi_gref_segment_prod; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD] = NULL; - bc_map[CSINN_OP_SEGMENT_SUM] = csi_gref_segment_sum; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM] = NULL; - bc_map[CSINN_OP_SELECT] = csi_gref_select; - bc_map[CSINN_OP_SEQUENCE_MASK] = csi_gref_sequence_mask; - bc_map[CSINN_OP_SHAPE] = csi_gref_shape; - bc_map[CSINN_OP_SHUFFLE_CHANNEL] = csi_gref_shuffle_channel; - bc_map[CSINN_OP_SIGMOID] = csi_gref_sigmoid; - bc_map[CSINN_OP_SIGN] = csi_gref_sign; - bc_map[CSINN_OP_SIN] = csi_gref_sin; - bc_map[CSINN_OP_SINH] = csi_gref_sinh; - bc_map[CSINN_OP_SLICE] = csi_gref_slice; - bc_map[CSINN_OP_SOFTMAX] = csi_gref_softmax; - bc_map[CSINN_OP_SOFTPLUS] = csi_gref_softplus; - bc_map[CSINN_OP_SOFTRELU] = csi_gref_softrelu; - bc_map[CSINN_OP_SOFTSIGN] = csi_gref_softsign; - bc_map[CSINN_OP_SPACE_TO_BATCH] = csi_gref_space_to_batch; - bc_map[CSINN_OP_SPACE_TO_BATCH_ND] = csi_gref_space_to_batch_nd; - bc_map[CSINN_OP_SPACE_TO_DEPTH] = csi_gref_space_to_depth; - bc_map[CSINN_OP_SPLIT] = csi_gref_split; - bc_map[CSINN_OP_SQRT] = 
csi_gref_sqrt; - bc_map[CSINN_OP_SQUARE] = csi_gref_square; - bc_map[CSINN_OP_SQUEEZE] = csi_gref_squeeze; - bc_map[CSINN_OP_STACK] = csi_gref_stack; - bc_map[CSINN_OP_STRIDED_SLICE] = csi_gref_strided_slice; - bc_map[CSINN_OP_SUB] = csi_gref_sub; - bc_map[CSINN_OP_SUM] = csi_gref_sum; - bc_map[CSINN_OP_TAN] = csi_gref_tan; - bc_map[CSINN_OP_TANH] = csi_gref_tanh; - bc_map[CSINN_OP_THRESHOLD_RELU] = csi_gref_threshold_relu; - bc_map[CSINN_OP_TILE] = csi_gref_tile; - bc_map[CSINN_OP_TOPK] = csi_gref_topk; - bc_map[CSINN_OP_TRUNC] = csi_gref_trunc; - bc_map[CSINN_OP_TRANSPOSE] = csi_gref_transpose; - bc_map[CSINN_OP_UNPOOLING] = csi_gref_unpooling; - bc_map[CSINN_OP_UNSTACK] = csi_gref_unstack; - bc_map[CSINN_OP_WHERE] = csi_gref_where; - bc_map[CSINN_OP_XOR] = csi_gref_xor; - bc_map[CSINN_OP_YUV_RGB_SCALE] = csi_gref_yuv_rgb_scale; - - bc_map[CSINN_SESSION_INIT] = csi_gref_session_init; - bc_map[CSINN_SESSION_DEINIT] = csi_gref_session_deinit; - bc_map[CSINN_SESSION_SETUP] = csi_gref_session_setup; - bc_map[CSINN_SESSION_RUN] = csi_gref_session_run; - bc_map[CSINN_UPDATE_INPUT] = csi_gref_update_input; - bc_map[CSINN_UPDATE_OUTPUT] = csi_gref_update_output; - bc_map[CSINN_SET_INPUT_NUMBER] = csi_gref_set_input_number; - bc_map[CSINN_SET_OUTPUT_NUMBER] = csi_gref_set_output_number; - bc_map[CSINN_SET_INPUT] = csi_gref_set_input; - bc_map[CSINN_SET_OUTPUT] = csi_gref_set_output; - bc_map[CSINN_GET_INPUT] = csi_gref_get_input; - bc_map[CSINN_GET_OUTPUT] = csi_gref_get_output; - bc_map[CSINN_TENSOR_ENTRY] = csi_gref_set_tensor; - - return bc_map; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE); + + cb_map[CSINN_OP_ABS].est = shl_gref_abs; + cb_map[CSINN_OP_ACOS].est = shl_gref_acos; + cb_map[CSINN_OP_ACOSH].est = shl_gref_acosh; + cb_map[CSINN_OP_ADD].est = shl_gref_add; + cb_map[CSINN_OP_ALL].est = shl_gref_all; + cb_map[CSINN_OP_AND].est = shl_gref_and; + 
cb_map[CSINN_OP_ANY].est = shl_gref_any; + cb_map[CSINN_OP_ARANGE].est = shl_gref_arange; + cb_map[CSINN_OP_ARGMAX].est = shl_gref_argmax; + cb_map[CSINN_OP_ARGMIN].est = shl_gref_argmin; + cb_map[CSINN_OP_ASIN].est = shl_gref_asin; + cb_map[CSINN_OP_ASINH].est = shl_gref_asinh; + cb_map[CSINN_OP_ATAN].est = shl_gref_atan; + cb_map[CSINN_OP_ATANH].est = shl_gref_atanh; + cb_map[CSINN_OP_AVGPOOL2D].est = shl_gref_avgpool2d; + cb_map[CSINN_OP_AVGPOOL3D].est = shl_gref_avgpool3d; + cb_map[CSINN_OP_BN].est = shl_gref_batch_normalization; + cb_map[CSINN_OP_BATCH_TO_SPACE].est = shl_gref_batch_to_space; + cb_map[CSINN_OP_BATCH_TO_SPACE_ND].est = shl_gref_batch_to_space_nd; + cb_map[CSINN_OP_BROADCOST].est = shl_gref_broadcast_to; + cb_map[CSINN_OP_CACHE_MATMUL].est = shl_gref_cache_matmul; + cb_map[CSINN_OP_CACHE_CONV1D].est = shl_gref_cache_conv1d; + cb_map[CSINN_OP_CEIL].est = shl_gref_ceil; + cb_map[CSINN_OP_CLIP].est = shl_gref_clip; + cb_map[CSINN_OP_COL2IM].est = shl_gref_col2im; + cb_map[CSINN_OP_CONCAT].est = shl_gref_concat; + cb_map[CSINN_OP_CONV1D].est = shl_gref_conv1d; + cb_map[CSINN_OP_CONV2D].est = shl_gref_conv2d; + cb_map[CSINN_OP_CONV2D_RELU].est = shl_gref_conv2d_relu; + cb_map[CSINN_OP_CONV2D_RELU6].est = shl_gref_conv2d_relu6; + cb_map[CSINN_OP_DATA_CONVERT].est = shl_gref_data_convert; + cb_map[CSINN_OP_DEPTHWISE_CONV2D].est = shl_gref_depthwise_conv2d; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU].est = shl_gref_depthwise_conv2d_relu; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6].est = shl_gref_depthwise_conv2d_relu6; + cb_map[CSINN_OP_GROUP_CONV2D].est = shl_gref_group_conv2d; + cb_map[CSINN_OP_CONV3D].est = shl_gref_conv3d; + cb_map[CSINN_OP_DECONV2D].est = shl_gref_deconv2d; + cb_map[CSINN_OP_DEPTHWISE_DECONV2D].est = shl_gref_depthwise_deconv2d; + cb_map[CSINN_OP_DECONV3D].est = shl_gref_deconv3d; + cb_map[CSINN_OP_COS].est = shl_gref_cos; + cb_map[CSINN_OP_COSH].est = shl_gref_cosh; + cb_map[CSINN_OP_CUMPROD].est = shl_gref_cumprod; + 
cb_map[CSINN_OP_CUMSUM].est = shl_gref_cumsum; + cb_map[CSINN_OP_DEPTH_TO_SPACE].est = shl_gref_depth_to_space; + cb_map[CSINN_OP_DIV].est = shl_gref_div; + cb_map[CSINN_OP_ELU].est = shl_gref_elu; + cb_map[CSINN_OP_EQUANL].est = shl_gref_equal; + cb_map[CSINN_OP_ERF].est = shl_gref_erf; + cb_map[CSINN_OP_EXP].est = shl_gref_exp; + cb_map[CSINN_OP_EXPAND_DIMS].est = shl_gref_expand_dims; + cb_map[CSINN_OP_EXPM1].est = shl_gref_expm1; + cb_map[CSINN_OP_FLATTEN].est = shl_gref_flatten; + cb_map[CSINN_OP_FLOOR_DIVIDE].est = shl_gref_floor_divide; + cb_map[CSINN_OP_FLOOR_MOD].est = shl_gref_floor_mod; + cb_map[CSINN_OP_FLOOR].est = shl_gref_floor; + cb_map[CSINN_OP_FSMN].est = shl_gref_fsmn; + cb_map[CSINN_OP_FULLYCONNECTED].est = shl_gref_fullyconnected; + cb_map[CSINN_OP_GATHER_ND].est = shl_gref_gather_nd; + cb_map[CSINN_OP_GATHER].est = shl_gref_gather; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D].est = shl_gref_global_avgpool2d; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D].est = shl_gref_global_maxpool2d; + cb_map[CSINN_OP_GREATHER_EQUAL].est = shl_gref_greater_equal; + cb_map[CSINN_OP_GREATHER].est = shl_gref_greater; + cb_map[CSINN_OP_HARD_SIGMOID].est = shl_gref_hard_sigmoid; + cb_map[CSINN_OP_IM2COL].est = shl_gref_im2col; + cb_map[CSINN_OP_ISNAN].est = shl_gref_isnan_bool; + cb_map[CSINN_OP_LAYER_NORM].est = shl_gref_layer_norm; + cb_map[CSINN_OP_L2N].est = shl_gref_l2_normalization; + cb_map[CSINN_OP_L2POOL2D].est = shl_gref_l2pool; + cb_map[CSINN_OP_LEAKY_RELU].est = shl_gref_leaky_relu; + cb_map[CSINN_OP_LESS_EQUAL].est = shl_gref_less_equal; + cb_map[CSINN_OP_LESS].est = shl_gref_less; + cb_map[CSINN_OP_LOG_SOFTMAX].est = shl_gref_log_softmax; + cb_map[CSINN_OP_LOG].est = shl_gref_log; + cb_map[CSINN_OP_LOG1P].est = shl_gref_log1p; + cb_map[CSINN_OP_LOGICAL_AND].est = shl_gref_logical_and; + cb_map[CSINN_OP_LOGICAL_NOT].est = shl_gref_logical_not; + cb_map[CSINN_OP_LOGICAL_OR].est = shl_gref_logical_or; + cb_map[CSINN_OP_LOGICAL_XOR].est = shl_gref_logical_xor; + 
cb_map[CSINN_OP_LRN].est = shl_gref_lrn; + cb_map[CSINN_OP_MATMUL].est = shl_gref_matmul; + cb_map[CSINN_OP_MAX].est = shl_gref_max; + cb_map[CSINN_OP_MAXIMUM].est = shl_gref_maximum; + cb_map[CSINN_OP_MAXPOOL2D].est = shl_gref_maxpool2d; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT].est = shl_gref_maxpool2d_locat; + cb_map[CSINN_OP_MAXPOOL3D].est = shl_gref_maxpool3d; + cb_map[CSINN_OP_MEAN].est = shl_gref_mean; + cb_map[CSINN_OP_MEAN_STRIDE].est = shl_gref_mean; + cb_map[CSINN_OP_MIN].est = shl_gref_min; + cb_map[CSINN_OP_MINIMUM].est = shl_gref_minimum; + cb_map[CSINN_OP_MOD].est = shl_gref_mod; + cb_map[CSINN_OP_MUL].est = shl_gref_mul; + cb_map[CSINN_OP_NDARRAY_SIZE].est = shl_gref_ndarray_size; + cb_map[CSINN_OP_NEGATIIVE].est = shl_gref_negative; + cb_map[CSINN_OP_NON_MAX_SUPPRESSION].est = shl_gref_non_max_suppression; + cb_map[CSINN_OP_NOT_EQUAL].est = shl_gref_not_equal; + cb_map[CSINN_OP_NOT].est = shl_gref_not; + cb_map[CSINN_OP_OR].est = shl_gref_or; + cb_map[CSINN_OP_PAD].est = shl_gref_pad; + cb_map[CSINN_OP_POWER].est = shl_gref_power; + cb_map[CSINN_OP_PRELU].est = shl_gref_prelu; + cb_map[CSINN_OP_PROD].est = shl_gref_prod; + cb_map[CSINN_OP_PROPOSAL].est = shl_gref_proposal; + cb_map[CSINN_OP_PSROIPOOLING].est = shl_gref_psroipooling; + cb_map[CSINN_OP_REDUCE_LOGSUMEXP].est = shl_gref_reduce_logsumexp; + cb_map[CSINN_OP_REDUCE_MAX].est = shl_gref_reduce_max; + cb_map[CSINN_OP_REDUCE_MEAN].est = shl_gref_reduce_mean; + cb_map[CSINN_OP_REDUCE_MIN].est = shl_gref_reduce_min; + cb_map[CSINN_OP_REDUCE_PROD].est = shl_gref_reduce_prod; + cb_map[CSINN_OP_REDUCE_SUM].est = shl_gref_reduce_sum; + cb_map[CSINN_OP_RELU].est = shl_gref_relu; + cb_map[CSINN_OP_RELU1].est = shl_gref_relu1; + cb_map[CSINN_OP_RELU6].est = shl_gref_relu6; + cb_map[CSINN_OP_RELUN].est = shl_gref_relun; + cb_map[CSINN_OP_RESHAPE].est = shl_gref_reshape; + cb_map[CSINN_OP_RESIZE].est = shl_gref_resize; + cb_map[CSINN_OP_REVERSE].est = shl_gref_reverse; + cb_map[CSINN_OP_ROIALIGN].est = 
shl_gref_roi_align; + cb_map[CSINN_OP_ROIPOOL].est = shl_gref_roipool; + cb_map[CSINN_OP_ROUND].est = shl_gref_round; + cb_map[CSINN_OP_RSQRT].est = shl_gref_rsqrt; + cb_map[CSINN_OP_SCATTER_ND].est = shl_gref_scatter_nd; + cb_map[CSINN_OP_SEGMENT_MAX].est = shl_gref_segment_max; + cb_map[CSINN_OP_SEGMENT_MEAN].est = shl_gref_segment_mean; + cb_map[CSINN_OP_SEGMENT_MIN].est = shl_gref_segment_min; + cb_map[CSINN_OP_SEGMENT_PROD].est = shl_gref_segment_prod; + cb_map[CSINN_OP_SEGMENT_SUM].est = shl_gref_segment_sum; + cb_map[CSINN_OP_SELECT].est = shl_gref_select; + cb_map[CSINN_OP_SEQUENCE_MASK].est = shl_gref_sequence_mask; + cb_map[CSINN_OP_SHAPE].est = shl_gref_shape; + cb_map[CSINN_OP_SHUFFLE_CHANNEL].est = shl_gref_shuffle_channel; + cb_map[CSINN_OP_SIGMOID].est = shl_gref_sigmoid; + cb_map[CSINN_OP_SIGN].est = shl_gref_sign; + cb_map[CSINN_OP_SIN].est = shl_gref_sin; + cb_map[CSINN_OP_SINH].est = shl_gref_sinh; + cb_map[CSINN_OP_SLICE].est = shl_gref_slice; + cb_map[CSINN_OP_SOFTMAX].est = shl_gref_softmax; + cb_map[CSINN_OP_SOFTPLUS].est = shl_gref_softplus; + cb_map[CSINN_OP_SOFTRELU].est = shl_gref_softrelu; + cb_map[CSINN_OP_SOFTSIGN].est = shl_gref_softsign; + cb_map[CSINN_OP_SPACE_TO_BATCH].est = shl_gref_space_to_batch; + cb_map[CSINN_OP_SPACE_TO_BATCH_ND].est = shl_gref_space_to_batch_nd; + cb_map[CSINN_OP_SPACE_TO_DEPTH].est = shl_gref_space_to_depth; + cb_map[CSINN_OP_SPLIT].est = shl_gref_split; + cb_map[CSINN_OP_SQRT].est = shl_gref_sqrt; + cb_map[CSINN_OP_SQUARE].est = shl_gref_square; + cb_map[CSINN_OP_SQUEEZE].est = shl_gref_squeeze; + cb_map[CSINN_OP_STACK].est = shl_gref_stack; + cb_map[CSINN_OP_STRIDED_SLICE].est = shl_gref_strided_slice; + cb_map[CSINN_OP_SUB].est = shl_gref_sub; + cb_map[CSINN_OP_SUM].est = shl_gref_sum; + cb_map[CSINN_OP_TAN].est = shl_gref_tan; + cb_map[CSINN_OP_TANH].est = shl_gref_tanh; + cb_map[CSINN_OP_THRESHOLD_RELU].est = shl_gref_threshold_relu; + cb_map[CSINN_OP_TILE].est = shl_gref_tile; + 
cb_map[CSINN_OP_TOPK].est = shl_gref_topk; + cb_map[CSINN_OP_TRUNC].est = shl_gref_trunc; + cb_map[CSINN_OP_TRANSPOSE].est = shl_gref_transpose; + cb_map[CSINN_OP_UNPOOLING].est = shl_gref_unpooling; + cb_map[CSINN_OP_UNSTACK].est = shl_gref_unstack; + cb_map[CSINN_OP_WHERE].est = shl_gref_where; + cb_map[CSINN_OP_XOR].est = shl_gref_xor; + cb_map[CSINN_OP_YUV_RGB_SCALE].est = shl_gref_yuv_rgb_scale; + + return cb_map; } -static int get_bc_map_index(int op, int dtype) { return op; } +static int get_cb_map_index(int op, int dtype) { return op; } +static struct csinn_callback *__cb_map_table_gref; -void *csi_bc_map_gref(int op, int dtype) +struct csinn_callback *shl_cb_map_gref(int op, int dtype) { - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; + return &__cb_map_table_gref[get_cb_map_index(op, dtype)]; +} + +void *shl_gref_runtime_callback(int api) +{ + switch (api) { + case CSINN_SESSION_INIT: + return shl_gref_session_init; + break; + case CSINN_SESSION_DEINIT: + return shl_gref_session_deinit; + break; + case CSINN_SESSION_SETUP: + return shl_gref_session_setup; + break; + case CSINN_SESSION_RUN: + return shl_gref_session_run; + break; + case CSINN_UPDATE_INPUT: + return shl_gref_update_input; + break; + case CSINN_UPDATE_OUTPUT: + return shl_gref_update_output; + break; + case CSINN_SET_INPUT_NUMBER: + return shl_gref_set_input_number; + break; + case CSINN_SET_OUTPUT_NUMBER: + return shl_gref_set_output_number; + break; + case CSINN_SET_INPUT: + return shl_gref_set_input; + break; + case CSINN_SET_OUTPUT: + return shl_gref_set_output; + break; + case CSINN_GET_INPUT: + return shl_gref_get_input; + break; + case CSINN_GET_OUTPUT: + return shl_gref_get_output; + break; + case CSINN_TENSOR_ENTRY: + return shl_gref_set_tensor; + break; + default: + shl_debug_info("%s: Cannot find callback\n", __func__); + break; } - return bc_map_table[get_bc_map_index(op, dtype)]; + return NULL; +} + 
+void shl_target_init_gref() +{ + __cb_map_table_gref = setup_cb_map(); + shl_register_runtime_callback(CSINN_GREF, shl_gref_runtime_callback); + shl_register_op_callback(CSINN_GREF, shl_cb_map_gref); } diff --git a/source/graph_ref/shape.c b/source/graph_ref/shape.c index b06cb619..84f3257d 100644 --- a/source/graph_ref/shape.c +++ b/source/graph_ref/shape.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_shape(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params) +int shl_gref_shape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SHAPE, params); + shl_gref_siso_op(input, output, CSINN_OP_SHAPE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/shuffle_channel.c b/source/graph_ref/shuffle_channel.c index eb419830..0bed0ce9 100644 --- a/source/graph_ref/shuffle_channel.c +++ b/source/graph_ref/shuffle_channel.c @@ -1,4 +1,4 @@ - /* +/* * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_shuffle_channel(struct csi_tensor *input, - struct csi_tensor *output, - struct shuffle_channel_params *params) +int shl_gref_shuffle_channel(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SHUFFLE_CHANNEL, params); + shl_gref_siso_op(input, output, CSINN_OP_SHUFFLE_CHANNEL, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/sigmoid.c b/source/graph_ref/sigmoid.c index 0758aa11..d7459363 100644 --- a/source/graph_ref/sigmoid.c +++ b/source/graph_ref/sigmoid.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sigmoid(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_gref_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SIGMOID, params); + shl_gref_siso_op(input, output, CSINN_OP_SIGMOID, params); return CSINN_TRUE; } diff --git a/source/graph_ref/sign.c b/source/graph_ref/sign.c index 75bd150e..cb21b727 100644 --- a/source/graph_ref/sign.c +++ b/source/graph_ref/sign.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sign(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_sign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SIGN, params); + shl_gref_siso_op(input, output, CSINN_OP_SIGN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/sin.c b/source/graph_ref/sin.c index 8ac236aa..67e15ec6 100644 --- a/source/graph_ref/sin.c +++ b/source/graph_ref/sin.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_sin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SIN, params); + shl_gref_siso_op(input, output, CSINN_OP_SIN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/sinh.c b/source/graph_ref/sinh.c index dc3fbf0f..59c2153c 100644 --- a/source/graph_ref/sinh.c +++ b/source/graph_ref/sinh.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_sinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SINH, params); + shl_gref_siso_op(input, output, CSINN_OP_SINH, params); return CSINN_TRUE; } diff --git a/source/graph_ref/slice.c b/source/graph_ref/slice.c index 252f0834..cbebe99d 100644 --- a/source/graph_ref/slice.c +++ b/source/graph_ref/slice.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_slice(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params) +int shl_gref_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SLICE, params); + shl_gref_siso_op(input, output, CSINN_OP_SLICE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/softmax.c b/source/graph_ref/softmax.c index 1ab06362..423d850e 100644 --- a/source/graph_ref/softmax.c +++ b/source/graph_ref/softmax.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_softmax(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_gref_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SOFTMAX, params); + shl_gref_siso_op(input, output, CSINN_OP_SOFTMAX, params); return CSINN_TRUE; } diff --git a/source/graph_ref/softplus.c b/source/graph_ref/softplus.c index b4ec8933..cf0b4993 100644 --- a/source/graph_ref/softplus.c +++ b/source/graph_ref/softplus.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_softplus(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_softplus(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SOFTPLUS, params); + shl_gref_siso_op(input, output, CSINN_OP_SOFTPLUS, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/softrelu.c b/source/graph_ref/softrelu.c index 0a9972e6..3a8182b2 100644 --- a/source/graph_ref/softrelu.c +++ b/source/graph_ref/softrelu.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_softrelu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_softrelu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SOFTRELU, params); + shl_gref_siso_op(input, output, CSINN_OP_SOFTRELU, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/softsign.c b/source/graph_ref/softsign.c index 023ad975..0f8a8ee2 100644 --- a/source/graph_ref/softsign.c +++ b/source/graph_ref/softsign.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_softsign(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_softsign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SOFTSIGN, params); + shl_gref_siso_op(input, output, CSINN_OP_SOFTSIGN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/space_to_batch.c b/source/graph_ref/space_to_batch.c index 3d6a7679..a6da5ef7 100644 --- a/source/graph_ref/space_to_batch.c +++ b/source/graph_ref/space_to_batch.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_space_to_batch(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params) +int shl_gref_space_to_batch(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SPACE_TO_BATCH, params); + shl_gref_siso_op(input, output, CSINN_OP_SPACE_TO_BATCH, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/space_to_batch_nd.c b/source/graph_ref/space_to_batch_nd.c index 7cdf00aa..92cbad09 100644 --- a/source/graph_ref/space_to_batch_nd.c +++ b/source/graph_ref/space_to_batch_nd.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_space_to_batch_nd(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_nd_params *params) +int shl_gref_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SPACE_TO_BATCH_ND, params); + shl_gref_siso_op(input, output, CSINN_OP_SPACE_TO_BATCH_ND, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/space_to_depth.c b/source/graph_ref/space_to_depth.c index d4da69da..495cb9e5 100644 --- a/source/graph_ref/space_to_depth.c +++ b/source/graph_ref/space_to_depth.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_space_to_depth(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params) +int shl_gref_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SPACE_TO_DEPTH, params); + shl_gref_siso_op(input, output, CSINN_OP_SPACE_TO_DEPTH, params); return CSINN_TRUE; } diff --git a/source/graph_ref/split.c b/source/graph_ref/split.c index 2c675495..4eecdf3c 100644 --- a/source/graph_ref/split.c +++ b/source/graph_ref/split.c @@ -16,25 +16,25 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_split(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int shl_gref_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { - struct csi_node *layer = csi_node_alloc(CSINN_OP_SPLIT, params->base.name, 1, params->output_num, params); + struct shl_node *layer = + shl_node_alloc(CSINN_OP_SPLIT, params->base.name, 1, params->output_num, params); - struct csi_node *in_tensor = (struct csi_node *)(input->data); - csi_node_add_in(layer, in_tensor, 0); + struct shl_node *in_tensor = (struct shl_node *)(input->data); + shl_node_add_in(layer, in_tensor, 0); - for (int i = 0; i< params->output_num; i++){ - struct csi_node *out = csi_node_var_alloc(output[i]->name, output[i]); - csi_node_add_out(layer, out, i); + for (int i = 0; i < params->output_num; i++) { + struct shl_node *out = shl_node_var_alloc(output[i]->name, output[i]); + shl_node_add_out(layer, out, i); output[i]->data = out; } - struct csi_ref_graph *graph = csi_gref_get_graph(input->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = 
shl_gref_get_graph(input->sess); + shl_gref_graph_insert(layer, graph); return CSINN_FALSE; } diff --git a/source/graph_ref/sqrt.c b/source/graph_ref/sqrt.c index 649941d3..d4b791b4 100644 --- a/source/graph_ref/sqrt.c +++ b/source/graph_ref/sqrt.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sqrt(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_sqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SQRT, params); + shl_gref_siso_op(input, output, CSINN_OP_SQRT, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/square.c b/source/graph_ref/square.c index d68bacc8..d3e4928f 100644 --- a/source/graph_ref/square.c +++ b/source/graph_ref/square.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_square(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_square(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SQUARE, params); + shl_gref_siso_op(input, output, CSINN_OP_SQUARE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/squeeze.c b/source/graph_ref/squeeze.c index 8d4dbe2f..1b682641 100644 --- a/source/graph_ref/squeeze.c +++ b/source/graph_ref/squeeze.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_squeeze(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params *params) +int shl_gref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SQUEEZE, params); + shl_gref_siso_op(input, output, CSINN_OP_SQUEEZE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/stack.c b/source/graph_ref/stack.c index e0185f4b..3d9814e9 100644 --- a/source/graph_ref/stack.c +++ b/source/graph_ref/stack.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_stack(struct csi_tensor **input, - struct csi_tensor *output, - struct stack_params *params) +int shl_gref_stack(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { - csi_debug_error("csi_gref_stack unsupport\n"); + shl_debug_error("shl_gref_stack unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/strided_slice.c b/source/graph_ref/strided_slice.c index e29f899b..48eee8b0 100644 --- a/source/graph_ref/strided_slice.c +++ b/source/graph_ref/strided_slice.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_strided_slice(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) +int shl_gref_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_STRIDED_SLICE, params); + shl_gref_siso_op(input, output, CSINN_OP_STRIDED_SLICE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/sub.c b/source/graph_ref/sub.c index 33e7f6f7..cda43bc5 100644 --- a/source/graph_ref/sub.c +++ b/source/graph_ref/sub.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sub(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SUB, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SUB, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/subgraph.c b/source/graph_ref/subgraph.c index f721fc77..694decad 100644 --- a/source/graph_ref/subgraph.c +++ b/source/graph_ref/subgraph.c @@ -16,38 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" -#include "csi_utils.h" +#include "shl_gref.h" -void csi_subgraph_alloc(struct csi_node *node, struct csi_ref_graph *ograph, - struct csi_ref_graph *ggraph) +void shl_subgraph_alloc(struct shl_node *node, struct shl_ref_graph *ograph, + struct shl_ref_graph *ggraph) { int node_input_num = 0; for (int i = 0; i < node->in_num; i++) { - struct csi_tensor *node_in = node->in[i]->data; + struct csinn_tensor *node_in = node->in[i]->data; if (!node_in->is_const) { node_input_num++; } } - struct csi_ref_graph *sgraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *sgraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); sgraph->input_num = node_input_num; sgraph->output_num = node->out_num; - sgraph->input = csi_mem_alloc(sgraph->input_num * sizeof(struct csi_node *)); - sgraph->output = csi_mem_alloc(sgraph->output_num * sizeof(struct csi_node *)); - csi_gref_graph_insert(node, sgraph); + sgraph->input = shl_mem_alloc(sgraph->input_num * sizeof(struct shl_node *)); + sgraph->output = shl_mem_alloc(sgraph->output_num * sizeof(struct shl_node *)); + shl_gref_graph_insert(node, sgraph); - struct csi_node *sg_in = - csi_node_alloc(CSINN_SUBGRAPH, "graph_in", node_input_num, node_input_num, sgraph); - csi_gref_graph_insert(sg_in, ggraph); + struct shl_node *sg_in = + shl_node_alloc(CSINN_SUBGRAPH, "graph_in", node_input_num, node_input_num, sgraph); + shl_gref_graph_insert(sg_in, ggraph); sg_in->subgraph_idx = ggraph->layer_index - 1; node->subgraph_idx = ggraph->layer_index - 1; for (int i = 0; i < node_input_num; i++) { sg_in->in[i] = node->in[i]; - struct csi_tensor *sg_in_tensor = csi_alloc_tensor(NULL); - csi_tensor_copy(sg_in_tensor, node->in[i]->data); - struct csi_node *sg_in_node = csi_node_var_alloc("graph_in_tensor", sg_in_tensor); + struct csinn_tensor *sg_in_tensor = csinn_alloc_tensor(NULL); + csinn_tensor_copy(sg_in_tensor, node->in[i]->data); + struct 
shl_node *sg_in_node = shl_node_var_alloc("graph_in_tensor", sg_in_tensor); sg_in_node->subgraph_idx = ggraph->layer_index - 1; node->in[i] = sg_in_node; sg_in_node->out[0] = node; @@ -58,16 +57,16 @@ void csi_subgraph_alloc(struct csi_node *node, struct csi_ref_graph *ograph, // sgraph->input[0] = node->in[0]; // sgraph->output[0] = node->out[0]; - struct csi_node *sg_out = csi_node_alloc(CSINN_SUBGRAPH_RETURN, "graph_out", node->out_num, + struct shl_node *sg_out = shl_node_alloc(CSINN_SUBGRAPH_RETURN, "graph_out", node->out_num, node->out_num, ggraph->layer[ggraph->layer_index]); - csi_gref_graph_insert(sg_out, sgraph); + shl_gref_graph_insert(sg_out, sgraph); sg_out->subgraph_idx = ggraph->layer_index - 1; for (int i = 0; i < node->out_num; i++) { sg_out->out[i] = node->out[i]; node->out[i]->in[0] = sg_out; - struct csi_tensor *sg_out_tensor = csi_alloc_tensor(NULL); - csi_tensor_copy(sg_out_tensor, node->out[i]->data); - struct csi_node *sg_out_node = csi_node_var_alloc("graph_out_tensor", sg_out_tensor); + struct csinn_tensor *sg_out_tensor = csinn_alloc_tensor(NULL); + csinn_tensor_copy(sg_out_tensor, node->out[i]->data); + struct shl_node *sg_out_node = shl_node_var_alloc("graph_out_tensor", sg_out_tensor); sg_out_node->subgraph_idx = ggraph->layer_index - 1; node->out[i] = sg_out_node; sg_out_node->in[0] = node; @@ -77,62 +76,81 @@ void csi_subgraph_alloc(struct csi_node *node, struct csi_ref_graph *ograph, } } -static void set_sub_session(struct csi_session *sub_sess, struct csi_params_base *params, - struct csi_ref_graph *graph) +static void set_sub_session(struct csinn_session *sub_sess, struct csinn_params_base *params, + struct shl_ref_graph *graph) { - struct csi_session *base_sess = params->sess; + struct csinn_session *base_sess = params->sess; sub_sess->base_api = params->api; if (params->api == CSINN_LIGHT) { sub_sess->base_dtype = base_sess->base_dtype; sub_sess->debug_level = base_sess->debug_level; sub_sess->base_run_mode = CSINN_RM_NPU_GRAPH; 
- sub_sess->base_quant_type = base_sess->base_quant_type; + if (params->quant_type != CSINN_QUANT_UNSET) { + sub_sess->base_quant_type = params->quant_type; + } else { + sub_sess->base_quant_type = base_sess->base_quant_type; + } + + if (params->quant_type == CSINN_QUANT_INT16_SYM) { + sub_sess->base_dtype = CSINN_DTYPE_INT16; + } else if (params->quant_type == CSINN_QUANT_INT8_ASYM || + params->quant_type == CSINN_QUANT_INT8_SYM) { + sub_sess->base_dtype = CSINN_DTYPE_INT8; + } else if (params->quant_type == CSINN_QUANT_UINT8_ASYM || + params->quant_type == CSINN_QUANT_UINT8_SYM) { + sub_sess->base_dtype = CSINN_DTYPE_UINT8; + } else if (params->quant_type == CSINN_QUANT_INT4_SYM) { + sub_sess->base_dtype = CSINN_DTYPE_INT4; + } } else if (params->api = CSINN_ASP) { sub_sess->base_dtype = base_sess->base_dtype; sub_sess->debug_level = base_sess->debug_level; sub_sess->base_quant_type = base_sess->base_quant_type; - sub_sess->td = csi_mem_alloc(sizeof(struct csi_gref_target_data)); + sub_sess->td = shl_mem_alloc(sizeof(struct shl_gref_target_data)); /* ASP: reuse gref graph */ - struct csi_gref_target_data *td = sub_sess->td; + struct shl_gref_target_data *td = sub_sess->td; td->graph = graph; } else { - csi_debug_error("sub session api unsupport\n"); + shl_debug_error("sub session api unsupport\n"); } } -int csi_subgraph_init(struct csi_node *n) +int shl_subgraph_setup(struct shl_node *n) { - struct csi_ref_graph *sgraph = n->data; - struct csi_node *init_node = sgraph->layer[0]; - struct csi_params_base *init_params = init_node->data; - struct csi_session *sub_sess = csi_alloc_session(); + struct shl_ref_graph *sgraph = n->data; + struct shl_node *init_node = sgraph->layer[0]; + struct csinn_params_base *init_params = init_node->data; + struct csinn_session *sub_sess = csinn_alloc_session(); set_sub_session(sub_sess, init_params, sgraph); - csi_session_init(sub_sess); + csinn_session_init(sub_sess); - csi_set_input_number(sgraph->input_num, sub_sess); - 
csi_set_output_number(sgraph->output_num, sub_sess); + csinn_set_input_number(sgraph->input_num, sub_sess); + csinn_set_output_number(sgraph->output_num, sub_sess); /* set input tensor */ for (int i = 0; i < sgraph->input_num; i++) { - struct csi_tensor *input_t; + struct csinn_tensor *input_t; input_t = sgraph->input[i]->data; input_t->sess = sub_sess; - csi_set_tensor_entry(input_t, sub_sess); - csi_set_input(i, input_t, sub_sess); + csinn_set_tensor_entry(input_t, sub_sess); + csinn_set_input(i, input_t, sub_sess); } int ret = CSINN_TRUE; for (int idx = 0; idx < sgraph->layer_index; idx++) { - struct csi_node *node = sgraph->layer[idx]; + struct shl_node *node = sgraph->layer[idx]; if (node->type == CSINN_SUBGRAPH_RETURN) continue; - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; params->sess = sub_sess; int (*func)(); - struct csi_tensor *input0, *output, *kernel, *bias; + struct csinn_tensor *input0, *output, *kernel, *bias; input0 = node->in[0]->data; input0->sess = sub_sess; - func = csi_bc_map(params->api, CSINN_RM_LAYER, node->type, input0->dtype); + + shl_op_callback_map(params, node->type, input0->dtype); + struct csinn_callback *cb = params->cb; + func = cb->est; switch (node->type) { case CSINN_OP_ABS: @@ -243,7 +261,7 @@ int csi_subgraph_init(struct csi_node *n) case CSINN_OP_MUL: { output = node->out[0]->data; output->sess = sub_sess; - struct csi_tensor *rhs = node->in[1]->data; + struct csinn_tensor *rhs = node->in[1]->data; rhs->sess = sub_sess; ret = func(input0, rhs, output, params); break; @@ -279,8 +297,8 @@ int csi_subgraph_init(struct csi_node *n) ret = func(input0, output, kernel, bias, params); break; case CSINN_OP_SPLIT: { - struct csi_tensor **split_output = - csi_mem_alloc(sizeof(struct csi_tensor *) * node->out_num); + struct csinn_tensor **split_output = + shl_mem_alloc(sizeof(struct csinn_tensor *) * node->out_num); for (int i = 0; i < node->out_num; i++) { split_output[i] = 
node->out[i]->data; split_output[i]->sess = sub_sess; @@ -289,8 +307,8 @@ int csi_subgraph_init(struct csi_node *n) break; } case CSINN_OP_CONCAT: { - struct csi_tensor **concat_input = - csi_mem_alloc(sizeof(struct csi_tensor *) * node->in_num); + struct csinn_tensor **concat_input = + shl_mem_alloc(sizeof(struct csinn_tensor *) * node->in_num); for (int i = 0; i < node->in_num; i++) { concat_input[i] = node->in[i]->data; concat_input[i]->sess = sub_sess; @@ -301,7 +319,7 @@ int csi_subgraph_init(struct csi_node *n) break; } default: - CSI_DEBUG_CALL(printf("unknown op1\n")); + shl_debug_error("%s unknown op\n", __func__); return CSINN_FALSE; } } @@ -312,79 +330,106 @@ int csi_subgraph_init(struct csi_node *n) break; } } - struct csi_node *return_node = sgraph->layer[i]; + struct shl_node *return_node = sgraph->layer[i]; for (int i = 0; i < return_node->in_num; i++) { - struct csi_tensor *output_t; + struct csinn_tensor *output_t; output_t = return_node->in[i]->data; output_t->sess = sub_sess; - csi_set_output(i, output_t, sub_sess); + csinn_set_output(i, output_t, sub_sess); } - csi_session_setup(sub_sess); + csinn_session_setup(sub_sess); return ret; } -int csi_subgraph_deinit(struct csi_node *n) +int shl_subgraph_deinit(struct shl_node *n) { - struct csi_ref_graph *sgraph = n->data; - struct csi_node *node = sgraph->layer[0]; - struct csi_params_base *params = node->data; - csi_session_deinit(params->sess); + struct shl_ref_graph *sgraph = n->data; + struct shl_node *node = sgraph->layer[0]; + struct csinn_params_base *params = node->data; + csinn_session_deinit(params->sess); return 0; } -static int csi_subgraph_entry(struct csi_node *n) +static int shl_subgraph_entry(struct shl_node *n) { - struct csi_ref_graph *sgraph = n->data; + struct shl_ref_graph *sgraph = n->data; for (int i = 0; i < n->in_num; i++) { - struct csi_tensor *tsrc = n->in[i]->data; - struct csi_tensor *tdst = sgraph->input[i]->data; + struct csinn_tensor *tsrc = n->in[i]->data; + struct 
csinn_tensor *tdst = sgraph->input[i]->data; + + if (tdst->sess->base_api == CSINN_LIGHT && + (tdst->sess->base_quant_type == CSINN_QUANT_INT16_SYM || + tdst->sess->base_quant_type == CSINN_QUANT_INT8_SYM)) { + struct csinn_tensor *tdst_cp = csinn_alloc_tensor(NULL); + csinn_tensor_copy(tdst_cp, tdst); + tdst_cp->data = shl_mem_alloc(csinn_tensor_byte_size(tdst_cp)); + csinn_tensor_data_convert(tdst_cp, tsrc); + + tdst->data = tdst_cp->data; + } else { + tdst->data = tsrc->data; + } // if (tdst->data == NULL) { - tdst->data = tsrc->data; + // tdst->data = tsrc->data; // } else if (tdst->data != tsrc->data) { - // memcpy(tdst->data, tsrc->data, csi_tensor_byte_size(tsrc)); + // memcpy(tdst->data, tsrc->data, csinn_tensor_byte_size(tsrc)); // } } for (int i = 0; i < sgraph->output_num; i++) { - struct csi_tensor *out = sgraph->output[i]->data; + struct csinn_tensor *out = sgraph->output[i]->data; out->data = NULL; } return CSINN_TRUE; } -static int csi_subgraph_return(struct csi_ref_graph *graph, struct csi_node *ret_node) +static int shl_subgraph_return(struct shl_ref_graph *graph, struct shl_node *ret_node) { for (int i = 0; i < graph->output_num; i++) { - struct csi_tensor *tsrc = ret_node->in[i]->data; - struct csi_tensor *tdst = graph->output[i]->data; + struct csinn_tensor *tsrc = ret_node->in[i]->data; + struct csinn_tensor *tdst = graph->output[i]->data; + + if (tsrc->sess->base_api == CSINN_LIGHT && + (tsrc->sess->base_quant_type == CSINN_QUANT_INT16_SYM || + tsrc->sess->base_quant_type == CSINN_QUANT_INT8_SYM)) { + struct csinn_tensor *tdst_cp = csinn_alloc_tensor(NULL); + csinn_tensor_copy(tdst_cp, tdst); + tdst_cp->data = shl_mem_alloc(csinn_tensor_byte_size(tdst_cp)); + csinn_tensor_data_convert(tdst_cp, tsrc); + + tdst->data = tdst_cp->data; + } else { + tdst->data = tsrc->data; + } + // if (tdst->data == NULL) { - tdst->data = tsrc->data; + // tdst->data = tsrc->data; // } else if (tdst->data != tsrc->data) { - // memcpy(tdst->data, tsrc->data, 
csi_tensor_byte_size(tsrc)); + // memcpy(tdst->data, tsrc->data, csinn_tensor_byte_size(tsrc)); // } } return CSINN_TRUE; } -int csi_subgraph_run_init(struct csi_node *n) { csi_subgraph_entry(n); } +int shl_subgraph_run_init(struct shl_node *n) { shl_subgraph_entry(n); } -int csi_subgraph_run_deinit(struct csi_node *n) {} +int shl_subgraph_run_deinit(struct shl_node *n) {} -int csi_subgraph_run(struct csi_node *n) +int shl_subgraph_run(struct shl_node *n) { - struct csi_ref_graph *sgraph = n->data; - struct csi_node *node = sgraph->layer[0]; - struct csi_params_base *params = node->data; + struct shl_ref_graph *sgraph = n->data; + struct shl_node *node = sgraph->layer[0]; + struct csinn_params_base *params = node->data; int ret = CSINN_TRUE; - struct csi_tensor **inputs; - struct csi_tensor **outputs; + struct csinn_tensor **inputs; + struct csinn_tensor **outputs; for (int i = 0; i < sgraph->input_num; i++) { - csi_update_input(i, sgraph->input[i]->data, params->sess); + csinn_update_input(i, sgraph->input[i]->data, params->sess); } - csi_session_run(params->sess); + csinn_session_run(params->sess); int i; for (i = 0; i < sgraph->layer_index; i++) { @@ -392,102 +437,166 @@ int csi_subgraph_run(struct csi_node *n) break; } } - struct csi_node *return_node = sgraph->layer[i]; + struct shl_node *return_node = sgraph->layer[i]; for (int i = 0; i < return_node->in_num; i++) { - csi_get_output(i, return_node->in[i]->data, params->sess); + csinn_get_output(i, return_node->in[i]->data, params->sess); } /* CSINN_SUBGRAPH_RETURN */ - csi_subgraph_return(sgraph, return_node); + shl_subgraph_return(sgraph, return_node); return ret; } -struct csi_node *csi_gref_get_input_subgraph(struct csi_ref_graph *graph, struct csi_node *node, +struct shl_node *shl_gref_get_input_subgraph(struct shl_ref_graph *graph, struct shl_node *node, int index) { - struct csi_node *next_node = node->in[index]->in[0]; + struct shl_node *next_node = node->in[index]->in[0]; if (next_node && 
next_node->type == CSINN_SUBGRAPH_RETURN) { next_node = graph->layer[next_node->subgraph_idx]; } return next_node; } -int csi_subgraph_get_device(struct csi_node *node) +int shl_subgraph_get_device(struct shl_node *node) { int device = -1; - struct csi_params_base *params; + struct csinn_params_base *params; if (node->type == CSINN_SUBGRAPH) { - struct csi_ref_graph *sgraph = node->data; + struct shl_ref_graph *sgraph = node->data; params = sgraph->layer[0]->data; device = params->api; - } else if (node->type >= 0 && node->type < CSINN_SESSION_INIT) { + } else if (node->type >= 0 && node->type < CSINN_OP_SIZE) { params = node->data; device = params->api; } else { - CSI_DEBUG_CALL(printf("unknown node type.\n")); + shl_debug_error("unknown node type.\n"); } return device; } -void csi_subgraph_fvisit_print(struct csi_ref_graph *graph, struct csi_node *node) +void shl_subgraph_fvisit_print(struct shl_ref_graph *graph, struct shl_node *node) { printf("%s\n", node->name); } -void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node) +int shl_is_restricted_by_node(int subgraph_idx, struct shl_node *node, struct shl_ref_graph *graph) +{ + int find_flag = 0; + + int queue_size = 32; + struct shl_node **node_queue = shl_mem_alloc(sizeof(struct shl_node *) * queue_size); + int queue_left = 0; + int queue_right = 0; + /* add current node into queue */ + node_queue[queue_right++] = node; + while (queue_right > queue_left) { + struct shl_node *curr_node = node_queue[queue_left]; + queue_left++; + /* determine whether subgraph_idx is restricted by node */ + for (int i = 0; i < curr_node->restricted_map_num; i++) { + if (subgraph_idx == curr_node->restricted_map[i]) { + find_flag = 1; + /* break loop */ + queue_left = queue_right; + break; + } + } + /* add input nodes of curr_node into queue. */ + /* FIXME(@chenf) it's possible to add node into queue repeatly. 
*/ + int input_num = 0; + if (curr_node->type == CSINN_SUBGRAPH) { + input_num = ((struct shl_ref_graph *)curr_node->data)->input_num; + } else { + input_num = curr_node->in_num; + } + for (int i = 0; i < input_num; i++) { + struct shl_node *next_node = NULL; + if (curr_node->type == CSINN_SUBGRAPH) { + if (((struct shl_ref_graph *)curr_node->data)->input[i]->in) { + next_node = ((struct shl_ref_graph *)curr_node->data)->input[i]->in[0]; + } + } else { + if (curr_node->in[i]->in) { + next_node = curr_node->in[i]->in[0]; + } + } + if (next_node) { + next_node = graph->layer[next_node->subgraph_idx]; + } + + if (next_node) { + if (queue_right >= queue_size) { + queue_size += 32; + node_queue = + shl_mem_realloc(node_queue, sizeof(struct shl_node *) * queue_size); + } + node_queue[queue_right++] = next_node; + } + } + } + shl_mem_free(node_queue); + return find_flag; +} + +void shl_subgraph_fvisit_fuse(struct shl_ref_graph *graph, struct shl_node *node) { /* CPU nodes needn't be added into subgraph. 
*/ - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; if (params->api == params->sess->base_api) { node->subgraph_idx = graph->layer_index; - csi_gref_graph_insert(node, graph); + shl_gref_graph_insert(node, graph); - for (int m = 0; m < csi_node_get_non_const_in_number(node); m++) { - struct csi_node *m_node = csi_gref_get_input_subgraph(graph, node, m); + for (int m = 0; m < shl_node_get_non_const_in_number(node); m++) { + struct shl_node *m_node = shl_gref_get_input_subgraph(graph, node, m); if (m_node) { - csi_node_restrict_map_insert(m_node->subgraph_idx, + shl_node_restrict_map_insert(m_node->subgraph_idx, graph->layer[node->subgraph_idx]); } } return; } - if (csi_gref_is_root_node(graph, node)) { + if (shl_gref_is_root_node(graph, node)) { /* create subgraph node */ - struct csi_ref_graph *sgraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); - struct csi_node *sg_in = csi_node_alloc(CSINN_SUBGRAPH, "graph_in", 0, 0, sgraph); + struct shl_ref_graph *sgraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); + struct shl_node *sg_in = shl_node_alloc(CSINN_SUBGRAPH, "graph_in", 0, 0, sgraph); node->subgraph_idx = graph->layer_index; sg_in->subgraph_idx = graph->layer_index; - csi_gref_graph_insert(node, sgraph); - csi_gref_graph_insert(sg_in, graph); + shl_gref_graph_insert(node, sgraph); + shl_gref_graph_insert(sg_in, graph); + + shl_gref_update_input_output(graph, sg_in->subgraph_idx); return; } int i; int can_fuse = 0; - for (i = 0; i < csi_node_get_non_const_in_number(node); i++) { - struct csi_node *i_node = csi_gref_get_input_subgraph(graph, node, i); + for (i = 0; i < shl_node_get_non_const_in_number(node); i++) { + struct shl_node *i_node = shl_gref_get_input_subgraph(graph, node, i); if (!i_node) continue; - int i_device = csi_subgraph_get_device(i_node); - int curr_device = csi_subgraph_get_device(node); + int i_device = shl_subgraph_get_device(i_node); + int curr_device = shl_subgraph_get_device(node); if 
(i_device == curr_device) { int is_restrict = 0; /* determine whether the i-th input subgraph is restricted by other input subgraph. */ - for (int j = 0; j < csi_node_get_non_const_in_number(node); j++) { + for (int j = 0; j < shl_node_get_non_const_in_number(node); j++) { if (i == j) continue; - struct csi_node *j_node = csi_gref_get_input_subgraph(graph, node, j); + struct shl_node *j_node = shl_gref_get_input_subgraph(graph, node, j); if (!j_node) continue; int find_flag = 0; - struct csi_node *j_subgraph = graph->layer[j_node->subgraph_idx]; + struct shl_node *j_subgraph = graph->layer[j_node->subgraph_idx]; // if (j_subgraph->restricted_map_num == 0) break; - for (int k = 0; k < j_subgraph->restricted_map_num; k++) { - if (i_node->subgraph_idx == j_subgraph->restricted_map[k]) { - find_flag = 1; - break; - } - } + // for (int k = 0; k < j_subgraph->restricted_map_num; k++) { + // if (i_node->subgraph_idx == j_subgraph->restricted_map[k]) { + // find_flag = 1; + // break; + // } + // } + + find_flag = shl_is_restricted_by_node(i_node->subgraph_idx, j_subgraph, graph); + if (find_flag) { is_restrict = 1; break; @@ -496,8 +605,10 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node if (!is_restrict) { /* add current node into its i-th input subgraph. */ node->subgraph_idx = i_node->subgraph_idx; - struct csi_ref_graph *sgraph = graph->layer[i_node->subgraph_idx]->data; - csi_gref_graph_insert(node, sgraph); + struct shl_ref_graph *sgraph = graph->layer[i_node->subgraph_idx]->data; + shl_gref_graph_insert(node, sgraph); + + shl_gref_update_input_output(graph, i_node->subgraph_idx); can_fuse = 1; break; } @@ -506,19 +617,19 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node if (can_fuse) { /* Try to fuse input subgraph into current subgraph. 
*/ - for (int m = 0; m < csi_node_get_non_const_in_number(node); m++) { + for (int m = 0; m < shl_node_get_non_const_in_number(node); m++) { if (m == i) continue; - struct csi_node *m_node = csi_gref_get_input_subgraph(graph, node, m); + struct shl_node *m_node = shl_gref_get_input_subgraph(graph, node, m); if (!m_node) continue; if (m_node->subgraph_idx == node->subgraph_idx) continue; - int curr_device = csi_subgraph_get_device(node); - int m_device = csi_subgraph_get_device(m_node); + int curr_device = shl_subgraph_get_device(node); + int m_device = shl_subgraph_get_device(m_node); if (curr_device == m_device) { /* fusing subgraphs. */ - struct csi_node *m_subgraph = graph->layer[m_node->subgraph_idx]; - struct csi_ref_graph *sgraph = m_subgraph->data; - csi_gref_update_input_output(graph, m_node->subgraph_idx); + struct shl_node *m_subgraph = graph->layer[m_node->subgraph_idx]; + struct shl_ref_graph *sgraph = m_subgraph->data; + shl_gref_update_input_output(graph, m_node->subgraph_idx); int is_restrict = 0; for (int n = 0; n < sgraph->input_num; n++) { @@ -528,22 +639,24 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node } int in_m_subgraph_index = sgraph->input[n]->in[0]->subgraph_idx; int find_flag = 0; - for (int nr = 0; nr < graph->layer[in_m_subgraph_index]->restricted_map_num; - nr++) { - if (node->subgraph_idx == - graph->layer[in_m_subgraph_index]->restricted_map[nr]) { - find_flag = 1; - break; - } - } + // for (int nr = 0; nr < graph->layer[in_m_subgraph_index]->restricted_map_num; + // nr++) { + // if (node->subgraph_idx == + // graph->layer[in_m_subgraph_index]->restricted_map[nr]) { + // find_flag = 1; + // break; + // } + // } + find_flag = shl_is_restricted_by_node(node->subgraph_idx, + graph->layer[in_m_subgraph_index], graph); if (find_flag) { is_restrict = 1; break; } } - struct csi_ref_graph *curr_sgraph = graph->layer[node->subgraph_idx]->data; - csi_gref_update_input_output(graph, node->subgraph_idx); + 
struct shl_ref_graph *curr_sgraph = graph->layer[node->subgraph_idx]->data; + shl_gref_update_input_output(graph, node->subgraph_idx); int is_restrict2 = 0; for (int n = 0; n < curr_sgraph->input_num; n++) { @@ -553,14 +666,16 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node } int in_m_subgraph_index = curr_sgraph->input[n]->in[0]->subgraph_idx; int find_flag = 0; - for (int nr = 0; nr < graph->layer[in_m_subgraph_index]->restricted_map_num; - nr++) { - if (m_node->subgraph_idx == - graph->layer[in_m_subgraph_index]->restricted_map[nr]) { - find_flag = 1; - break; - } - } + // for (int nr = 0; nr < graph->layer[in_m_subgraph_index]->restricted_map_num; + // nr++) { + // if (m_node->subgraph_idx == + // graph->layer[in_m_subgraph_index]->restricted_map[nr]) { + // find_flag = 1; + // break; + // } + // } + find_flag = shl_is_restricted_by_node(m_node->subgraph_idx, + graph->layer[in_m_subgraph_index], graph); if (find_flag) { is_restrict2 = 1; break; @@ -570,21 +685,23 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node if (!is_restrict && !is_restrict2) { /* can fuse subgraph into current subgraph. 
*/ for (int n = 0; n < sgraph->layer_index; n++) { - struct csi_node *subgraph_node = sgraph->layer[n]; + struct shl_node *subgraph_node = sgraph->layer[n]; subgraph_node->subgraph_idx = node->subgraph_idx; - csi_gref_graph_insert(subgraph_node, curr_sgraph); + shl_gref_graph_insert(subgraph_node, curr_sgraph); + + shl_gref_update_input_output(graph, node->subgraph_idx); } for (int n = 0; n < m_subgraph->restricted_map_num; n++) { - csi_node_restrict_map_insert(m_subgraph->restricted_map[n], + shl_node_restrict_map_insert(m_subgraph->restricted_map[n], graph->layer[node->subgraph_idx]); } sgraph->layer_index = 0; sgraph->layer_size = 0; } else { - csi_node_restrict_map_insert(node->subgraph_idx, m_subgraph); + shl_node_restrict_map_insert(node->subgraph_idx, m_subgraph); } } else { - csi_node_restrict_map_insert(m_node->subgraph_idx, + shl_node_restrict_map_insert(m_node->subgraph_idx, graph->layer[node->subgraph_idx]); } } @@ -592,17 +709,19 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node /* current node is restricted from being fused into input subgraph by other subgraph. * so create new subgraph and update its restricted_map. 
*/ - struct csi_ref_graph *sgraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); - struct csi_node *sg_in = csi_node_alloc(CSINN_SUBGRAPH, "graph_in", 1, 1, sgraph); + struct shl_ref_graph *sgraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); + struct shl_node *sg_in = shl_node_alloc(CSINN_SUBGRAPH, "graph_in", 1, 1, sgraph); node->subgraph_idx = graph->layer_index; sg_in->subgraph_idx = graph->layer_index; - csi_gref_graph_insert(node, sgraph); - csi_gref_graph_insert(sg_in, graph); + shl_gref_graph_insert(node, sgraph); + shl_gref_graph_insert(sg_in, graph); + + shl_gref_update_input_output(graph, sg_in->subgraph_idx); - for (int m = 0; m < csi_node_get_non_const_in_number(node); m++) { - struct csi_node *m_node = csi_gref_get_input_subgraph(graph, node, m); + for (int m = 0; m < shl_node_get_non_const_in_number(node); m++) { + struct shl_node *m_node = shl_gref_get_input_subgraph(graph, node, m); if (m_node) { - csi_node_restrict_map_insert(m_node->subgraph_idx, + shl_node_restrict_map_insert(m_node->subgraph_idx, graph->layer[node->subgraph_idx]); } } @@ -610,30 +729,30 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node return; } -struct csi_ref_graph *csi_subgraph_generate(struct csi_ref_graph *ograph) +struct shl_ref_graph *shl_subgraph_generate(struct shl_ref_graph *ograph) { - struct csi_ref_graph *ggraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *ggraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); ggraph->input = ograph->input; ggraph->output = ograph->output; ggraph->input_num = ograph->input_num; ggraph->output_num = ograph->output_num; - csi_gref_post_dfs(ggraph, csi_subgraph_fvisit_fuse); + shl_gref_post_dfs(ggraph, shl_subgraph_fvisit_fuse); return ggraph; } -void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, - struct csi_ref_graph *old_graph) +void shl_subgraph_topology_sort_internal(struct shl_ref_graph *new_graph, + struct shl_ref_graph *old_graph) { int 
stack_size = 32; - struct csi_node **node_stack = csi_mem_alloc(sizeof(struct csi_node *) * stack_size); - int *input_idx_stack = csi_mem_alloc(sizeof(int) * stack_size); + struct shl_node **node_stack = shl_mem_alloc(sizeof(struct shl_node *) * stack_size); + int *input_idx_stack = shl_mem_alloc(sizeof(int) * stack_size); int stack_top = -1; - struct csi_node *curr_node; + struct shl_node *curr_node; for (int i = 0; i < new_graph->output_num; i++) { - struct csi_tensor *ot = new_graph->output[i]->data; + struct csinn_tensor *ot = new_graph->output[i]->data; if (ot->is_const) continue; curr_node = new_graph->output[i]->in[0]; if (curr_node->subgraph_idx != -1 && @@ -645,8 +764,8 @@ void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, ++stack_top; if (stack_top >= stack_size) { stack_size += 32; - node_stack = csi_mem_realloc(node_stack, sizeof(struct csi_node *) * stack_size); - input_idx_stack = csi_mem_realloc(input_idx_stack, sizeof(int) * stack_size); + node_stack = shl_mem_realloc(node_stack, sizeof(struct shl_node *) * stack_size); + input_idx_stack = shl_mem_realloc(input_idx_stack, sizeof(int) * stack_size); } node_stack[stack_top] = curr_node; input_idx_stack[stack_top] = 0; @@ -654,13 +773,13 @@ void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, } while (stack_top != -1) { curr_node = node_stack[stack_top]; - if (input_idx_stack[stack_top] == csi_node_get_non_const_in_number(curr_node) || - csi_gref_is_root_node(new_graph, curr_node)) { - csi_gref_graph_insert(curr_node, new_graph); + if (input_idx_stack[stack_top] == shl_node_get_non_const_in_number(curr_node) || + shl_gref_is_root_node(new_graph, curr_node)) { + shl_gref_graph_insert(curr_node, new_graph); --stack_top; } else { - struct csi_node *next_node = curr_node->in[input_idx_stack[stack_top]]->in[0]; + struct shl_node *next_node = curr_node->in[input_idx_stack[stack_top]]->in[0]; if (next_node && next_node->subgraph_idx != -1 && 
old_graph->layer[next_node->subgraph_idx]->type == CSINN_SUBGRAPH) { next_node = old_graph->layer[next_node->subgraph_idx]; @@ -671,9 +790,9 @@ void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, if (stack_top >= stack_size) { stack_size += 32; node_stack = - csi_mem_realloc(node_stack, sizeof(struct csi_node *) * stack_size); + shl_mem_realloc(node_stack, sizeof(struct shl_node *) * stack_size); input_idx_stack = - csi_mem_realloc(input_idx_stack, sizeof(int) * stack_size); + shl_mem_realloc(input_idx_stack, sizeof(int) * stack_size); } node_stack[stack_top] = next_node; input_idx_stack[stack_top] = 0; @@ -683,79 +802,79 @@ void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, } } - csi_mem_free(node_stack); - csi_mem_free(input_idx_stack); + shl_mem_free(node_stack); + shl_mem_free(input_idx_stack); } -struct csi_ref_graph *csi_subgraph_topology_sort(struct csi_ref_graph *graph) +struct shl_ref_graph *shl_subgraph_topology_sort(struct shl_ref_graph *graph) { - struct csi_ref_graph *sorted_graph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *sorted_graph = shl_mem_alloc(sizeof(struct shl_ref_graph)); sorted_graph->input = graph->input; sorted_graph->output = graph->output; sorted_graph->input_num = graph->input_num; sorted_graph->output_num = graph->output_num; - csi_subgraph_topology_sort_internal(sorted_graph, graph); - csi_gref_reset_graph_visit(sorted_graph); + shl_subgraph_topology_sort_internal(sorted_graph, graph); + shl_gref_reset_graph_visit(sorted_graph); return sorted_graph; } -struct csi_ref_graph *csi_subgraph_rebuild(struct csi_ref_graph *subgraph) +struct shl_ref_graph *shl_subgraph_rebuild(struct shl_ref_graph *subgraph) { - struct csi_ref_graph *splited_graph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *splited_graph = shl_mem_alloc(sizeof(struct shl_ref_graph)); splited_graph->input = subgraph->input; splited_graph->output = subgraph->output; 
splited_graph->input_num = subgraph->input_num; splited_graph->output_num = subgraph->output_num; for (int i = 0; i < subgraph->layer_index; i++) { - struct csi_node *node = subgraph->layer[i]; + struct shl_node *node = subgraph->layer[i]; if (node->type == CSINN_SUBGRAPH) { - struct csi_ref_graph *sgraph = node->data; + struct shl_ref_graph *sgraph = node->data; if (sgraph->layer_size == 0) continue; /* split graph */ /* for input formal parameters */ - node->in = csi_mem_realloc(node->in, sgraph->input_num * sizeof(struct csi_node *)); + node->in = shl_mem_realloc(node->in, sgraph->input_num * sizeof(struct shl_node *)); node->in_num = sgraph->input_num; for (int in_idx = 0; in_idx < sgraph->input_num; in_idx++) { - struct csi_node *in_tensor_node = sgraph->input[in_idx]; + struct shl_node *in_tensor_node = sgraph->input[in_idx]; node->in[in_idx] = in_tensor_node; - struct csi_tensor *sg_in_tensor = csi_alloc_tensor(NULL); - csi_tensor_copy(sg_in_tensor, in_tensor_node->data); - struct csi_node *sg_in_node = csi_node_var_alloc("graph_in_tensor", sg_in_tensor); + struct csinn_tensor *sg_in_tensor = csinn_alloc_tensor(NULL); + csinn_tensor_copy(sg_in_tensor, in_tensor_node->data); + struct shl_node *sg_in_node = shl_node_var_alloc("graph_in_tensor", sg_in_tensor); sgraph->input[in_idx] = sg_in_node; for (int l_idx = 0; l_idx < sgraph->layer_index; l_idx++) { - struct csi_node *curr_node = sgraph->layer[l_idx]; - int index = csi_node_find(curr_node->in, curr_node->in_num, in_tensor_node); + struct shl_node *curr_node = sgraph->layer[l_idx]; + int index = shl_node_find(curr_node->in, curr_node->in_num, in_tensor_node); if (index > -1) { curr_node->in[index] = sg_in_node; } } } /* for output formal parameters */ - struct csi_node *sg_out = csi_node_alloc(CSINN_SUBGRAPH_RETURN, "graph_out", + struct shl_node *sg_out = shl_node_alloc(CSINN_SUBGRAPH_RETURN, "graph_out", sgraph->output_num, sgraph->output_num, NULL); for (int out_idx = 0; out_idx < sgraph->output_num; 
out_idx++) { - struct csi_node *out_tensor_node = sgraph->output[out_idx]; + struct shl_node *out_tensor_node = sgraph->output[out_idx]; sg_out->in[out_idx] = out_tensor_node; for (int l_idx = 0; l_idx < sgraph->layer_index; l_idx++) { - struct csi_node *curr_node = sgraph->layer[l_idx]; - int index = csi_node_find(curr_node->out, curr_node->out_num, out_tensor_node); + struct shl_node *curr_node = sgraph->layer[l_idx]; + int index = shl_node_find(curr_node->out, curr_node->out_num, out_tensor_node); if (index > -1) { - struct csi_tensor *sg_out_tensor = csi_alloc_tensor(NULL); - csi_tensor_copy(sg_out_tensor, curr_node->out[index]->data); - struct csi_node *sg_out_node = - csi_node_var_alloc("graph_out_tensor", sg_out_tensor); + struct csinn_tensor *sg_out_tensor = csinn_alloc_tensor(NULL); + csinn_tensor_copy(sg_out_tensor, curr_node->out[index]->data); + struct shl_node *sg_out_node = + shl_node_var_alloc("graph_out_tensor", sg_out_tensor); sg_out->out[out_idx] = sg_out_node; } } } - csi_gref_graph_insert(sg_out, sgraph); + shl_gref_graph_insert(sg_out, sgraph); /* update subgraph_idx */ int curr_subgraph_idx = splited_graph->layer_index; @@ -763,11 +882,11 @@ struct csi_ref_graph *csi_subgraph_rebuild(struct csi_ref_graph *subgraph) sgraph->layer[idx]->subgraph_idx = curr_subgraph_idx; } node->subgraph_idx = curr_subgraph_idx; - csi_gref_graph_insert(node, splited_graph); + shl_gref_graph_insert(node, splited_graph); } else { /* update subgraph_idx */ node->subgraph_idx = splited_graph->layer_index; - csi_gref_graph_insert(node, splited_graph); + shl_gref_graph_insert(node, splited_graph); } } return splited_graph; diff --git a/source/graph_ref/sum.c b/source/graph_ref/sum.c index fcbaaf08..ffe16214 100644 --- a/source/graph_ref/sum.c +++ b/source/graph_ref/sum.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sum(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SUM, params); + shl_gref_siso_op(input, output, CSINN_OP_SUM, params); return CSINN_TRUE; } diff --git a/source/graph_ref/tan.c b/source/graph_ref/tan.c index b5693260..b8016949 100644 --- a/source/graph_ref/tan.c +++ b/source/graph_ref/tan.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_tan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_tan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TAN, params); + shl_gref_siso_op(input, output, CSINN_OP_TAN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/tanh.c b/source/graph_ref/tanh.c index b3fc2406..dd707f80 100644 --- a/source/graph_ref/tanh.c +++ b/source/graph_ref/tanh.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_tanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TANH, params); + shl_gref_siso_op(input, output, CSINN_OP_TANH, params); return CSINN_TRUE; } diff --git a/source/graph_ref/threshold_relu.c b/source/graph_ref/threshold_relu.c index 381ca44b..5f325e53 100644 --- a/source/graph_ref/threshold_relu.c +++ b/source/graph_ref/threshold_relu.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_threshold_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_threshold_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_THRESHOLD_RELU, params); + shl_gref_siso_op(input, output, CSINN_OP_THRESHOLD_RELU, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/tile.c b/source/graph_ref/tile.c index 0d276b47..6d9d374c 100644 --- a/source/graph_ref/tile.c +++ b/source/graph_ref/tile.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_tile(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params) +int shl_gref_tile(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TILE, params); + shl_gref_siso_op(input, output, CSINN_OP_TILE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/topk.c b/source/graph_ref/topk.c index 3cae010c..6933d397 100644 --- a/source/graph_ref/topk.c +++ b/source/graph_ref/topk.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_topk(struct csi_tensor *input, - struct csi_tensor *output1, - struct csi_tensor *output2, - struct topk_params *params) +int shl_gref_topk(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params) { - csi_debug_error("csi_gref_topk unsupport\n"); + shl_debug_error("shl_gref_topk unsupport\n"); return CSINN_FALSE; } diff --git a/source/graph_ref/transpose.c b/source/graph_ref/transpose.c index daf0f6ee..c96a2add 100644 --- a/source/graph_ref/transpose.c +++ b/source/graph_ref/transpose.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_transpose(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params) +int shl_gref_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TRANSPOSE, params); + shl_gref_siso_op(input, output, CSINN_OP_TRANSPOSE, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/trunc.c b/source/graph_ref/trunc.c index 474c6800..ed4d5fb2 100644 --- a/source/graph_ref/trunc.c +++ b/source/graph_ref/trunc.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_trunc(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_trunc(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TRUNC, params); + shl_gref_siso_op(input, output, CSINN_OP_TRUNC, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/unpooling.c b/source/graph_ref/unpooling.c index 9fcd33dc..d58b65f6 100644 --- a/source/graph_ref/unpooling.c +++ b/source/graph_ref/unpooling.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_unpooling(struct csi_tensor *input, - struct csi_tensor *mask, - struct csi_tensor *output, - struct unpooling_params *params) +int shl_gref_unpooling(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { - csi_debug_error("csi_gref_unpooling unsupport\n"); + shl_debug_error("shl_gref_unpooling unsupport\n"); return CSINN_FALSE; } - diff --git a/source/graph_ref/unstack.c b/source/graph_ref/unstack.c index a7569080..fb414295 100644 --- a/source/graph_ref/unstack.c +++ b/source/graph_ref/unstack.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_unstack(struct csi_tensor *input, - struct csi_tensor **output, - struct unstack_params *params) +int shl_gref_unstack(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params) { - csi_debug_error("csi_gref_unstack unsupport\n"); + shl_debug_error("shl_gref_unstack unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/utils.c b/source/graph_ref/utils.c index f0452aef..b63f3e79 100644 --- a/source/graph_ref/utils.c +++ b/source/graph_ref/utils.c @@ -16,85 +16,74 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_graph_insert(struct csi_node *node, struct csi_ref_graph *graph) +int shl_gref_graph_insert(struct shl_node *node, struct shl_ref_graph *graph) { if (graph->layer_size == 0 || graph->layer_index == graph->layer_size - 1) { graph->layer_size += 128; - graph->layer = csi_mem_realloc(graph->layer, graph->layer_size * sizeof(struct csi_node *)); + graph->layer = shl_mem_realloc(graph->layer, graph->layer_size * sizeof(struct shl_node *)); } graph->layer[graph->layer_index] = node; graph->layer_index++; return CSINN_TRUE; } -int csi_gref_siso_op(struct csi_tensor *input, - struct csi_tensor *output, - int op, - void *params) +int shl_gref_siso_op(struct csinn_tensor *input, struct csinn_tensor *output, int op, void *params) { - struct csi_params_base *ptr = params; - struct csi_node *layer = csi_node_alloc(op, ptr->name, 1, 1, params); - struct csi_node *in0 = (struct csi_node *)input->data; - struct csi_node *out = csi_node_var_alloc(output->name, output); - csi_node_add_in(layer, in0, 0); - csi_node_add_out(layer, out, 0); + struct csinn_params_base *ptr = params; + struct shl_node *layer = shl_node_alloc(op, ptr->name, 1, 1, params); + struct shl_node *in0 = (struct shl_node *)input->data; + struct shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_in(layer, in0, 0); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(input->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(input->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } -int csi_gref_diso_op(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - int op, - void *params) +int shl_gref_diso_op(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, int op, void *params) { - struct 
csi_params_base *ptr = params; - struct csi_node *layer = csi_node_alloc(op, ptr->name, 2, 1, params); - struct csi_node *in0 = (struct csi_node *)input0->data; - struct csi_node *in1; + struct csinn_params_base *ptr = params; + struct shl_node *layer = shl_node_alloc(op, ptr->name, 2, 1, params); + struct shl_node *in0 = (struct shl_node *)input0->data; + struct shl_node *in1; if (input1->is_const) { - in1 = csi_node_const_var_alloc(input1->name, input1); + in1 = shl_node_const_var_alloc(input1->name, input1); } else { - in1 = (struct csi_node *)input1->data; + in1 = (struct shl_node *)input1->data; } - struct csi_node *out = csi_node_var_alloc(output->name, output); - csi_node_add_in(layer, in0, 0); - csi_node_add_in(layer, in1, 1); - csi_node_add_out(layer, out, 0); + struct shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_in(layer, in0, 0); + shl_node_add_in(layer, in1, 1); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(input0->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(input0->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } - /* single input double const single output */ -int csi_gref_sidcso_op(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *const0, - struct csi_tensor *const1, - int op, +int shl_gref_sidcso_op(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *const0, struct csinn_tensor *const1, int op, void *params) { - struct csi_params_base *ptr = params; - struct csi_node *layer = csi_node_alloc(op, ptr->name, 3, 1, params); - struct csi_node *in0 = (struct csi_node *)input->data; - struct csi_node *in1 = csi_node_const_var_alloc(const0->name, const0); - struct csi_node *in2 = csi_node_const_var_alloc(const1->name, const1); - struct csi_node *out = csi_node_var_alloc(output->name, output); - csi_node_add_in(layer, in0, 0); - 
csi_node_add_in(layer, in1, 1); - csi_node_add_in(layer, in2, 2); - csi_node_add_out(layer, out, 0); + struct csinn_params_base *ptr = params; + struct shl_node *layer = shl_node_alloc(op, ptr->name, 3, 1, params); + struct shl_node *in0 = (struct shl_node *)input->data; + struct shl_node *in1 = shl_node_const_var_alloc(const0->name, const0); + struct shl_node *in2 = shl_node_const_var_alloc(const1->name, const1); + struct shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_in(layer, in0, 0); + shl_node_add_in(layer, in1, 1); + shl_node_add_in(layer, in2, 2); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(input->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(input->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } - diff --git a/source/graph_ref/where.c b/source/graph_ref/where.c index fcf824b9..45cc8451 100644 --- a/source/graph_ref/where.c +++ b/source/graph_ref/where.c @@ -16,16 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_where(struct csi_tensor *condition, - struct csi_tensor *x, - struct csi_tensor *y, - struct csi_tensor *output, - struct where_params *params) +int shl_gref_where(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params) { - csi_debug_error("csi_gref_where unsupport\n"); + shl_debug_error("shl_gref_where unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/xor.c b/source/graph_ref/xor.c index 9f6157a1..7e42c438 100644 --- a/source/graph_ref/xor.c +++ b/source/graph_ref/xor.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_xor(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_XOR, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_XOR, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/yuv_rgb_scale.c b/source/graph_ref/yuv_rgb_scale.c index 6ad42e13..af350e15 100644 --- a/source/graph_ref/yuv_rgb_scale.c +++ b/source/graph_ref/yuv_rgb_scale.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_yuv_rgb_scale(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_yuv_rgb_scale(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_YUV_RGB_SCALE, params); + shl_gref_siso_op(input, output, CSINN_OP_YUV_RGB_SCALE, params); return CSINN_TRUE; } diff --git a/source/i805_opt/activation/csi_i805_clip_8.S b/source/i805_opt/activation/shl_i805_clip_8.S similarity index 91% rename from source/i805_opt/activation/csi_i805_clip_8.S rename to source/i805_opt/activation/shl_i805_clip_8.S index 722b1b6e..56176851 100644 --- a/source/i805_opt/activation/csi_i805_clip_8.S +++ b/source/i805_opt/activation/shl_i805_clip_8.S @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_clip_8.S + * @file shl_i805_clip_8.S * @brief uint8 clip layer function. * @version V1.0 * @date 2. 
Aug 2021 ******************************************************************************/ /* - void csi_i805_clip_opt_u8(uint8_t *input_data, + void shl_i805_clip_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, int32_t clip_qmin, @@ -56,13 +56,13 @@ vr7: output left shift */ - .file "csi_i805_clip_8.S" - .section .text.csi_i805_clip_opt_u8,"ax",@progbits + .file "shl_i805_clip_8.S" + .section .text.shl_i805_clip_opt_u8,"ax",@progbits .align 2 - .global csi_i805_clip_opt_u8 - .type csi_i805_clip_opt_u8, @function + .global shl_i805_clip_opt_u8 + .type shl_i805_clip_opt_u8, @function -csi_i805_clip_opt_u8: +shl_i805_clip_opt_u8: ld.w t7, (sp, 0x00) // clip_qmax ld.w t0, (sp, 0x04) // input_zp @@ -127,4 +127,4 @@ csi_i805_clip_opt_u8: .END: rts - .size csi_i805_clip_opt_u8, .-csi_i805_clip_opt_u8 + .size shl_i805_clip_opt_u8, .-shl_i805_clip_opt_u8 diff --git a/source/i805_opt/activation/csi_i805_relu6_8.S b/source/i805_opt/activation/shl_i805_relu6_8.S similarity index 89% rename from source/i805_opt/activation/csi_i805_relu6_8.S rename to source/i805_opt/activation/shl_i805_relu6_8.S index 5960f022..d17b8807 100644 --- a/source/i805_opt/activation/csi_i805_relu6_8.S +++ b/source/i805_opt/activation/shl_i805_relu6_8.S @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_rel8_8.S + * @file shl_i805_rel8_8.S * @brief uint8 asym relu6 layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_relu6_opt_u8(uint8_t *data, + void shl_i805_relu6_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, int32_t out_multiplier, @@ -51,13 +51,13 @@ t0: temp loop count */ - .file "csi_i805_relu6_8.S" - .section .text.csi_i805_relu6_opt_u8,"ax",@progbits + .file "shl_i805_relu6_8.S" + .section .text.shl_i805_relu6_opt_u8,"ax",@progbits .align 2 - .global csi_i805_relu6_opt_u8 - .type csi_i805_relu6_opt_u8, @function + .global shl_i805_relu6_opt_u8 + .type shl_i805_relu6_opt_u8, @function -csi_i805_relu6_opt_u8: +shl_i805_relu6_opt_u8: push l0 ld.w l0, (sp, 0x04) // out_shift @@ -108,4 +108,4 @@ csi_i805_relu6_opt_u8: .END: pop l0 rts - .size csi_i805_relu6_opt_u8, .-csi_i805_relu6_opt_u8 + .size shl_i805_relu6_opt_u8, .-shl_i805_relu6_opt_u8 diff --git a/source/i805_opt/activation/csi_i805_relu_8.S b/source/i805_opt/activation/shl_i805_relu_8.S similarity index 89% rename from source/i805_opt/activation/csi_i805_relu_8.S rename to source/i805_opt/activation/shl_i805_relu_8.S index 0e4cc276..875ad049 100644 --- a/source/i805_opt/activation/csi_i805_relu_8.S +++ b/source/i805_opt/activation/shl_i805_relu_8.S @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_relu_8.S + * @file shl_i805_relu_8.S * @brief uint8 relu layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_relu_opt_u8(uint8_t *data, + void shl_i805_relu_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, int32_t out_multiplier, @@ -55,13 +55,13 @@ */ - .file "csi_i805_relu_8.S" - .section .text.csi_i805_relu_opt_u8,"ax",@progbits + .file "shl_i805_relu_8.S" + .section .text.shl_i805_relu_opt_u8,"ax",@progbits .align 2 - .global csi_i805_relu_opt_u8 - .type csi_i805_relu_opt_u8, @function + .global shl_i805_relu_opt_u8 + .type shl_i805_relu_opt_u8, @function -csi_i805_relu_opt_u8: +shl_i805_relu_opt_u8: push l0 ld.w l0, (sp, 0x04) // out_shift @@ -112,4 +112,4 @@ csi_i805_relu_opt_u8: .END: pop l0 rts - .size csi_i805_relu_opt_u8, .-csi_i805_relu_opt_u8 + .size shl_i805_relu_opt_u8, .-shl_i805_relu_opt_u8 diff --git a/source/i805_opt/activation/csi_xt800v_nn_activations_q15.S b/source/i805_opt/activation/shl_xt800v_nn_activations_q15.S similarity index 92% rename from source/i805_opt/activation/csi_xt800v_nn_activations_q15.S rename to source/i805_opt/activation/shl_xt800v_nn_activations_q15.S index 3399f31c..985d7938 100644 --- a/source/i805_opt/activation/csi_xt800v_nn_activations_q15.S +++ b/source/i805_opt/activation/shl_xt800v_nn_activations_q15.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file csi_xt800v_nn_activations_q15.S + * @file shl_xt800v_nn_activations_q15.S * @brief Q15 neural network activation function using direct table look-up. * @version V1.0 * @date 01. 
June 2018 @@ -26,19 +26,19 @@ .import tanhTable_q15 /* - *void csi_xt800v_nn_activations_direct_q15(q15_t * data, + *void shl_xt800v_nn_activations_direct_q15(q15_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800v_nn_activation_type type) + * shl_xt800v_nn_activation_type type) */ - .file "csi_xt800v_nn_activations_q15.S" - .section .text.csi_xt800v_nn_activations_direct_q15,"ax",@progbits + .file "shl_xt800v_nn_activations_q15.S" + .section .text.shl_xt800v_nn_activations_direct_q15,"ax",@progbits .align 2 - .global csi_xt800v_nn_activations_direct_q15 - .type csi_xt800v_nn_activations_direct_q15, @function + .global shl_xt800v_nn_activations_direct_q15 + .type shl_xt800v_nn_activations_direct_q15, @function -csi_xt800v_nn_activations_direct_q15: +shl_xt800v_nn_activations_direct_q15: push l0, l1, l2, l3 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -205,4 +205,4 @@ csi_xt800v_nn_activations_direct_q15: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3 - .size csi_xt800v_nn_activations_direct_q15, .-csi_xt800v_nn_activations_direct_q15 + .size shl_xt800v_nn_activations_direct_q15, .-shl_xt800v_nn_activations_direct_q15 diff --git a/source/i805_opt/activation/csi_xt800v_nn_activations_q15_fast.S b/source/i805_opt/activation/shl_xt800v_nn_activations_q15_fast.S similarity index 84% rename from source/i805_opt/activation/csi_xt800v_nn_activations_q15_fast.S rename to source/i805_opt/activation/shl_xt800v_nn_activations_q15_fast.S index 9588c03b..30e6c171 100644 --- a/source/i805_opt/activation/csi_xt800v_nn_activations_q15_fast.S +++ b/source/i805_opt/activation/shl_xt800v_nn_activations_q15_fast.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file csi_xt800v_nn_activations_q15_fast.S + * @file shl_xt800v_nn_activations_q15_fast.S * @brief Q15 neural network activation function using direct table look-up. * @version V1.0 * @date 01. 
June 2018 @@ -26,19 +26,19 @@ .import tanhTable_q15 /* - *void csi_xt800v_nn_activations_direct_q15(q15_t * data, + *void shl_xt800v_nn_activations_direct_q15(q15_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800v_nn_activation_type type) + * shl_xt800v_nn_activation_type type) */ - .file "csi_xt800v_nn_activations_q15.S" - .section .text.csi_xt800v_nn_activations_direct_q15,"ax",@progbits + .file "shl_xt800v_nn_activations_q15.S" + .section .text.shl_xt800v_nn_activations_direct_q15,"ax",@progbits .align 2 - .global csi_xt800v_nn_activations_direct_q15 - .type csi_xt800v_nn_activations_direct_q15, @function + .global shl_xt800v_nn_activations_direct_q15 + .type shl_xt800v_nn_activations_direct_q15, @function -csi_xt800v_nn_activations_direct_q15: +shl_xt800v_nn_activations_direct_q15: push l0, l1, l2, l3 subi sp, sp, 32 vstm.8 vr8-vr9, (sp) @@ -124,8 +124,7 @@ csi_xt800v_nn_activations_direct_q15: .L3: vldmu.8 vr8-vr9, (sp) pop l0, l1, l2, l3 - .size csi_xt800v_nn_activations_direct_q15, .-csi_xt800v_nn_activations_direct_q15 -.weak csi_nn_activations_direct_q15 -.set csi_nn_activations_direct_q15, csi_xt800v_nn_activations_direct_q15 + .size shl_xt800v_nn_activations_direct_q15, .-shl_xt800v_nn_activations_direct_q15 + .weak csky_vdsp2_nn_activations_direct_q15 -.set csky_vdsp2_nn_activations_direct_q15, csi_xt800v_nn_activations_direct_q15 +.set csky_vdsp2_nn_activations_direct_q15, shl_xt800v_nn_activations_direct_q15 diff --git a/source/i805_opt/activation/csi_xt800v_nn_activations_q7.S b/source/i805_opt/activation/shl_xt800v_nn_activations_q7.S similarity index 90% rename from source/i805_opt/activation/csi_xt800v_nn_activations_q7.S rename to source/i805_opt/activation/shl_xt800v_nn_activations_q7.S index 2309a326..cfea2a87 100644 --- a/source/i805_opt/activation/csi_xt800v_nn_activations_q7.S +++ b/source/i805_opt/activation/shl_xt800v_nn_activations_q7.S @@ -17,7 +17,7 @@ */ 
/****************************************************************************** - * @file csi_xt800v_nn_activations_q7.S + * @file shl_xt800v_nn_activations_q7.S * @brief Q7 neural network activation function using direct table look-up. * @version V1.0 * @date 05. June 2018 @@ -26,19 +26,19 @@ .import sigmoidTable_q7 .import tanhTable_q7 /* - *void csi_xt800v_nn_activations_direct_q7(q7_t * data, + *void shl_xt800v_nn_activations_direct_q7(q7_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800v_nn_activation_type type) + * shl_xt800v_nn_activation_type type) */ - .file "csi_xt800v_nn_activations_q7.S" - .section .text.csi_xt800v_nn_activations_direct_q7,"ax",@progbits + .file "shl_xt800v_nn_activations_q7.S" + .section .text.shl_xt800v_nn_activations_direct_q7,"ax",@progbits .align 2 - .global csi_xt800v_nn_activations_direct_q7 - .type csi_xt800v_nn_activations_direct_q7, @function + .global shl_xt800v_nn_activations_direct_q7 + .type shl_xt800v_nn_activations_direct_q7, @function -csi_xt800v_nn_activations_direct_q7: +shl_xt800v_nn_activations_direct_q7: push l0, l1, l2, l3 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -162,4 +162,4 @@ csi_xt800v_nn_activations_direct_q7: vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3 - .size csi_xt800v_nn_activations_direct_q7, .-csi_xt800v_nn_activations_direct_q7 + .size shl_xt800v_nn_activations_direct_q7, .-shl_xt800v_nn_activations_direct_q7 diff --git a/source/i805_opt/activation/csi_xt800v_nn_activations_q7_fast.S b/source/i805_opt/activation/shl_xt800v_nn_activations_q7_fast.S similarity index 75% rename from source/i805_opt/activation/csi_xt800v_nn_activations_q7_fast.S rename to source/i805_opt/activation/shl_xt800v_nn_activations_q7_fast.S index 404f05ea..c65009d5 100644 --- a/source/i805_opt/activation/csi_xt800v_nn_activations_q7_fast.S +++ b/source/i805_opt/activation/shl_xt800v_nn_activations_q7_fast.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file 
csi_xt800v_nn_activations_q7_fast.S + * @file shl_xt800v_nn_activations_q7_fast.S * @brief Q7 neural network activation function using direct table look-up. * @version V1.0 * @date 05. June 2018 @@ -26,19 +26,19 @@ .import sigmoidTable_q7 .import tanhTable_q7 /* - *void csi_xt800v_nn_activations_direct_q7(q7_t * data, + *void shl_xt800v_nn_activations_direct_q7(q7_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800v_nn_activation_type type) + * shl_xt800v_nn_activation_type type) */ - .file "csi_xt800v_nn_activations_q7.S" - .section .text.csi_xt800v_nn_activations_direct_q7,"ax",@progbits + .file "shl_xt800v_nn_activations_q7.S" + .section .text.shl_xt800v_nn_activations_direct_q7,"ax",@progbits .align 2 - .global csi_xt800v_nn_activations_direct_q7 - .type csi_xt800v_nn_activations_direct_q7, @function + .global shl_xt800v_nn_activations_direct_q7 + .type shl_xt800v_nn_activations_direct_q7, @function -csi_xt800v_nn_activations_direct_q7: +shl_xt800v_nn_activations_direct_q7: push l0, l1, l2, l3 movi l0, 3 // shift_size = 3 - int_width subu l0, l0, a2 @@ -73,8 +73,7 @@ csi_xt800v_nn_activations_direct_q7: .L2: pop l0, l1, l2, l3 - .size csi_xt800v_nn_activations_direct_q7, .-csi_xt800v_nn_activations_direct_q7 -.weak csi_nn_activations_direct_q7 -.set csi_nn_activations_direct_q7, csi_xt800v_nn_activations_direct_q7 + .size shl_xt800v_nn_activations_direct_q7, .-shl_xt800v_nn_activations_direct_q7 + .weak csky_vdsp2_nn_activations_direct_q7 -.set csky_vdsp2_nn_activations_direct_q7, csi_xt800v_nn_activations_direct_q7 +.set csky_vdsp2_nn_activations_direct_q7, shl_xt800v_nn_activations_direct_q7 diff --git a/source/i805_opt/activation/csi_xt800v_relu_q15.S b/source/i805_opt/activation/shl_xt800v_relu_q15.S similarity index 78% rename from source/i805_opt/activation/csi_xt800v_relu_q15.S rename to source/i805_opt/activation/shl_xt800v_relu_q15.S index bd7869e1..594218c5 100644 --- a/source/i805_opt/activation/csi_xt800v_relu_q15.S +++ 
b/source/i805_opt/activation/shl_xt800v_relu_q15.S @@ -17,24 +17,24 @@ */ /****************************************************************************** - * @file csi_xt800v_relu_q15.S + * @file shl_xt800v_relu_q15.S * @brief Q15 version of ReLU. * @version V1.0 * @date 01. June 2018 ******************************************************************************/ /* - *void csi_xt800v_relu_q15(q15_t * data, + *void shl_xt800v_relu_q15(q15_t * data, * uint16_t size) */ - .file "csi_xt800v_relu_q15.S" - .section .text.csi_xt800v_relu_q15,"ax",@progbits + .file "shl_xt800v_relu_q15.S" + .section .text.shl_xt800v_relu_q15,"ax",@progbits .align 2 - .global csi_xt800v_relu_q15 - .type csi_xt800v_relu_q15, @function + .global shl_xt800v_relu_q15 + .type shl_xt800v_relu_q15, @function -csi_xt800v_relu_q15: +shl_xt800v_relu_q15: vmovi.8 vr7, 0 lsri t0, a1, 5 bez t0, .L1 @@ -72,8 +72,8 @@ csi_xt800v_relu_q15: .L4: rts - .size csi_xt800v_relu_q15, .-csi_xt800v_relu_q15 -.weak csi_relu_q15 -.set csi_relu_q15, csi_xt800v_relu_q15 + .size shl_xt800v_relu_q15, .-shl_xt800v_relu_q15 +.weak csinn_relu_q15 +.set csinn_relu_q15, shl_xt800v_relu_q15 .weak csky_vdsp2_relu_q15 -.set csky_vdsp2_relu_q15, csi_xt800v_relu_q15 +.set csky_vdsp2_relu_q15, shl_xt800v_relu_q15 diff --git a/source/i805_opt/activation/csi_xt800v_relu_q7.S b/source/i805_opt/activation/shl_xt800v_relu_q7.S similarity index 79% rename from source/i805_opt/activation/csi_xt800v_relu_q7.S rename to source/i805_opt/activation/shl_xt800v_relu_q7.S index 81cebfb7..ccf28755 100644 --- a/source/i805_opt/activation/csi_xt800v_relu_q7.S +++ b/source/i805_opt/activation/shl_xt800v_relu_q7.S @@ -17,24 +17,24 @@ */ /****************************************************************************** - * @file csi_xt800v_relu_q7.S + * @file shl_xt800v_relu_q7.S * @brief Q15 version of ReLU. * @version V1.0 * @date 01. 
June 2018 ******************************************************************************/ /* - *void csi_xt800v_relu_q7(q7_t * data, + *void shl_xt800v_relu_q7(q7_t * data, * uint8_t size) */ - .file "csi_xt800v_relu_q7.S" - .section .text.csi_xt800v_relu_q7,"ax",@progbits + .file "shl_xt800v_relu_q7.S" + .section .text.shl_xt800v_relu_q7,"ax",@progbits .align 2 - .global csi_xt800v_relu_q7 - .type csi_xt800v_relu_q7, @function + .global shl_xt800v_relu_q7 + .type shl_xt800v_relu_q7, @function -csi_xt800v_relu_q7: +shl_xt800v_relu_q7: vmovi.8 vr7, 0 lsri t0, a1, 6 bez t0, .L1 @@ -72,8 +72,8 @@ csi_xt800v_relu_q7: .L4: rts - .size csi_xt800v_relu_q7, .-csi_xt800v_relu_q7 -.weak csi_relu_q7 -.set csi_relu_q7, csi_xt800v_relu_q7 + .size shl_xt800v_relu_q7, .-shl_xt800v_relu_q7 +.weak csinn_relu_q7 +.set csinn_relu_q7, shl_xt800v_relu_q7 .weak csky_vdsp2_relu_q7 -.set csky_vdsp2_relu_q7, csi_xt800v_relu_q7 +.set csky_vdsp2_relu_q7, shl_xt800v_relu_q7 diff --git a/source/i805_opt/add.c b/source/i805_opt/add.c index 75eddf7f..e0ec8352 100644 --- a/source/i805_opt/add.c +++ b/source/i805_opt/add.c @@ -16,38 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_add_init_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_i805_add_init_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - + struct csinn_callback *cb = params->base.cb; // update multiplier and shift for s1/s3, s2/s3 - csi_quantize_multiplier(input0->qinfo->scale/output->qinfo->scale, &(input0->qinfo->multiplier), &(input0->qinfo->shift)); - csi_quantize_multiplier(input1->qinfo->scale/output->qinfo->scale, &(input1->qinfo->multiplier), &(input1->qinfo->shift)); - params->base.bc = csi_i805_add_u8; + shl_quantize_multiplier(input0->qinfo->scale / output->qinfo->scale, + &(input0->qinfo->multiplier), &(input0->qinfo->shift)); + shl_quantize_multiplier(input1->qinfo->scale / output->qinfo->scale, + &(input1->qinfo->multiplier), &(input1->qinfo->shift)); + cb->exec = shl_i805_add_u8; return CSINN_TRUE; } -int csi_i805_add_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_i805_add_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = (uint8_t *)input0->data; uint8_t *input1_data = (uint8_t *)input1->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t size = csi_tensor_size(input0); + int32_t size = csinn_tensor_size(input0); - csi_i805_elementwise_add_opt_u8(input0_data, input1_data, output_data, size, - input0->qinfo->zero_point, input0->qinfo->multiplier, -input0->qinfo->shift, - input1->qinfo->zero_point, input1->qinfo->multiplier, -input1->qinfo->shift, - output->qinfo->zero_point, output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_elementwise_add_opt_u8( + 
input0_data, input1_data, output_data, size, input0->qinfo->zero_point, + input0->qinfo->multiplier, -input0->qinfo->shift, input1->qinfo->zero_point, + input1->qinfo->multiplier, -input1->qinfo->shift, output->qinfo->zero_point, + output->qinfo->multiplier, -output->qinfo->shift); return CSINN_TRUE; } diff --git a/source/i805_opt/avgpool.c b/source/i805_opt/avgpool.c index a9d302fc..990817bc 100644 --- a/source/i805_opt/avgpool.c +++ b/source/i805_opt/avgpool.c @@ -16,19 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_i805.h" +/* CSI-NN2 version 2.0.x */ +#include "i805_function.h" +#include "shl_i805.h" /* constraint: 1.input tensor layout: NHWC 2. pad_left = pad_right; pad_top = pad_down FIXME: count_include_pad */ -static int csi_i805_avgpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_i805_avgpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; @@ -47,31 +46,33 @@ static int csi_i805_avgpool2d_q7(struct csi_tensor *input, uint16_t stride_h = params->stride_height; uint16_t stride_w = params->stride_width; - uint16_t pad_x = params->pad_left; // i.e. pad_x = params->pad_right - uint16_t pad_y = params->pad_top; // i.e. pad_y = params->pad_down + uint16_t pad_x = params->pad_left; // i.e. pad_x = params->pad_right + uint16_t pad_y = params->pad_top; // i.e. 
pad_y = params->pad_down q7_t buffer_tmp[out_h * out_w * in_c]; // buffer_size = out_h * out_w * channel - if ( (in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w) ) { + if ((in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w)) { csky_vdsp2_avepool_q7_HWC(input_data, in_h, in_c, kernel_h, pad_y, stride_h, out_h, buffer_tmp, output_data); } else { - csky_vdsp2_avepool_q7_HWC_nonsquare(input_data, in_w, in_h, in_c, kernel_w, kernel_h, - pad_x, pad_y, stride_w, stride_h, out_w, out_h, - buffer_tmp, output_data, output->qinfo->shift); + csky_vdsp2_avepool_q7_HWC_nonsquare(input_data, in_w, in_h, in_c, kernel_w, kernel_h, pad_x, + pad_y, stride_w, stride_h, out_w, out_h, buffer_tmp, + output_data, output->qinfo->shift); } return CSINN_TRUE; } -int csi_i805_avgpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { - csi_debug_warning("avgpool q7 unsupport asymmetric padddings on i805, call reference func replaced.\n"); - params->base.bc = csi_ref_avgpool2d_quant; // FIXME: csi_ref_avgpool2d_quant may be not applicable to i805 + struct csinn_callback *cb = params->base.cb; + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { + shl_debug_warning( + "avgpool q7 unsupport asymmetric padddings on i805, call reference func replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // FIXME: shl_ref_avgpool2d_quant may be not + // applicable to i805 } else { - params->base.bc = csi_i805_avgpool2d_q7; + cb->exec = shl_i805_avgpool2d_q7; } return CSINN_TRUE; } diff --git a/source/i805_opt/basic_math/csi_i805_elementwise_add_8.S b/source/i805_opt/basic_math/shl_i805_elementwise_add_8.S similarity index 92% rename from 
source/i805_opt/basic_math/csi_i805_elementwise_add_8.S rename to source/i805_opt/basic_math/shl_i805_elementwise_add_8.S index b69212f3..d874c60d 100644 --- a/source/i805_opt/basic_math/csi_i805_elementwise_add_8.S +++ b/source/i805_opt/basic_math/shl_i805_elementwise_add_8.S @@ -16,11 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_elementwise_add_8.S + * @file shl_i805_elementwise_add_8.S * @brief uint8 elementwise add layer function. * @version V1.0 * @date 9. Jul 2021 @@ -28,7 +28,7 @@ /* - void csi_i805_elementwise_add_opt_u8(uint8_t *input_0, + void shl_i805_elementwise_add_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, int32_t size, @@ -66,13 +66,13 @@ */ - .file "csi_i805_elementwise_add_8.S" - .section .text.csi_i805_elementwise_add_opt_u8,"ax",@progbits + .file "shl_i805_elementwise_add_8.S" + .section .text.shl_i805_elementwise_add_opt_u8,"ax",@progbits .align 2 - .global csi_i805_elementwise_add_opt_u8 - .type csi_i805_elementwise_add_opt_u8, @function + .global shl_i805_elementwise_add_opt_u8 + .type shl_i805_elementwise_add_opt_u8, @function -csi_i805_elementwise_add_opt_u8: +shl_i805_elementwise_add_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -168,5 +168,5 @@ csi_i805_elementwise_add_opt_u8: vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 rts - .size csi_i805_elementwise_add_opt_u8, .-csi_i805_elementwise_add_opt_u8 + .size shl_i805_elementwise_add_opt_u8, .-shl_i805_elementwise_add_opt_u8 diff --git a/source/i805_opt/basic_math/csi_i805_elementwise_mul_8.S b/source/i805_opt/basic_math/shl_i805_elementwise_mul_8.S similarity index 90% rename from source/i805_opt/basic_math/csi_i805_elementwise_mul_8.S rename to source/i805_opt/basic_math/shl_i805_elementwise_mul_8.S index 1a539407..44e3c9bb 100644 --- 
a/source/i805_opt/basic_math/csi_i805_elementwise_mul_8.S +++ b/source/i805_opt/basic_math/shl_i805_elementwise_mul_8.S @@ -16,11 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_elementwise_mul_8.S + * @file shl_i805_elementwise_mul_8.S * @brief uint8 elementwise mul layer function. * @version V1.0 * @date 9. Jul 2021 @@ -28,7 +28,7 @@ /* - void csi_i805_elementwise_mul_opt_u8(uint8_t *input_0, + void shl_i805_elementwise_mul_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, int32_t size, @@ -53,13 +53,13 @@ sp+0x18/l1: input1_zp */ - .file "csi_i805_elementwise_mul_8.S" - .section .text.csi_i805_elementwise_mul_opt_u8,"ax",@progbits + .file "shl_i805_elementwise_mul_8.S" + .section .text.shl_i805_elementwise_mul_opt_u8,"ax",@progbits .align 2 - .global csi_i805_elementwise_mul_opt_u8 - .type csi_i805_elementwise_mul_opt_u8, @function + .global shl_i805_elementwise_mul_opt_u8 + .type shl_i805_elementwise_mul_opt_u8, @function -csi_i805_elementwise_mul_opt_u8: +shl_i805_elementwise_mul_opt_u8: push l0, l1, l2, l3, l4 ld.w l0, (sp, 0x14) // input_0_zeroponit ld.w l1, (sp, 0x18) // input_1_zeropoint @@ -138,5 +138,5 @@ csi_i805_elementwise_mul_opt_u8: .END: pop l0, l1, l2, l3, l4 rts - .size csi_i805_elementwise_mul_opt_u8, .-csi_i805_elementwise_mul_opt_u8 + .size shl_i805_elementwise_mul_opt_u8, .-shl_i805_elementwise_mul_opt_u8 diff --git a/source/i805_opt/clip.c b/source/i805_opt/clip.c index f68cde01..213712a6 100644 --- a/source/i805_opt/clip.c +++ b/source/i805_opt/clip.c @@ -16,33 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_clip_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int shl_i805_clip_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { float real_scale = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); - params->base.bc = csi_i805_clip_u8; + shl_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_clip_u8; return CSINN_TRUE; } -int csi_i805_clip_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int shl_i805_clip_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t size = csi_tensor_size(input); + int32_t size = csinn_tensor_size(input); int32_t clip_qmin = floor(params->min_value / input->qinfo->scale) + input->qinfo->zero_point; int32_t clip_qmax = ceil(params->max_value / input->qinfo->scale) + input->qinfo->zero_point; - csi_i805_clip_opt_u8(input_data, output_data, size, clip_qmin, clip_qmax, input->qinfo->zero_point, output->qinfo->zero_point, + shl_i805_clip_opt_u8(input_data, output_data, size, clip_qmin, clip_qmax, + input->qinfo->zero_point, output->qinfo->zero_point, output->qinfo->multiplier, output->qinfo->shift); return CSINN_TRUE; } diff --git a/source/i805_opt/convolution.c b/source/i805_opt/convolution.c index 4cbe32aa..1fee6cd0 100644 --- a/source/i805_opt/convolution.c +++ b/source/i805_opt/convolution.c @@ -16,21 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -static int csi_i805_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -51,72 +49,71 @@ static int csi_i805_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; // e.g. pad_x = params->pad_right uint16_t pad_y = params->pad_top; // e.g. 
pad_y = params->pad_down - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_c % 4 == 0) && (out_c % 2 == 0) ) { - if ( (kernel_h == 1) && (kernel_w == 1) ) { + if ((in_c % 4 == 0) && (out_c % 2 == 0)) { + if ((kernel_h == 1) && (kernel_w == 1)) { csky_vdsp2_convolve_1x1_HWC_q7_fast(input_data, in_w, in_h, in_c, kernel_data, out_c, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } else { - csky_vdsp2_convolve_HWC_q7_fast_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, - kernel_w, kernel_h, pad_x, pad_y, stride_w, stride_h, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + csky_vdsp2_convolve_HWC_q7_fast_nonsquare( + input_data, in_w, in_h, in_c, kernel_data, out_c, kernel_w, kernel_h, pad_x, pad_y, + stride_w, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } } else if (in_c == 3) { - csky_vdsp2_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + csky_vdsp2_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, pad_y, + stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } else { csky_vdsp2_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + pad_y, stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } return CSINN_TRUE; } -static int csi_i805_conv2d_q15(struct csi_tensor *input, - struct 
csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_conv2d_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q15_t *input_data = (q15_t *)input->data; - q15_t *kernel_data = (q15_t *)kernel->data; - q15_t *bias_data = (q15_t *)bias->data; - q15_t *output_data = (q15_t *)output->data; + q15_t *input_data = (q15_t *)input->data; + q15_t *kernel_data = (q15_t *)kernel->data; + q15_t *bias_data = (q15_t *)bias->data; + q15_t *output_data = (q15_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] uint16_t out_c = output->dim[3]; - uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; - uint16_t stride = params->stride_height; // e.g. stride = params->stride_width - uint16_t padding = params->pad_top; // e.g. padding = params->down = params->left = params->right + uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; + uint16_t stride = params->stride_height; // e.g. stride = params->stride_width + uint16_t padding = + params->pad_top; // e.g. 
padding = params->down = params->left = params->right - q15_t buffer_tmp[in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csky_vdsp2_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, - kernel_size, padding, stride, bias_data, bias->qinfo->shift, + csky_vdsp2_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, + padding, stride, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, out_hw, buffer_tmp); return CSINN_TRUE; } -static int csi_i805_depthwise_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_depthwise_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -136,57 +133,59 @@ static int csi_i805_depthwise_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; uint16_t pad_y = params->pad_top; - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w) ) { - csky_vdsp2_depthwise_separable_conv_HWC_q7(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, 
bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + if ((in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w)) { + csky_vdsp2_depthwise_separable_conv_HWC_q7( + input_data, in_h, in_c, kernel_data, out_c, kernel_h, pad_y, stride_h, bias_data, + bias->qinfo->shift, output->qinfo->shift, output_data, out_h, buffer_tmp); } else { - csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, - kernel_w, kernel_h, pad_x, pad_y, stride_h, stride_w, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare( + input_data, in_w, in_h, in_c, kernel_data, out_c, kernel_w, kernel_h, pad_x, pad_y, + stride_h, stride_w, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, + out_w, out_h, buffer_tmp); } return CSINN_TRUE; } -int csi_i805_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } - if ( (input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0) ) { - if ( (input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || - (params->pad_left != params->pad_top) || (params->stride_height != params->stride_width) ) { + if ((input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0)) { + if ((input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || + (params->pad_left != params->pad_top) || + 
(params->stride_height != params->stride_width)) { flag |= 0x02; } } if (flag > 0) { - csi_debug_warning("conv2d q7 is not optimized to achieve under this condition on i805, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q7 is not optimized to achieve under this condition on i805, call reference " + "func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_i805_conv2d_q7; + cb->exec = shl_i805_conv2d_q7; } return CSINN_TRUE; } -int csi_i805_conv2d_init_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -199,57 +198,55 @@ int csi_i805_conv2d_init_q15(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("conv2d q15 is not optimized to achieve under this condition on i805, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q15 is not optimized to achieve under this condition on i805, call reference " + "func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_i805_conv2d_q15; + cb->exec = shl_i805_conv2d_q15; } return CSINN_TRUE; } -int csi_i805_depthwise_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params 
*params) +int shl_i805_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } if (flag > 0) { - csi_debug_warning("depthwise_conv2d q7 is not optimized to achieve under this condition on i805, call reference func replaced.\n"); - params->base.bc = csi_ref_depthwise_conv2d_quant; + shl_debug_warning( + "depthwise_conv2d q7 is not optimized to achieve under this condition on i805, call " + "reference func replaced.\n"); + cb->exec = shl_ref_depthwise_conv2d_quant; } else { - params->base.bc = csi_i805_depthwise_conv2d_q7; + cb->exec = shl_i805_depthwise_conv2d_q7; } return CSINN_TRUE; } - -int csi_i805_conv2d_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_conv2d_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float real_scale = input->qinfo->scale * kernel->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); - params->base.bc = csi_i805_conv2d_u8; + shl_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_conv2d_u8; return CSINN_TRUE; } - -int csi_i805_conv2d_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_conv2d_u8(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - uint8_t *input_data = (uint8_t *)input->data; - uint8_t *kernel_data = (uint8_t *)kernel->data; - int32_t *bias_data = (int32_t *)bias->data; - uint8_t *output_data = (uint8_t *)output->data; + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *kernel_data = (uint8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + uint8_t *output_data = (uint8_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -269,47 +266,43 @@ int csi_i805_conv2d_u8(struct csi_tensor *input, uint16_t pad_w = params->pad_left; uint16_t pad_h = params->pad_top; - uint8_t *buffer_tmp = csi_mem_alloc(2 * in_c * kernel_h * kernel_w); + uint8_t *buffer_tmp = shl_mem_alloc(2 * in_c * kernel_h * kernel_w); - if ( (kernel_h == 1) && (kernel_w == 1) ) { - csi_i805_pwconv2d_opt_u8(input_data, kernel_data, bias_data, output_data, in_h*in_w, in_c, out_c, - input->qinfo->zero_point, kernel->qinfo->zero_point, output->qinfo->zero_point, - output->qinfo->multiplier, -output->qinfo->shift); + if ((kernel_h == 1) && (kernel_w == 1)) { + shl_i805_pwconv2d_opt_u8(input_data, kernel_data, bias_data, output_data, in_h * in_w, in_c, + out_c, input->qinfo->zero_point, kernel->qinfo->zero_point, + output->qinfo->zero_point, output->qinfo->multiplier, + -output->qinfo->shift); } else { - csi_i805_conv2d_opt_u8(input_data, kernel_data, bias_data, output_data, buffer_tmp, - in_h, in_w, in_c, kernel_h, kernel_w, pad_h, pad_w, - stride_h, stride_w, out_h, out_w, out_c, input->qinfo->zero_point, - kernel->qinfo->zero_point, output->qinfo->zero_point, output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_conv2d_opt_u8(input_data, kernel_data, bias_data, output_data, buffer_tmp, in_h, + in_w, in_c, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + out_h, out_w, out_c, input->qinfo->zero_point, + kernel->qinfo->zero_point, output->qinfo->zero_point, + 
output->qinfo->multiplier, -output->qinfo->shift); } - csi_mem_free(buffer_tmp); + shl_mem_free(buffer_tmp); return CSINN_TRUE; } - - -int csi_i805_depthwise_conv2d_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_depthwise_conv2d_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float real_scale = input->qinfo->scale * kernel->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); - params->base.bc = csi_i805_depthwise_conv2d_u8; + shl_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_depthwise_conv2d_u8; return CSINN_TRUE; } - -int csi_i805_depthwise_conv2d_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_depthwise_conv2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - uint8_t *input_data = (uint8_t *)input->data; - uint8_t *kernel_data = (uint8_t *)kernel->data; - int32_t *bias_data = (int32_t *)bias->data; - uint8_t *output_data = (uint8_t *)output->data; + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *kernel_data = (uint8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + uint8_t *output_data = (uint8_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -318,7 +311,7 @@ int csi_i805_depthwise_conv2d_u8(struct csi_tensor *input, uint16_t out_h = output->dim[1]; uint16_t out_w = output->dim[2]; - uint16_t out_c = output->dim[3]; // assert(out_c == in_c) + uint16_t out_c = 
output->dim[3]; // assert(out_c == in_c) uint16_t kernel_h = kernel->dim[1]; uint16_t kernel_w = kernel->dim[2]; @@ -329,13 +322,14 @@ int csi_i805_depthwise_conv2d_u8(struct csi_tensor *input, uint16_t pad_w = params->pad_left; uint16_t pad_h = params->pad_top; - uint8_t *buffer_tmp = csi_mem_alloc(4 * in_c * kernel_h * kernel_w); + uint8_t *buffer_tmp = shl_mem_alloc(4 * in_c * kernel_h * kernel_w); - csi_i805_dwconv2d_opt_u8(input_data, kernel_data, bias_data, output_data, buffer_tmp, - in_h, in_w, in_c, kernel_h, kernel_w, pad_h, pad_w, - stride_h, stride_w, out_h, out_w, input->qinfo->zero_point, - kernel->qinfo->zero_point, output->qinfo->zero_point, output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_dwconv2d_opt_u8(input_data, kernel_data, bias_data, output_data, buffer_tmp, in_h, + in_w, in_c, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + out_h, out_w, input->qinfo->zero_point, kernel->qinfo->zero_point, + output->qinfo->zero_point, output->qinfo->multiplier, + -output->qinfo->shift); - csi_mem_free(buffer_tmp); + shl_mem_free(buffer_tmp); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/i805_opt/convolution/csi_i805_convolution_1x1_8.S b/source/i805_opt/convolution/shl_i805_convolution_1x1_8.S similarity index 95% rename from source/i805_opt/convolution/csi_i805_convolution_1x1_8.S rename to source/i805_opt/convolution/shl_i805_convolution_1x1_8.S index e3d7c1bd..db6a8b9d 100644 --- a/source/i805_opt/convolution/csi_i805_convolution_1x1_8.S +++ b/source/i805_opt/convolution/shl_i805_convolution_1x1_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_depthwise_convolution_3x3_8.S + * @file shl_i805_depthwise_convolution_3x3_8.S * @brief uint8 pointwise convolution layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_pwconv2d_opt_u8(uint8_t * input_data, + void shl_i805_pwconv2d_opt_u8(uint8_t * input_data, uint8_t * kernel_data int32_t * bias_data, uint8_t * output_data, @@ -73,13 +73,13 @@ */ - .file "csi_i805_convolution_1x1_8.S" - .section .text.csi_i805_pwconv2d_opt_u8,"ax",@progbits + .file "shl_i805_convolution_1x1_8.S" + .section .text.shl_i805_pwconv2d_opt_u8,"ax",@progbits .align 2 - .global csi_i805_pwconv2d_opt_u8 - .type csi_i805_pwconv2d_opt_u8, @function + .global shl_i805_pwconv2d_opt_u8 + .type shl_i805_pwconv2d_opt_u8, @function -csi_i805_pwconv2d_opt_u8: +shl_i805_pwconv2d_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -294,4 +294,4 @@ csi_i805_pwconv2d_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_i805_pwconv2d_opt_u8, .-csi_i805_pwconv2d_opt_u8 + .size shl_i805_pwconv2d_opt_u8, .-shl_i805_pwconv2d_opt_u8 diff --git a/source/i805_opt/convolution/csi_i805_convolution_8.S b/source/i805_opt/convolution/shl_i805_convolution_8.S similarity index 97% rename from source/i805_opt/convolution/csi_i805_convolution_8.S rename to source/i805_opt/convolution/shl_i805_convolution_8.S index 4e61cb9e..63a766e2 100644 --- a/source/i805_opt/convolution/csi_i805_convolution_8.S +++ b/source/i805_opt/convolution/shl_i805_convolution_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_convolution_8.S + * @file shl_i805_convolution_8.S * @brief uint8 basic convolution layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_conv2d_opt_u8(uint8_t * input_data, + void shl_i805_conv2d_opt_u8(uint8_t * input_data, uint8_t * kernel_data, int32_t * bias_data, uint8_t * output_data, @@ -86,13 +86,13 @@ */ - .file "csi_i805_convolution_8.S" - .section .text.csi_i805_conv2d_opt_u8,"ax",@progbits + .file "shl_i805_convolution_8.S" + .section .text.shl_i805_conv2d_opt_u8,"ax",@progbits .align 2 - .global csi_i805_conv2d_opt_u8 - .type csi_i805_conv2d_opt_u8, @function + .global shl_i805_conv2d_opt_u8 + .type shl_i805_conv2d_opt_u8, @function -csi_i805_conv2d_opt_u8: +shl_i805_conv2d_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -422,4 +422,4 @@ csi_i805_conv2d_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_i805_conv2d_opt_u8, .-csi_i805_conv2d_opt_u8 + .size shl_i805_conv2d_opt_u8, .-shl_i805_conv2d_opt_u8 diff --git a/source/i805_opt/convolution/csi_i805_depthwise_convolution_8.S b/source/i805_opt/convolution/shl_i805_depthwise_convolution_8.S similarity index 98% rename from source/i805_opt/convolution/csi_i805_depthwise_convolution_8.S rename to source/i805_opt/convolution/shl_i805_depthwise_convolution_8.S index 6116347b..81f4be19 100644 --- a/source/i805_opt/convolution/csi_i805_depthwise_convolution_8.S +++ b/source/i805_opt/convolution/shl_i805_depthwise_convolution_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_depthwise_convolution_8.S + * @file shl_i805_depthwise_convolution_8.S * @brief uint8 depthwise convolution layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_dwconv2d_opt_u8(uint8_t * input_data, + void shl_i805_dwconv2d_opt_u8(uint8_t * input_data, uint8_t * kernel_data int32_t * bias_data, uint8_t * output_data, @@ -73,13 +73,13 @@ */ - .file "csi_i805_depthwise_convolution_8.S" - .section .text.csi_i805_dwconv2d_opt_u8,"ax",@progbits + .file "shl_i805_depthwise_convolution_8.S" + .section .text.shl_i805_dwconv2d_opt_u8,"ax",@progbits .align 2 - .global csi_i805_dwconv2d_opt_u8 - .type csi_i805_dwconv2d_opt_u8, @function + .global shl_i805_dwconv2d_opt_u8 + .type shl_i805_dwconv2d_opt_u8, @function -csi_i805_dwconv2d_opt_u8: +shl_i805_dwconv2d_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -714,4 +714,4 @@ csi_i805_dwconv2d_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_i805_dwconv2d_opt_u8, .-csi_i805_dwconv2d_opt_u8 + .size shl_i805_dwconv2d_opt_u8, .-shl_i805_dwconv2d_opt_u8 diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_1x1_HWC_q7_fast.S b/source/i805_opt/convolution/shl_xt800v_convolve_1x1_HWC_q7_fast.S similarity index 94% rename from source/i805_opt/convolution/csi_xt800v_convolve_1x1_HWC_q7_fast.S rename to source/i805_opt/convolution/shl_xt800v_convolve_1x1_HWC_q7_fast.S index 37fd1bd5..47958fb6 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_1x1_HWC_q7_fast.S +++ b/source/i805_opt/convolution/shl_xt800v_convolve_1x1_HWC_q7_fast.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_1x1_HWC_q7_fast.S + * @file shl_xt800v_convolve_1x1_HWC_q7_fast.S * @brief Fast Q7 vresion of 1x1 convolution (non-square shape). * @version V1.0 * @date 05. 
June 2018 ******************************************************************************/ /* - * void csi_xt800v_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, + * void shl_xt800v_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, * const uint16_t dim_im_in_x, * const uint16_t dim_im_in_y, * const uint16_t ch_im_in, @@ -40,13 +40,13 @@ * */ - .file "csi_xt800v_convolve_1x1_HWC_q7_fast.S" - .section .text.csi_xt800v_convolve_HWC_q7_fast,"ax",@progbits + .file "shl_xt800v_convolve_1x1_HWC_q7_fast.S" + .section .text.shl_xt800v_convolve_HWC_q7_fast,"ax",@progbits .align 2 - .global csi_xt800v_convolve_1x1_HWC_q7_fast - .type csi_xt800v_convolve_1x1_HWC_q7_fast, @function + .global shl_xt800v_convolve_1x1_HWC_q7_fast + .type shl_xt800v_convolve_1x1_HWC_q7_fast, @function -csi_xt800v_convolve_1x1_HWC_q7_fast: +shl_xt800v_convolve_1x1_HWC_q7_fast: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -326,9 +326,7 @@ csi_xt800v_convolve_1x1_HWC_q7_fast: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_convolve_1x1_HWC_q7_fast, .-csi_xt800v_convolve_1x1_HWC_q7_fast + .size shl_xt800v_convolve_1x1_HWC_q7_fast, .-shl_xt800v_convolve_1x1_HWC_q7_fast -.weak csi_convolve_1x1_HWC_q7_fast -.set csi_convolve_1x1_HWC_q7_fast, csi_xt800v_convolve_1x1_HWC_q7_fast .weak csky_vdsp2_convolve_1x1_HWC_q7_fast -.set csky_vdsp2_convolve_1x1_HWC_q7_fast, csi_xt800v_convolve_1x1_HWC_q7_fast +.set csky_vdsp2_convolve_1x1_HWC_q7_fast, shl_xt800v_convolve_1x1_HWC_q7_fast diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q15_basic.S b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q15_basic.S similarity index 94% rename from source/i805_opt/convolution/csi_xt800v_convolve_HWC_q15_basic.S rename to source/i805_opt/convolution/shl_xt800v_convolve_HWC_q15_basic.S index 2d365f1a..b841da09 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q15_basic.S +++ 
b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q15_basic.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_HWC_q15_basic.S + * @file shl_xt800v_convolve_HWC_q15_basic.S * @brief Q15 vresion of convolution. * @version V1.0 * @date 04. June 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_convolve_HWC_q15_basic(const q15_t * Im_in, + * shl_xt800v_status + * shl_xt800v_convolve_HWC_q15_basic(const q15_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q15_t * wt, @@ -41,13 +41,13 @@ * q15_t * bufferA) */ - .file "csi_xt800v_convolve_HWC_q15_basic.S" - .section .text.csi_xt800v_convolve_HWC_q15_basic,"ax",@progbits + .file "shl_xt800v_convolve_HWC_q15_basic.S" + .section .text.shl_xt800v_convolve_HWC_q15_basic,"ax",@progbits .align 2 - .global csi_xt800v_convolve_HWC_q15_basic - .type csi_xt800v_convolve_HWC_q15_basic, @function + .global shl_xt800v_convolve_HWC_q15_basic + .type shl_xt800v_convolve_HWC_q15_basic, @function -csi_xt800v_convolve_HWC_q15_basic: +shl_xt800v_convolve_HWC_q15_basic: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -315,9 +315,7 @@ csi_xt800v_convolve_HWC_q15_basic: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_convolve_HWC_q15_basic, .-csi_xt800v_convolve_HWC_q15_basic + .size shl_xt800v_convolve_HWC_q15_basic, .-shl_xt800v_convolve_HWC_q15_basic -.weak csi_convolve_HWC_q15_basic -.set csi_convolve_HWC_q15_basic, csi_xt800v_convolve_HWC_q15_basic .weak csky_vdsp2_convolve_HWC_q15_basic -.set csky_vdsp2_convolve_HWC_q15_basic, csi_xt800v_convolve_HWC_q15_basic +.set csky_vdsp2_convolve_HWC_q15_basic, shl_xt800v_convolve_HWC_q15_basic diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_RGB.S 
b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_RGB.S similarity index 94% rename from source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_RGB.S rename to source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_RGB.S index 2b055d0a..571d7c16 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_RGB.S +++ b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_RGB.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_HWC_q7_RGB.S + * @file shl_xt800v_convolve_HWC_q7_RGB.S * @brief Q7 vresion of convolution for RGB image. * @version V1.0 * @date 04. june 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_convolve_HWC_q7_RGB(const q7_t * Im_in, + * shl_xt800v_status + * shl_xt800v_convolve_HWC_q7_RGB(const q7_t * Im_in, * const uint16_t dim_im_in, * const q7_t * wt, * const uint16_t ch_im_out, @@ -40,13 +40,13 @@ * q15_t * bufferA) */ - .file "csi_xt800v_convolve_HWC_q7_RGB.S" - .section .text.csi_xt800v_convolve_HWC_q7_RGB,"ax",@progbits + .file "shl_xt800v_convolve_HWC_q7_RGB.S" + .section .text.shl_xt800v_convolve_HWC_q7_RGB,"ax",@progbits .align 2 - .global csi_xt800v_convolve_HWC_q7_RGB - .type csi_xt800v_convolve_HWC_q7_RGB, @function + .global shl_xt800v_convolve_HWC_q7_RGB + .type shl_xt800v_convolve_HWC_q7_RGB, @function -csi_xt800v_convolve_HWC_q7_RGB: +shl_xt800v_convolve_HWC_q7_RGB: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -361,9 +361,7 @@ csi_xt800v_convolve_HWC_q7_RGB: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_convolve_HWC_q7_RGB, .-csi_xt800v_convolve_HWC_q7_RGB + .size shl_xt800v_convolve_HWC_q7_RGB, .-shl_xt800v_convolve_HWC_q7_RGB -.weak csi_convolve_HWC_q7_RGB -.set csi_convolve_HWC_q7_RGB, csi_xt800v_convolve_HWC_q7_RGB .weak 
csky_vdsp2_convolve_HWC_q7_RGB -.set csky_vdsp2_convolve_HWC_q7_RGB, csi_xt800v_convolve_HWC_q7_RGB +.set csky_vdsp2_convolve_HWC_q7_RGB, shl_xt800v_convolve_HWC_q7_RGB diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_basic.S b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_basic.S similarity index 94% rename from source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_basic.S rename to source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_basic.S index c371f69f..8993264d 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_basic.S +++ b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_basic.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_HWC_q7_basic.S + * @file shl_xt800v_convolve_HWC_q7_basic.S * @brief Q7 vresion of convolution. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_convolve_HWC_q7_basic(const q7_t * Im_in, + * shl_xt800v_status + * shl_xt800v_convolve_HWC_q7_basic(const q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q7_t * wt, @@ -41,13 +41,13 @@ * q15_t * bufferA) */ - .file "csi_xt800v_convolve_HWC_q7_basic.S" - .section .text.csi_xt800v_convolve_HWC_q7_basic,"ax",@progbits + .file "shl_xt800v_convolve_HWC_q7_basic.S" + .section .text.shl_xt800v_convolve_HWC_q7_basic,"ax",@progbits .align 2 - .global csi_xt800v_convolve_HWC_q7_basic - .type csi_xt800v_convolve_HWC_q7_basic, @function + .global shl_xt800v_convolve_HWC_q7_basic + .type shl_xt800v_convolve_HWC_q7_basic, @function -csi_xt800v_convolve_HWC_q7_basic: +shl_xt800v_convolve_HWC_q7_basic: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -390,9 +390,7 @@ csi_xt800v_convolve_HWC_q7_basic: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - 
.size csi_xt800v_convolve_HWC_q7_basic, .-csi_xt800v_convolve_HWC_q7_basic + .size shl_xt800v_convolve_HWC_q7_basic, .-shl_xt800v_convolve_HWC_q7_basic -.weak csi_convolve_HWC_q7_basic -.set csi_convolve_HWC_q7_basic, csi_xt800v_convolve_HWC_q7_basic .weak csky_vdsp2_convolve_HWC_q7_basic -.set csky_vdsp2_convolve_HWC_q7_basic, csi_xt800v_convolve_HWC_q7_basic +.set csky_vdsp2_convolve_HWC_q7_basic, shl_xt800v_convolve_HWC_q7_basic diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_fast_nonsquare.S b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_fast_nonsquare.S similarity index 98% rename from source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_fast_nonsquare.S rename to source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_fast_nonsquare.S index 6d83ceec..c788f316 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_fast_nonsquare.S +++ b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_fast_nonsquare.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_HWC_q7_fast_nonsquare.S + * @file shl_xt800v_convolve_HWC_q7_fast_nonsquare.S * @brief Fast Q7 vresion of convolution (non-square shape). * @version V1.0 * @date 05. 
June 2018 ******************************************************************************/ /* - * csi_xt800v_status csi_xt800v_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, + * shl_xt800v_status shl_xt800v_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, * const uint16_t dim_im_in_x, * const uint16_t dim_im_in_y, * const uint16_t ch_im_in, @@ -46,13 +46,13 @@ * */ - .file "csi_xt800v_convolve_HWC_q7_fast_nonsquare.S" - .section .text.csi_xt800v_convolve_HWC_q7_fast_nonsquare,"ax",@progbits + .file "shl_xt800v_convolve_HWC_q7_fast_nonsquare.S" + .section .text.shl_xt800v_convolve_HWC_q7_fast_nonsquare,"ax",@progbits .align 2 - .global csi_xt800v_convolve_HWC_q7_fast_nonsquare - .type csi_xt800v_convolve_HWC_q7_fast_nonsquare, @function + .global shl_xt800v_convolve_HWC_q7_fast_nonsquare + .type shl_xt800v_convolve_HWC_q7_fast_nonsquare, @function -csi_xt800v_convolve_HWC_q7_fast_nonsquare: +shl_xt800v_convolve_HWC_q7_fast_nonsquare: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -1427,9 +1427,7 @@ csi_xt800v_convolve_HWC_q7_fast_nonsquare: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_convolve_HWC_q7_fast_nonsquare, .-csi_xt800v_convolve_HWC_q7_fast_nonsquare + .size shl_xt800v_convolve_HWC_q7_fast_nonsquare, .-shl_xt800v_convolve_HWC_q7_fast_nonsquare -.weak csi_convolve_HWC_q7_fast_nonsquare -.set csi_convolve_HWC_q7_fast_nonsquare, csi_xt800v_convolve_HWC_q7_fast_nonsquare .weak csky_vdsp2_convolve_HWC_q7_fast_nonsquare -.set csky_vdsp2_convolve_HWC_q7_fast_nonsquare, csi_xt800v_convolve_HWC_q7_fast_nonsquare +.set csky_vdsp2_convolve_HWC_q7_fast_nonsquare, shl_xt800v_convolve_HWC_q7_fast_nonsquare diff --git a/source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7.S b/source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7.S similarity index 93% rename from 
source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7.S rename to source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7.S index a819b757..aeb5fc9c 100644 --- a/source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7.S +++ b/source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800v_depthwise_separable_conv_HWC_q7.S + * @file shl_xt800v_depthwise_separable_conv_HWC_q7.S * @brief Q7 depthwise separable convolution function. * @version V1.0 * @date 05. June 2018 ******************************************************************************/ /* - *csi_xt800v_status csi_xt800v_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, + *shl_xt800v_status shl_xt800v_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q7_t * wt, @@ -40,13 +40,13 @@ * q15_t * bufferA) */ - .file "csi_xt800v_depthwise_separable_conv_HWC_q7.S" - .section .text.csi_xt800v_depthwise_separatable_conv_HWC_q7,"ax",@progbits + .file "shl_xt800v_depthwise_separable_conv_HWC_q7.S" + .section .text.shl_xt800v_depthwise_separatable_conv_HWC_q7,"ax",@progbits .align 2 - .global csi_xt800v_depthwise_separable_conv_HWC_q7 - .type csi_xt800v_depthwise_separable_conv_HWC_q7, @function + .global shl_xt800v_depthwise_separable_conv_HWC_q7 + .type shl_xt800v_depthwise_separable_conv_HWC_q7, @function -csi_xt800v_depthwise_separable_conv_HWC_q7: +shl_xt800v_depthwise_separable_conv_HWC_q7: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -319,9 +319,7 @@ csi_xt800v_depthwise_separable_conv_HWC_q7: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_depthwise_separable_conv_HWC_q7, .-csi_xt800v_depthwise_separable_conv_HWC_q7 + .size 
shl_xt800v_depthwise_separable_conv_HWC_q7, .-shl_xt800v_depthwise_separable_conv_HWC_q7 -.weak csi_depthwise_separable_conv_HWC_q7 -.set csi_depthwise_separable_conv_HWC_q7, csi_xt800v_depthwise_separable_conv_HWC_q7 .weak csky_vdsp2_depthwise_separable_conv_HWC_q7 -.set csky_vdsp2_depthwise_separable_conv_HWC_q7, csi_xt800v_depthwise_separable_conv_HWC_q7 +.set csky_vdsp2_depthwise_separable_conv_HWC_q7, shl_xt800v_depthwise_separable_conv_HWC_q7 diff --git a/source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S b/source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S similarity index 93% rename from source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S rename to source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S index c5409ea4..ca39a348 100644 --- a/source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S +++ b/source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S + * @file shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S * @brief Q7 depthwise separatble convolution function (non-square shape). * @version V1.0 * @date 05. 
June 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, + * shl_xt800v_status + * shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, * const uint16_t dim_im_in_x, * const uint16_t dim_im_in_y, * const uint16_t ch_im_in, @@ -47,13 +47,13 @@ * */ - .file "csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S" - .section .text.csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare,"ax",@progbits + .file "shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S" + .section .text.shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare,"ax",@progbits .align 2 - .global csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare - .type csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare, @function + .global shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare + .type shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare, @function -csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare: +shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -338,9 +338,7 @@ csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare, .-csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare + .size shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare, .-shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare -.weak csi_depthwise_separable_conv_HWC_q7_nonsquare -.set csi_depthwise_separable_conv_HWC_q7_nonsquare, csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare .weak csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare -.set csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare, csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare +.set csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare, 
shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare diff --git a/source/i805_opt/fully-connect/csi_i805_fullyconnected_8.S b/source/i805_opt/fully-connect/shl_i805_fullyconnected_8.S similarity index 94% rename from source/i805_opt/fully-connect/csi_i805_fullyconnected_8.S rename to source/i805_opt/fully-connect/shl_i805_fullyconnected_8.S index 5711f92d..6dd8457c 100644 --- a/source/i805_opt/fully-connect/csi_i805_fullyconnected_8.S +++ b/source/i805_opt/fully-connect/shl_i805_fullyconnected_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_fullyconnected_8.S + * @file shl_i805_fullyconnected_8.S * @brief uint8 basic fully-connected layer function. * @version V1.0 * @date 9. Jul 2021 ******************************************************************************/ /* - void csi_i805_fullyconnected_opt_u8(uint8_t * input_data, + void shl_i805_fullyconnected_opt_u8(uint8_t * input_data, uint8_t * weight_data, int32_t * bias_data, uint8_t * output_data, @@ -60,13 +60,13 @@ */ - .file "csi_i805_fullyconnected_8.S" - .section .text.csi_i805_fullyconnected_opt_u8,"ax",@progbits + .file "shl_i805_fullyconnected_8.S" + .section .text.shl_i805_fullyconnected_opt_u8,"ax",@progbits .align 2 - .global csi_i805_fullyconnected_opt_u8 - .type csi_i805_fullyconnected_opt_u8, @function + .global shl_i805_fullyconnected_opt_u8 + .type shl_i805_fullyconnected_opt_u8, @function -csi_i805_fullyconnected_opt_u8: +shl_i805_fullyconnected_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -251,4 +251,4 @@ csi_i805_fullyconnected_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_i805_fullyconnected_opt_u8, .-csi_i805_fullyconnected_opt_u8 + .size shl_i805_fullyconnected_opt_u8, .-shl_i805_fullyconnected_opt_u8 diff --git 
a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_mat_q7_vec_q15.S b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_mat_q7_vec_q15.S similarity index 93% rename from source/i805_opt/fully-connect/csi_xt800v_fully_connected_mat_q7_vec_q15.S rename to source/i805_opt/fully-connect/shl_xt800v_fully_connected_mat_q7_vec_q15.S index 04ff897e..c76238f7 100644 --- a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_mat_q7_vec_q15.S +++ b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_mat_q7_vec_q15.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_fully_connected_mat_q7_vec_q15.S + * @file shl_xt800v_fully_connected_mat_q7_vec_q15.S * @brief Mixed Q15-Q7 fully-connected layer function. * @version V1.0 * @date 31. May 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_fully_connected_mat_q7_vec_q15(const q15_t * pV, + * shl_xt800v_status + * shl_xt800v_fully_connected_mat_q7_vec_q15(const q15_t * pV, * const q7_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q15_t * pOut) */ - .file "csi_xt800v_fully_connected_mat_q7_vec_q15.S" - .section .text.csi_xt800v_fully_connected_mat_q7_vec_q15,"ax",@progbits + .file "shl_xt800v_fully_connected_mat_q7_vec_q15.S" + .section .text.shl_xt800v_fully_connected_mat_q7_vec_q15,"ax",@progbits .align 2 - .global csi_xt800v_fully_connected_mat_q7_vec_q15 - .type csi_xt800v_fully_connected_mat_q7_vec_q15, @function + .global shl_xt800v_fully_connected_mat_q7_vec_q15 + .type shl_xt800v_fully_connected_mat_q7_vec_q15, @function -csi_xt800v_fully_connected_mat_q7_vec_q15: +shl_xt800v_fully_connected_mat_q7_vec_q15: push l0, l1, l2, l3, l4, l5, l6 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -330,8 +330,7 @@ csi_xt800v_fully_connected_mat_q7_vec_q15: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6 - 
.size csi_xt800v_fully_connected_mat_q7_vec_q15, .-csi_xt800v_fully_connected_mat_q7_vec_q15 -.weak csi_fully_connected_mat_q7_vec_q15 -.set csi_fully_connected_mat_q7_vec_q15, csi_xt800v_fully_connected_mat_q7_vec_q15 + .size shl_xt800v_fully_connected_mat_q7_vec_q15, .-shl_xt800v_fully_connected_mat_q7_vec_q15 + .weak csky_vdsp2_fully_connected_mat_q7_vec_q15 -.set csky_vdsp2_fully_connected_mat_q7_vec_q15, csi_xt800v_fully_connected_mat_q7_vec_q15 +.set csky_vdsp2_fully_connected_mat_q7_vec_q15, shl_xt800v_fully_connected_mat_q7_vec_q15 diff --git a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_q15.S b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_q15.S similarity index 90% rename from source/i805_opt/fully-connect/csi_xt800v_fully_connected_q15.S rename to source/i805_opt/fully-connect/shl_xt800v_fully_connected_q15.S index ae54ba1b..3796b62d 100644 --- a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_q15.S +++ b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_q15.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_fully_connected_q15.S + * @file shl_xt800v_fully_connected_q15.S * @brief Q15 basic fully-connected layer function. * @version V1.0 * @date 31. 
May 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_fully_connected_q15(const q15_t * pV, + * shl_xt800v_status + * shl_xt800v_fully_connected_q15(const q15_t * pV, * const q15_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q15_t * pOut) */ - .file "csi_xt800v_fully_connected_q15.S" - .section .text.csi_xt800v_fully_connected_q15,"ax",@progbits + .file "shl_xt800v_fully_connected_q15.S" + .section .text.shl_xt800v_fully_connected_q15,"ax",@progbits .align 2 - .global csi_xt800v_fully_connected_q15 - .type csi_xt800v_fully_connected_q15, @function + .global shl_xt800v_fully_connected_q15 + .type shl_xt800v_fully_connected_q15, @function -csi_xt800v_fully_connected_q15: +shl_xt800v_fully_connected_q15: push l0, l1, l2, l3, l4, l5, l6 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -197,8 +197,7 @@ csi_xt800v_fully_connected_q15: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6 - .size csi_xt800v_fully_connected_q15, .-csi_xt800v_fully_connected_q15 -.weak csi_fully_connected_q15 -.set csi_fully_connected_q15, csi_xt800v_fully_connected_q15 + .size shl_xt800v_fully_connected_q15, .-shl_xt800v_fully_connected_q15 + .weak csky_vdsp2_fully_connected_q15 -.set csky_vdsp2_fully_connected_q15, csi_xt800v_fully_connected_q15 +.set csky_vdsp2_fully_connected_q15, shl_xt800v_fully_connected_q15 diff --git a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_q7x4.S b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_q7x4.S similarity index 89% rename from source/i805_opt/fully-connect/csi_xt800v_fully_connected_q7x4.S rename to source/i805_opt/fully-connect/shl_xt800v_fully_connected_q7x4.S index 93af45f8..6363b240 100644 --- a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_q7x4.S +++ b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_q7x4.S @@ -17,15 +17,15 @@ */ 
/****************************************************************************** - * @file csi_xt800v_fully_connected_q7.S + * @file shl_xt800v_fully_connected_q7.S * @brief Q7 basic fully-connected layer function. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_fully_connected_q7(const q7_t * pV, + * shl_xt800v_status + * shl_xt800v_fully_connected_q7(const q7_t * pV, * const q7_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q7_t * pOut) */ - .file "csi_xt800v_fully_connected_q7.S" - .section .text.csi_xt800v_fully_connected_q7,"ax",@progbits + .file "shl_xt800v_fully_connected_q7.S" + .section .text.shl_xt800v_fully_connected_q7,"ax",@progbits .align 2 - .global csi_xt800v_fully_connected_q7 - .type csi_xt800v_fully_connected_q7, @function + .global shl_xt800v_fully_connected_q7 + .type shl_xt800v_fully_connected_q7, @function -csi_xt800v_fully_connected_q7: +shl_xt800v_fully_connected_q7: push l0, l1, l2, l3, l4, l5, l6 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -174,8 +174,7 @@ csi_xt800v_fully_connected_q7: vldmu.8 vr12-vr12, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6 - .size csi_xt800v_fully_connected_q7, .-csi_xt800v_fully_connected_q7 -.weak csi_fully_connected_q7 -.set csi_fully_connected_q7, csi_xt800v_fully_connected_q7 + .size shl_xt800v_fully_connected_q7, .-shl_xt800v_fully_connected_q7 + .weak csky_vdsp2_fully_connected_q7 -.set csky_vdsp2_fully_connected_q7, csi_xt800v_fully_connected_q7 +.set csky_vdsp2_fully_connected_q7, shl_xt800v_fully_connected_q7 diff --git a/source/i805_opt/fullyconnected.c b/source/i805_opt/fullyconnected.c index 284ac4d5..29280c63 100644 --- a/source/i805_opt/fullyconnected.c +++ b/source/i805_opt/fullyconnected.c @@ -16,17 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_i805.h" +/* CSI-NN2 version 2.0.x */ +#include "i805_function.h" +#include "shl_i805.h" // contraints: input->dim[0] = 1 -int csi_i805_fullyconnected_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *weight_data = (q7_t *)weights->data; @@ -38,11 +36,9 @@ int csi_i805_fullyconnected_q7(struct csi_tensor *input, return CSINN_TRUE; } -int csi_i805_fullyconnected_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *weight_data = (q15_t *)weights->data; @@ -50,40 +46,38 @@ int csi_i805_fullyconnected_q15(struct csi_tensor *input, q15_t *output_data = (q15_t *)output->data; csky_vdsp2_fully_connected_q15(input_data, weight_data, input->dim[1], weights->dim[0], - bias->qinfo->shift, output->qinfo->shift, bias_data, output_data); + bias->qinfo->shift, output->qinfo->shift, bias_data, + output_data); return CSINN_TRUE; } - -int csi_i805_fullyconnected_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_fullyconnected_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { float real_scale = input->qinfo->scale * weights->qinfo->scale / output->qinfo->scale; - 
csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); - params->base.bc = csi_i805_fullyconnected_u8; + shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_fullyconnected_u8; return CSINN_TRUE; } -int csi_i805_fullyconnected_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_fullyconnected_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *weights_data = (uint8_t *)weights->data; int32_t *bias_data = (int32_t *)bias->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t in_nodes = input->dim[1]; // i.e. in_nodes = weights->dim[1] + int32_t in_nodes = input->dim[1]; // i.e. in_nodes = weights->dim[1] int32_t out_nodes = weights->dim[0]; - csi_i805_fullyconnected_opt_u8(input_data, weights_data, bias_data, output_data, in_nodes, out_nodes, - input->qinfo->zero_point, weights->qinfo->zero_point, output->qinfo->zero_point, - output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_fullyconnected_opt_u8(input_data, weights_data, bias_data, output_data, in_nodes, + out_nodes, input->qinfo->zero_point, weights->qinfo->zero_point, + output->qinfo->zero_point, output->qinfo->multiplier, + -output->qinfo->shift); return CSINN_FALSE; } diff --git a/source/i805_opt/gemm/csi_i805_mat_mult_nt_t_8.S b/source/i805_opt/gemm/shl_i805_mat_mult_nt_t_8.S similarity index 95% rename from source/i805_opt/gemm/csi_i805_mat_mult_nt_t_8.S rename to source/i805_opt/gemm/shl_i805_mat_mult_nt_t_8.S index d29b7282..ddabc20c 100644 --- a/source/i805_opt/gemm/csi_i805_mat_mult_nt_t_8.S +++ b/source/i805_opt/gemm/shl_i805_mat_mult_nt_t_8.S @@ -16,10 +16,10 @@ * limitations under the 
License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_mat_mult_nt_t_8.S + * @file shl_i805_mat_mult_nt_t_8.S * @brief uint8 genenal matrix-multiplication(A * B_trans) function. * @version V1.0 * @date 9. Jul 2021 @@ -27,7 +27,7 @@ /* - void csi_i805_mat_mult_nt_t_opt_u8(uint8_t * lhs, // input + void shl_i805_mat_mult_nt_t_opt_u8(uint8_t * lhs, // input uint8_t * rhs, // kernel int32_t * bias, uint8_t * dst, @@ -66,13 +66,13 @@ */ - .file "csi_i805_mat_mult_nt_t_8.S" - .section .text.csi_i805_mat_mult_nt_t_opt_u8,"ax",@progbits + .file "shl_i805_mat_mult_nt_t_8.S" + .section .text.shl_i805_mat_mult_nt_t_opt_u8,"ax",@progbits .align 2 - .global csi_i805_mat_mult_nt_t_opt_u8 - .type csi_i805_mat_mult_nt_t_opt_u8, @function + .global shl_i805_mat_mult_nt_t_opt_u8 + .type shl_i805_mat_mult_nt_t_opt_u8, @function -csi_i805_mat_mult_nt_t_opt_u8: +shl_i805_mat_mult_nt_t_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -288,4 +288,4 @@ csi_i805_mat_mult_nt_t_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_i805_mat_mult_nt_t_opt_u8, .-csi_i805_mat_mult_nt_t_opt_u8 + .size shl_i805_mat_mult_nt_t_opt_u8, .-shl_i805_mat_mult_nt_t_opt_u8 diff --git a/source/i805_opt/gemm/csi_i805_vec_mat_mult_8.S b/source/i805_opt/gemm/shl_i805_vec_mat_mult_8.S similarity index 94% rename from source/i805_opt/gemm/csi_i805_vec_mat_mult_8.S rename to source/i805_opt/gemm/shl_i805_vec_mat_mult_8.S index 1fd7d64a..c2fd800d 100644 --- a/source/i805_opt/gemm/csi_i805_vec_mat_mult_8.S +++ b/source/i805_opt/gemm/shl_i805_vec_mat_mult_8.S @@ -16,10 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_vec_mat_mult_8.S + * @file shl_i805_vec_mat_mult_8.S * @brief uint8 vector(lhs) matrix(transpose) multiplication function. * @version V1.0 * @date 9. Jul 2021 @@ -27,7 +27,7 @@ /* - void csi_i805_vec_mat_mult_opt_u8(uint8_t * lhs, + void shl_i805_vec_mat_mult_opt_u8(uint8_t * lhs, uint8_t * rhs, int32_t * bias, uint8_t * dst, @@ -57,13 +57,13 @@ */ - .file "csi_i805_vec_mat_mult_8.S" - .section .text.csi_i805_vec_mat_mult_opt_u8,"ax",@progbits + .file "shl_i805_vec_mat_mult_8.S" + .section .text.shl_i805_vec_mat_mult_opt_u8,"ax",@progbits .align 2 - .global csi_i805_vec_mat_mult_opt_u8 - .type csi_i805_vec_mat_mult_opt_u8, @function + .global shl_i805_vec_mat_mult_opt_u8 + .type shl_i805_vec_mat_mult_opt_u8, @function -csi_i805_vec_mat_mult_opt_u8: +shl_i805_vec_mat_mult_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -246,4 +246,4 @@ csi_i805_vec_mat_mult_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_i805_vec_mat_mult_opt_u8, .-csi_i805_vec_mat_mult_opt_u8 + .size shl_i805_vec_mat_mult_opt_u8, .-shl_i805_vec_mat_mult_opt_u8 diff --git a/source/i805_opt/i805_function.h b/source/i805_opt/i805_function.h new file mode 100644 index 00000000..86e60836 --- /dev/null +++ b/source/i805_opt/i805_function.h @@ -0,0 +1,1081 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: csi_nnfunctions.h + * Description: Public header file for CSI NN Library + * + * -------------------------------------------------------------------- */ + +#ifndef SOURCE_I805_OPT_I805_FUNCTION_H_ +#define SOURCE_I805_OPT_I805_FUNCTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> + +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. + */ +typedef int16_t q15_t; + +/** + * @brief 32-bit fractional data type in 1.31 format.
+ */ +typedef int32_t q31_t; + +/** + * @brief u8 asym quant generic convolution optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in,out] bufferA pointer to buffer for input/im2col data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * bufferA size: 2*input_ch*kernel_h*kernel_w + */ +void shl_i805_conv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, uint8_t *bufferA, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, + int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, + int32_t out_h, int32_t out_w, int32_t out_c, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, int32_t out_mult, + int32_t out_shift); + +/** + * @brief u8 asym quant 1x1 kernel_size convolution (pointwise convolution) optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] input_hxw input height mul width + * @param[in] input_ch input channel + * @param[in] output_ch output_channel + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * + */ +void shl_i805_pwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, int32_t input_hxw, int32_t input_ch, + int32_t output_ch, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant depthwise convolution optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in,out] bufferA pointer to buffer for input/im2col data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * bufferA size: 4*input_ch*kernel_h*kernel_w + */ +void shl_i805_dwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, uint8_t *bufferA, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, + int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, + int32_t out_h, int32_t out_w, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant depthwise convolution 3x3 kernel_size and 1 stride optimized function + * @param[in] input pointer to input tensor data + * @param[in] kernel pointer to kernel tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. + * + */ +void shl_i805_dwconv2d_3x3_opt_u8(uint8_t *input, uint8_t *kernel, int32_t *bias, uint8_t *output, + int32_t input_zero_point, int32_t kernel_zero_point, + int32_t output_zero_point, int32_t dst_mult, int32_t dst_shift); + +/** + * @brief u8 asym quant fullyconnected optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] weight_data pointer to weight tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] in_nodes input nodes (weight cols) + * @param[in] out_nodes output nodes (weight rows) + * @param[in] input_zero_point input zero_point + * @param[in] weight_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] output_mult multiplier for s1 * s2 / s3 + * @param[in] output_shift output shift for s1 * s2 / s3. 
shift_right + * @return none. + * + */ +void shl_i805_fullyconnected_opt_u8(uint8_t *input_data, uint8_t *weight_data, int32_t *bias_data, + uint8_t *output_data, int32_t in_nodes, int32_t out_nodes, + int32_t input_zero_point, int32_t weight_zero_point, + int32_t output_zero_point, int32_t output_mult, + int32_t output_shift); + +/** + * @brief u8 asym quant generic maxpool optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] output_h output height + * @param[in] output_w output width + * @return none. + * bufferA size: 2*input_ch*kernel_h*kernel_w + */ +void shl_i805_maxpool2d_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, + int32_t kernel_w, int32_t pad_h, int32_t pad_w, int32_t stride_h, + int32_t stride_w, int32_t output_h, int32_t output_w); + +/** + * @brief u8 asym quant relu optimized function + * @param[in,out] data pointer to input/output tensor data, compute inplace + * @param[in] size input tensor size, tensor length + * @param[in] input_zeropoint input zero_point + * @param[in] out_multiplier multiplier for scale_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none.
+ * can be fused with conv/fc + */ +void shl_i805_relu_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant relu6 optimized function + * @param[in,out] data pointer to input/output tensor data, compute inplace + * @param[in] size input tensor size, tensor length + * @param[in] input_zeropoint input zero_point + * @param[in] out_multiplier multiplier for scale_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. + * can be fused with conv/fc + */ +void shl_i805_relu6_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant clip optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size input tensor size, tensor length + * @param[in] clip_min clip min value(quant) + * @param[in] clip_max clip max value(quant) + * @param[in] input_zeropoint input zero_point + * @param[in] output_zeropoint output zero_point + * @param[in] out_multiplier multiplier for scale_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. + * can be fused with conv/fc + */ +void shl_i805_clip_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, int32_t clip_min, + int32_t clip_max, int32_t input_zeropoint, int32_t output_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant element add optimized function + * @param[in] input_0 pointer to input_0 tensor data + * @param[in] input_1 pointer to input_1 tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] size input tensor size, tensor length, element size + * @param[in] input_0_zeroponit input_0 zero_point.
Range: -255 to 0 + * @param[in] input_0_mult multiplier for scale_input_0 + * @param[in] input_0_shift input_0 shift + * @param[in] input_1_zeropoint input_1 zero_point. Range: -255 to 0 + * @param[in] input_1_mult multiplier for scale_input_1 + * @param[in] input_1_shift input_1 shift + * @param[in] output_zeropoint output zero_point + * @param[in] output_mult multiplier for scale_output + * @param[in] output_shift output shift + * @return none. + * + */ +void shl_i805_elementwise_add_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, + int32_t size, int32_t input_0_zeroponit, int32_t input_0_mult, + int32_t input_0_shift, int32_t input_1_zeropoint, + int32_t input_1_mult, int32_t input_1_shift, + int32_t output_zeropoint, int32_t output_mult, + int32_t output_shift); + +/** + * @brief u8 asym quant element mul optimized function + * @param[in] input_0 pointer to input_0 tensor data + * @param[in] input_1 pointer to input_1 tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] size input tensor size, tensor length, element size + * @param[in] input_0_zeroponit input_0 zero_point + * @param[in] input_1_zeropoint input_1 zero_point + * @param[in] output_zeropoint output zero_point + * @param[in] output_mult multiplier for s1 * s2 / s3 + * @param[in] output_shift output shift for s1 * s2 / s3 + * @return none. + * + */ +void shl_i805_elementwise_mul_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, + int32_t size, int32_t input_0_zeroponit, + int32_t input_1_zeropoint, int32_t output_zeropoint, + int32_t output_mult, int32_t output_shift); + +/** + * @brief u8 asym quant softmax optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size tensor size + * @param[in] out_mult multiplier + * @param[in] out_shift output shift + * @return none.
+ * + */ +void shl_i805_softmax_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant reshape optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size tensor size + * @return none. + * + */ +void shl_i805_reshape_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size); + +/** + * @brief u8 asym quant vec and matrix mul optimized function + * @param[in] lhs pointer to input tensor data + * @param[in] rhs pointer to weight tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] dst pointer to output tensor data + * @param[in] rhs_col input nodes (weight cols) + * @param[in] rhs_row output nodes (weight rows) + * @param[in] lhs_zero_point input zero_point + * @param[in] rhs_zero_point weight zero_point + * @param[in] dst_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3 + * @return none. + * + */ +void shl_i805_vec_mat_mult_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, + int32_t rhs_col, int32_t rhs_row, int32_t lhs_zero_point, + int32_t rhs_zero_point, int32_t dst_zero_point, int32_t dst_mult, + int32_t dst_shift); + +/** + * @brief u8 asym quant matrix mul(A * B_trans) optimized function + * @param[in] lhs pointer to input tensor data + * @param[in] rhs pointer to weight tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] dst pointer to output tensor data + * @param[in] lhs_row input row / m + * @param[in] lhs_col input col / k + * @param[in] rhs_row weight row / n + * @param[in] lhs_zero_point input zero_point + * @param[in] rhs_zero_point weight zero_point + * @param[in] dst_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3 + * @return none. 
+ * + */ +void shl_i805_mat_mult_nt_t_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, + int32_t lhs_row, int32_t lhs_col, int32_t rhs_row, + int32_t lhs_zero_point, int32_t rhs_zero_point, + int32_t dst_zero_point, int32_t dst_mult, int32_t dst_shift); + +/** + * @brief u8 asym quant generic convolution optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in,out] bufferA pointer to buffer for input/im2col data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * bufferA size: 2*input_ch*kernel_h*kernel_w + */ +void shl_i805_conv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, uint8_t *bufferA, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, + int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, + int32_t out_h, int32_t out_w, int32_t out_c, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, int32_t out_mult, + int32_t out_shift); + +/** + * @brief u8 asym quant 1x1 kernel_size convolution (pointwise convolution) optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] input_hxw input height mul width + * @param[in] input_ch input channel + * @param[in] output_ch output_channel + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * + */ +void shl_i805_pwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, int32_t input_hxw, int32_t input_ch, + int32_t output_ch, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant depthwise convolution optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in,out] bufferA pointer to buffer for input/im2col data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * bufferA size: 4*input_ch*kernel_h*kernel_w + */ +void shl_i805_dwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, uint8_t *bufferA, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, + int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, + int32_t out_h, int32_t out_w, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant depthwise convolution 3x3 kernel_size and 1 stride optimized function + * @param[in] input pointer to input tensor data + * @param[in] kernel pointer to kernel tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. + * + */ +void shl_i805_dwconv2d_3x3_opt_u8(uint8_t *input, uint8_t *kernel, int32_t *bias, uint8_t *output, + int32_t input_zero_point, int32_t kernel_zero_point, + int32_t output_zero_point, int32_t dst_mult, int32_t dst_shift); + +/** + * @brief u8 asym quant fullyconnected optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] weight_data pointer to weight tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] in_nodes input nodes (weight cols) + * @param[in] out_nodes output nodes (weight rows) + * @param[in] input_zero_point input zero_point + * @param[in] weight_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] output_mult multiplier for s1 * s2 / s3 + * @param[in] output_shift output shift for s1 * s2 / s3. 
shift_right + * @return none. + * + */ +void shl_i805_fullyconnected_opt_u8(uint8_t *input_data, uint8_t *weight_data, int32_t *bias_data, + uint8_t *output_data, int32_t in_nodes, int32_t out_nodes, + int32_t input_zero_point, int32_t weight_zero_point, + int32_t output_zero_point, int32_t output_mult, + int32_t output_shift); + +/** + * @brief u8 asym quant generic maxpool optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] output_h output height + * @param[in] output_w output width + * @return none. + * bufferA size: 2*input_ch*kernel_h*kernel_w + */ +void shl_i805_maxpool2d_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, + int32_t kernel_w, int32_t pad_h, int32_t pad_w, int32_t stride_h, + int32_t stride_w, int32_t output_h, int32_t output_w); + +/** + * @brief u8 asym quant relu optimized function + * @param[in,out] data pointer to input/output tensor data, compute inplace + * @param[in] size input tensor size, tensor length + * @param[in] input_zeropoint input zero_point + * @param[in] out_multiplier multiplier for scale_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none.
+ * can be fused with conv/fc + */ +void shl_i805_relu_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant relu6 optimized function + * @param[in,out] data pointer to input/output tensor data, compute inplace + * @param[in] size input tensor size, tensor length + * @param[in] input_zeropoint input zero_point + * @param[in] out_multiplier multiplier for scale_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. + * can be fused with conv/fc + */ +void shl_i805_relu6_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant clip optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size input tensor size, tensor length + * @param[in] clip_min clip min value(quant) + * @param[in] clip_max clip max value(quant) + * @param[in] input_zeropoint input zero_point + * @param[in] output_zeropoint output zero_point + * @param[in] out_multiplier multiplier for scale_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. + * can be fused with conv/fc + */ +void shl_i805_clip_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, int32_t clip_min, + int32_t clip_max, int32_t input_zeropoint, int32_t output_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant element add optimized function + * @param[in] input_0 pointer to input_0 tensor data + * @param[in] input_1 pointer to input_1 tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] size input tensor size, tensor length, element size + * @param[in] input_0_zeroponit input_0 zero_point.
Range: -255 to 0 + * @param[in] input_0_mult multiplier for scale_input_0 + * @param[in] input_0_shift input_0 shift + * @param[in] input_1_zeropoint input_1 zero_point. Range: -255 to 0 + * @param[in] input_1_mult multiplier for scale_input_1 + * @param[in] input_1_shift input_1 shift + * @param[in] output_zeropoint output zero_point + * @param[in] output_mult multiplier for scale_output + * @param[in] output_shift output shift + * @return none. + * + */ +void shl_i805_elementwise_add_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, + int32_t size, int32_t input_0_zeroponit, int32_t input_0_mult, + int32_t input_0_shift, int32_t input_1_zeropoint, + int32_t input_1_mult, int32_t input_1_shift, + int32_t output_zeropoint, int32_t output_mult, + int32_t output_shift); + +/** + * @brief u8 asym quant element mul optimized function + * @param[in] input_0 pointer to input_0 tensor data + * @param[in] input_1 pointer to input_1 tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] size input tensor size, tensor length, element size + * @param[in] input_0_zeroponit input_0 zero_point + * @param[in] input_1_zeropoint input_1 zero_point + * @param[in] output_zeropoint output zero_point + * @param[in] output_mult multiplier for s1 * s2 / s3 + * @param[in] output_shift output shift for s1 * s2 / s3 + * @return none. + * + */ +void shl_i805_elementwise_mul_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, + int32_t size, int32_t input_0_zeroponit, + int32_t input_1_zeropoint, int32_t output_zeropoint, + int32_t output_mult, int32_t output_shift); + +/** + * @brief u8 asym quant softmax optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size tensor size + * @param[in] out_mult multiplier + * @param[in] out_shift output shift + * @return none.
+ * + */ +void shl_i805_softmax_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant reshape optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size tensor size + * @return none. + * + */ +void shl_i805_reshape_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size); + +/** + * @brief u8 asym quant vec and matrix mul optimized function + * @param[in] lhs pointer to input tensor data + * @param[in] rhs pointer to weight tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] dst pointer to output tensor data + * @param[in] rhs_col input nodes (weight cols) + * @param[in] rhs_row output nodes (weight rows) + * @param[in] lhs_zero_point input zero_point + * @param[in] rhs_zero_point weight zero_point + * @param[in] dst_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3 + * @return none. + * + */ +void shl_i805_vec_mat_mult_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, + int32_t rhs_col, int32_t rhs_row, int32_t lhs_zero_point, + int32_t rhs_zero_point, int32_t dst_zero_point, int32_t dst_mult, + int32_t dst_shift); + +/** + * @brief u8 asym quant matrix mul(A * B_trans) optimized function + * @param[in] lhs pointer to input tensor data + * @param[in] rhs pointer to weight tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] dst pointer to output tensor data + * @param[in] lhs_row input row / m + * @param[in] lhs_col input col / k + * @param[in] rhs_row weight row / n + * @param[in] lhs_zero_point input zero_point + * @param[in] rhs_zero_point weight zero_point + * @param[in] dst_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3 + * @return none. 
+ * + */ +void shl_i805_mat_mult_nt_t_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, + int32_t lhs_row, int32_t lhs_col, int32_t rhs_row, + int32_t lhs_zero_point, int32_t rhs_zero_point, + int32_t dst_zero_point, int32_t dst_mult, int32_t dst_shift); + +/** + * @brief Enum for specifying activation function types + * + */ +typedef enum { + CSKY_SIGMOID = 0, /**< Sigmoid activation function */ + CSKY_TANH = 1, /**< Tanh activation function */ +} csky_vdsp2_nn_activation_type; + +/** + * @brief Basic Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none.
+ * + */ + +void csky_vdsp2_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none.
+ * + */ + +void csky_vdsp2_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q15_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Fast Q7 convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @return none.
+ * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + +void csky_vdsp2_convolve_HWC_q7_fast_nonsquare( + const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, + const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, + const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t *bufferA); + +/** + * @brief Fast Q7 version of 1x1 convolution (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @return none. + * + * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1 + * and dim_kernel_y=1). It can be used for + * second half of MobileNets after depthwise separable convolution.
+ * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ +void csky_vdsp2_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, const uint16_t ch_im_in, + const q7_t *wt, const uint16_t ch_im_out, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t *bufferA); + +/** + * @brief Q7 version of convolution for RGB image + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none. + * + * This kernel is written exclusively for convolution with ch_im_in + * equals 3. This applies on the first layer of CNNs which has input + * image with RGB format.
+ */ + +void csky_vdsp2_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Q7 depthwise separable convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none.
+ * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + +void csky_vdsp2_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Q7 depthwise separable convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding sizes x + * @param[in] padding_y padding sizes y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @return none.
+ * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ +void csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare( + const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, + const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, + const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t *bufferA); + +/** + * @brief Q7 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none. + */ + +void csky_vdsp2_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q7_t *pOut); + +/** + * @brief Q15 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none.
+ * + */ + +void csky_vdsp2_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t *bias, q15_t *pOut); + +/** + * @brief Mixed Q15-Q7 fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none. + * + */ + +void csky_vdsp2_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, + const uint16_t dim_vec, const uint16_t num_of_rows, + const uint16_t bias_shift, const uint16_t out_shift, + const q7_t *bias, q15_t *pOut); + +/** + * @brief Q7 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + +void csky_vdsp2_relu_q7(q7_t *data, uint16_t size); + +/** + * @brief Q15 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + +void csky_vdsp2_relu_q15(q15_t *data, uint16_t size); + +/** + * @brief Q7 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. 
+ */ + +void csky_vdsp2_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, + csky_vdsp2_nn_activation_type type); + +/** + * @brief Q15 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + */ + +void csky_vdsp2_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, + csky_vdsp2_nn_activation_type type); + +/** + * @brief Q7 max pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + */ + +void csky_vdsp2_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, + q7_t *Im_out); + +/** + * @brief Q7 average pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none.
+ * + */ + +void csky_vdsp2_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, + q7_t *Im_out); + +void csky_vdsp2_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension + const uint16_t dim_im_in_y, // input image dimension + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension + const uint16_t dim_im_out_y, // output image dimension + q7_t *bufferA, // a buffer for local storage + q7_t *Im_out, // output feature + const uint16_t out_lshift); // output left shift (scaling) + +/** + * @brief Q7 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * @return none. + * + */ + +void csky_vdsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); + +/** + * @brief Q15 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * @return none. + * + */ + +void csky_vdsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); + +#ifdef __cplusplus +} +#endif + +#endif // SOURCE_I805_OPT_I805_FUNCTION_H_ diff --git a/source/i805_opt/maxpool.c b/source/i805_opt/maxpool.c index 43b6eb76..1802a3ed 100644 --- a/source/i805_opt/maxpool.c +++ b/source/i805_opt/maxpool.c @@ -16,39 +16,38 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -static int csi_i805_maxpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_i805_maxpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - q7_t *input_data = (q7_t *)input->data; + q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] q7_t buffer_tmp[out_hw * out_hw * in_c]; // buffer_size = out_h * out_w * channel csky_vdsp2_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, - params->stride_height, out_hw, buffer_tmp, output_data); + params->stride_height, out_hw, buffer_tmp, output_data); return CSINN_TRUE; } -int csi_i805_maxpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -61,26 +60,26 @@ int csi_i805_maxpool2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("maxpool q7 is not optimized to achieve under this condition on i805, call reference 
func replaced.\n"); - params->base.bc = csi_ref_maxpool2d_quant; + shl_debug_warning( + "maxpool q7 is not optimized to achieve under this condition on i805, call reference " + "func replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; } else { - params->base.bc = csi_i805_maxpool2d_q7; + cb->exec = shl_i805_maxpool2d_q7; } return CSINN_TRUE; } - -int csi_i805_maxpool2d_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_maxpool2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - uint8_t *input_data = (uint8_t *)input->data; + uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_h = input->dim[1]; - uint16_t in_w = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_h = input->dim[1]; + uint16_t in_w = input->dim[2]; + uint16_t in_c = input->dim[3]; uint16_t out_h = output->dim[1]; uint16_t out_w = output->dim[2]; @@ -92,8 +91,8 @@ int csi_i805_maxpool2d_u8(struct csi_tensor *input, int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - csi_i805_maxpool2d_opt_u8(input_data, output_data, in_h, in_w, in_c, ker_h, ker_w, - pad_h, pad_w, stride_h, stride_w, out_h, out_w); + shl_i805_maxpool2d_opt_u8(input_data, output_data, in_h, in_w, in_c, ker_h, ker_w, pad_h, pad_w, + stride_h, stride_w, out_h, out_w); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/i805_opt/mul.c b/source/i805_opt/mul.c index eaf1004a..74ddfd00 100644 --- a/source/i805_opt/mul.c +++ b/source/i805_opt/mul.c @@ -16,35 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_mul_init_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_i805_mul_init_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { // compute out multiplier and shift for scale_in/scale_out float real_scale = input0->qinfo->scale * input1->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); - params->base.bc = csi_i805_mul_u8; + shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_mul_u8; return CSINN_TRUE; } -int csi_i805_mul_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_i805_mul_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = (uint8_t *)input0->data; uint8_t *input1_data = (uint8_t *)input1->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t size = csi_tensor_size(input0); + int32_t size = csinn_tensor_size(input0); - csi_i805_elementwise_mul_opt_u8(input0_data, input1_data, output_data, size, -input0->qinfo->zero_point, -input1->qinfo->zero_point, - output->qinfo->zero_point, output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_elementwise_mul_opt_u8(input0_data, input1_data, output_data, size, + -input0->qinfo->zero_point, -input1->qinfo->zero_point, + output->qinfo->zero_point, output->qinfo->multiplier, + -output->qinfo->shift); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/i805_opt/nn-support/csi_xt800v_nntables.c b/source/i805_opt/nn-support/csi_xt800v_nntables.c 
deleted file mode 100644 index 1563f833..00000000 --- a/source/i805_opt/nn-support/csi_xt800v_nntables.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csky_vdsp2_nntables.c - * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ -#include "csi_instance.h" - -/** - * @brief tables for various activation functions - * - * This file include the declaration of common tables. 
- * Most of them are used for activation functions - * - * Assumption: - * Unified table: input is 3.x format, i.e, range of [-8, 8) - * sigmoid(8) = 0.9996646498695336 - * tanh(8) = 0.9999997749296758 - * The accuracy here should be good enough - * - * 2-stage HL table: - * - * The entire input range is divided into two parts: - * - * Low range table: 0x000x xxxx or 0x111x xxxx - * table entry will be the binary number excluding the first - * two digits, i.e., 0x0x xxxx or 0x1x xxxx - * - * - * - * High range table 0x0010 0000 -- 0x0111 1111 - * 0x1000 0000 -- 0x1101 1111 - * - * For positive numbers, table entry will be - * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 - * i.e., 0x0000 0000 - 0x0101 11111 - * - * same thing for the negative numbers, table entry will be - * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 - * i.e., 0x0110 0000 - 0x1011 1111 - */ - -const q7_t sigmoidTable_q7[256] = { - 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, - 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, - 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, - 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, - 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, - 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, - 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, - 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, - 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
0x01, 0x01, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, - 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, - 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, - 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, - 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, - 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, - 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, -}; - -const q15_t sigmoidTable_q15[256] = { - 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, - 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb, - 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, - 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 
0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, - 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, - 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, - 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, - 0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00, -}; - -const q15_t sigmoidLTable_q15[128] = { - 0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9, - 0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc, 0x4cd3, 0x4dc8, 0x4ebb, - 0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f, - 0x56ef, 0x57cd, 0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9, - 0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216, 0x62cc, - 0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c, - 0x68a6, 0x693d, 0x69d2, 0x6a63, 0x6af1, 0x6b7c, 0x6c05, 0x6c8a, - 0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051, - 0x0f42, 0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273, - 0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d, 0x162e, 0x16c3, - 0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2, - 0x1c81, 0x1d34, 0x1dea, 0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5, - 0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833, - 0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64, - 0x3053, 0x3145, 0x3238, 0x332d, 0x3424, 0x351b, 0x3615, 0x370f, - 0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00, -}; - -const q15_t sigmoidHTable_q15[192] = { - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 
0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, -}; - -const q7_t tanhTable_q7[256] = { - 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, - 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, - 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, - 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, - 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, - 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 
- 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, - 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, - 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, - 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, - 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, - 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, - 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, -}; - -const q15_t tanhTable_q15[256] = { - 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, - 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6, - 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, - 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 
0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, - 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, - 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, - 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, - 0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 0xf803, -}; - -const q15_t tanhLTable_q15[128] = { - 0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90, - 0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf, 0x3151, 0x34ae, 0x37f6, - 0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd, - 0x514d, 0x53a3, 0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4, - 0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37, 0x6b6e, - 0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e, - 0x73dc, 0x7490, 0x753a, 0x75da, 0x7672, 0x7701, 0x7788, 0x7807, - 0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b, - 0x849b, 0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710, - 0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26, 0x8ac6, 0x8b70, - 0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254, - 0x936b, 0x9492, 0x95c9, 
0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0, - 0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d, - 0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0, - 0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221, 0xd5a8, 0xd941, 0xdcec, - 0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00, -}; - -const q15_t tanhHTable_q15[192] = { - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, -}; diff --git a/source/i805_opt/nn-support/shl_xt800v_nntables.c 
b/source/i805_opt/nn-support/shl_xt800v_nntables.c new file mode 100644 index 00000000..1e21ec94 --- /dev/null +++ b/source/i805_opt/nn-support/shl_xt800v_nntables.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: csky_vdsp2_nntables.c + * Description: Look-up tables for the sigmoid and tanh activation functions + * + * -------------------------------------------------------------------- */ + +#include <stdint.h> +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. + */ +typedef int16_t q15_t; + +/** + * @brief tables for various activation functions + * + * This file includes the definitions of common tables.
+ * Most of them are used for activation functions + * + * Assumption: + * Unified table: input is 3.x format, i.e, range of [-8, 8) + * sigmoid(8) = 0.9996646498695336 + * tanh(8) = 0.9999997749296758 + * The accuracy here should be good enough + * + * 2-stage HL table: + * + * The entire input range is divided into two parts: + * + * Low range table: 0x000x xxxx or 0x111x xxxx + * table entry will be the binary number excluding the first + * two digits, i.e., 0x0x xxxx or 0x1x xxxx + * + * + * + * High range table 0x0010 0000 -- 0x0111 1111 + * 0x1000 0000 -- 0x1101 1111 + * + * For positive numbers, table entry will be + * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 + * i.e., 0x0000 0000 - 0x0101 1111 + * + * same thing for the negative numbers, table entry will be + * 0x1000 0000 -- 0x1101 1111 minus 0x0010 0000 + * i.e., 0x0110 0000 - 0x1011 1111 + */ + +const q7_t sigmoidTable_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, + 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, + 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, + 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, + 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, + 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + +const q15_t sigmoidTable_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, 0x4fad, 0x518a, 0x5360, 0x552c, + 0x56ef, 0x58a8, 0x5a57, 0x5bfb, 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, + 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, 0x70be, 0x7190, 0x7258, 0x7316, + 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, + 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, + 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, + 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, + 0x7f55, 0x7f5f, 0x7f69, 0x7f72, 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, + 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, + 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, + 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, 0x000b, 0x000c, 0x000c, 0x000d, + 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, 0x0038, 0x003b, + 0x003f, 0x0043, 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, + 
0x0119, 0x012b, 0x013e, 0x0152, 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, + 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, 0x03c1, 0x03fd, 0x043c, 0x0480, + 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, + 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, 0x0f42, 0x101e, 0x1105, 0x11f7, + 0x12f3, 0x13fb, 0x150f, 0x162e, 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, + 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, 0x3053, 0x3238, 0x3424, 0x3615, + 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + +const q7_t tanhTable_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, + 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, + 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, 0x82, 0x82, 
0x82, 0x82, 0x83, 0x83, 0x84, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, + 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + +const q15_t tanhTable_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, 0x3b27, 0x4142, 0x46fd, 0x4c56, + 0x514d, 0x55e2, 0x5a1a, 0x5df6, 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, + 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, 0x7b65, 0x7bee, 0x7c66, 0x7cd1, + 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, + 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, + 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, + 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, + 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, + 0x8005, 0x8006, 0x8006, 0x8007, 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, + 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, 0x803c, 0x8044, 0x804d, 0x8057, + 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, 0x80d0, 
0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, + 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, 0x849b, 0x8535, 0x85e2, 0x86a5, + 0x8781, 0x8878, 0x898e, 0x8ac6, 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, + 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, 0xc4d9, 0xcb52, 0xd221, 0xd941, + 0xe0a7, 0xe847, 0xf015, 0xf803, +}; diff --git a/source/i805_opt/pooling/csi_i805_maxpool_8.S b/source/i805_opt/pooling/shl_i805_maxpool_8.S similarity index 94% rename from source/i805_opt/pooling/csi_i805_maxpool_8.S rename to source/i805_opt/pooling/shl_i805_maxpool_8.S index 5b2a6eb0..e8e3f59f 100644 --- a/source/i805_opt/pooling/csi_i805_maxpool_8.S +++ b/source/i805_opt/pooling/shl_i805_maxpool_8.S @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_maxpool2d_8.S + * @file shl_i805_maxpool2d_8.S * @brief uint8 maxpool function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_maxpool2d_opt_u8(uint8_t *input_data, + void shl_i805_maxpool2d_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t input_h, int32_t input_w, @@ -67,14 +67,14 @@ */ - .file "csi_i805_maxpool2d_8.S" - .section .text.csi_i805_maxpool2d_opt_u8,"ax",@progbits + .file "shl_i805_maxpool2d_8.S" + .section .text.shl_i805_maxpool2d_opt_u8,"ax",@progbits .align 2 - .global csi_i805_maxpool2d_opt_u8 - .type csi_i805_maxpool2d_opt_u8, @function + .global shl_i805_maxpool2d_opt_u8 + .type shl_i805_maxpool2d_opt_u8, @function -csi_i805_maxpool2d_opt_u8: +shl_i805_maxpool2d_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7, l8 ld.w l0, (sp, 0x24) // input_ch @@ -222,4 +222,4 @@ csi_i805_maxpool2d_opt_u8: .END: pop l0, l1, l2, l3, l4, l5, l6, l7, l8 rts - .size csi_i805_maxpool2d_opt_u8, .-csi_i805_maxpool2d_opt_u8 + .size shl_i805_maxpool2d_opt_u8, .-shl_i805_maxpool2d_opt_u8 diff --git a/source/i805_opt/pooling/csi_xt800v_avepool_q7_HWC_nonsquare.S b/source/i805_opt/pooling/shl_xt800v_avepool_q7_HWC_nonsquare.S similarity index 95% rename from source/i805_opt/pooling/csi_xt800v_avepool_q7_HWC_nonsquare.S rename to source/i805_opt/pooling/shl_xt800v_avepool_q7_HWC_nonsquare.S index ebf3f516..495bd657 100644 --- a/source/i805_opt/pooling/csi_xt800v_avepool_q7_HWC_nonsquare.S +++ b/source/i805_opt/pooling/shl_xt800v_avepool_q7_HWC_nonsquare.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800v_avepool_q7_HWC_nonsquare.S + * @file shl_xt800v_avepool_q7_HWC_nonsquare.S * @brief Pooling functions implementations. * @version V1.0 * @date 31. 
May 2018 ******************************************************************************/ /* - * void csi_xt800v_avepool_q7_HWC_nonsquare( + * void shl_xt800v_avepool_q7_HWC_nonsquare( * const q7_t *Im_in, // input image * const uint16_t dim_im_in_x, // input image dimension * const uint16_t dim_im_in_y, // input image dimension @@ -42,12 +42,12 @@ * const uint16_t out_lshift) // output left shift (scaling) */ - .section .text.csi_xt800v_avepool_q7_HWC_nonsquare,"ax",@progbits + .section .text.shl_xt800v_avepool_q7_HWC_nonsquare,"ax",@progbits .align 2 - .global csi_xt800v_avepool_q7_HWC_nonsquare - .type csi_xt800v_avepool_q7_HWC_nonsquare, @function + .global shl_xt800v_avepool_q7_HWC_nonsquare + .type shl_xt800v_avepool_q7_HWC_nonsquare, @function -csi_xt800v_avepool_q7_HWC_nonsquare: +shl_xt800v_avepool_q7_HWC_nonsquare: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.hs l8, (sp, 0X2C) // dim_kernel_x ld.hs l3, (sp, 0x34) // padding_x @@ -384,8 +384,7 @@ csi_xt800v_avepool_q7_HWC_nonsquare: .L67: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_avepool_q7_HWC_nonsquare, .-csi_xt800v_avepool_q7_HWC_nonsquare -.weak csi_avepool_q7_HWC_nonsquare -.set csi_avepool_q7_HWC_nonsquare, csi_xt800v_avepool_q7_HWC_nonsquare + .size shl_xt800v_avepool_q7_HWC_nonsquare, .-shl_xt800v_avepool_q7_HWC_nonsquare + .weak csky_vdsp2_avepool_q7_HWC_nonsquare -.set csky_vdsp2_avepool_q7_HWC_nonsquare, csi_xt800v_avepool_q7_HWC_nonsquare +.set csky_vdsp2_avepool_q7_HWC_nonsquare, shl_xt800v_avepool_q7_HWC_nonsquare diff --git a/source/i805_opt/pooling/csi_xt800v_pool_q7_HWC.S b/source/i805_opt/pooling/shl_xt800v_pool_q7_HWC.S similarity index 93% rename from source/i805_opt/pooling/csi_xt800v_pool_q7_HWC.S rename to source/i805_opt/pooling/shl_xt800v_pool_q7_HWC.S index 3945e91a..b82c6095 100644 --- a/source/i805_opt/pooling/csi_xt800v_pool_q7_HWC.S +++ b/source/i805_opt/pooling/shl_xt800v_pool_q7_HWC.S @@ -17,7 +17,7 @@ */ 
/****************************************************************************** - * @file csi_xt800v_pool_q7_HWC.S + * @file shl_xt800v_pool_q7_HWC.S * @brief Pooling functions implementations. * @version V1.0 * @date 31. May 2018 @@ -25,7 +25,7 @@ /* * void - * csi_xt800v_maxpool2d_q7_HWC(q7_t * Im_in, + * shl_xt800v_maxpool2d_q7_HWC(q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const uint16_t dim_kernel, @@ -36,13 +36,13 @@ * q7_t * Im_out) */ - .file "csi_xt800v_pool_HWC_q7.S" - .section .text.csi_xt800v_maxpool2d_q7_HWC,"ax",@progbits + .file "shl_xt800v_pool_HWC_q7.S" + .section .text.shl_xt800v_maxpool2d_q7_HWC,"ax",@progbits .align 2 - .global csi_xt800v_maxpool2d_q7_HWC - .type csi_xt800v_maxpool2d_q7_HWC, @function + .global shl_xt800v_maxpool2d_q7_HWC + .type shl_xt800v_maxpool2d_q7_HWC, @function -csi_xt800v_maxpool2d_q7_HWC: +shl_xt800v_maxpool2d_q7_HWC: push l0, l1, l2, l3, l4, l5, l6, l7 ld.w l0, (sp, 0x30) // im_out ld.hs l1, (sp, 0x28) // dim_im_out @@ -249,16 +249,14 @@ csi_xt800v_maxpool2d_q7_HWC: .L28: pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_xt800v_maxpool2d_q7_HWC, .-csi_xt800v_maxpool2d_q7_HWC + .size shl_xt800v_maxpool2d_q7_HWC, .-shl_xt800v_maxpool2d_q7_HWC -.weak csi_maxpool2d_q7_HWC -.set csi_maxpool2d_q7_HWC, csi_xt800v_maxpool2d_q7_HWC .weak csky_vdsp2_maxpool2d_q7_HWC -.set csky_vdsp2_maxpool2d_q7_HWC, csi_xt800v_maxpool2d_q7_HWC +.set csky_vdsp2_maxpool2d_q7_HWC, shl_xt800v_maxpool2d_q7_HWC /* * void - * csi_xt800v_avepool_q7_HWC(q7_t * Im_in, + * shl_xt800v_avepool_q7_HWC(q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const uint16_t dim_kernel, @@ -269,12 +267,12 @@ csi_xt800v_maxpool2d_q7_HWC: * q7_t * Im_out) */ - .section .text.csi_xt800v_avepool_q7_HWC,"ax",@progbits + .section .text.shl_xt800v_avepool_q7_HWC,"ax",@progbits .align 2 - .global csi_xt800v_avepool_q7_HWC - .type csi_xt800v_avepool_q7_HWC, @function + .global shl_xt800v_avepool_q7_HWC + .type 
shl_xt800v_avepool_q7_HWC, @function -csi_xt800v_avepool_q7_HWC: +shl_xt800v_avepool_q7_HWC: push l0, l1, l2, l3, l4, l5, l6, l7 ld.w l0, (sp, 0x30) // im_out ld.w t5, (sp, 0x2c) // bufferA @@ -599,8 +597,7 @@ csi_xt800v_avepool_q7_HWC: .L67: pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_xt800v_avepool_q7_HWC, .-csi_xt800v_avepool_q7_HWC -.weak csi_avepool_q7_HWC -.set csi_avepool_q7_HWC, csi_xt800v_avepool_q7_HWC + .size shl_xt800v_avepool_q7_HWC, .-shl_xt800v_avepool_q7_HWC + .weak csky_vdsp2_avepool_q7_HWC -.set csky_vdsp2_avepool_q7_HWC, csi_xt800v_avepool_q7_HWC +.set csky_vdsp2_avepool_q7_HWC, shl_xt800v_avepool_q7_HWC diff --git a/source/i805_opt/relu.c b/source/i805_opt/relu.c index 0b7d1342..6a62261d 100644 --- a/source/i805_opt/relu.c +++ b/source/i805_opt/relu.c @@ -16,53 +16,50 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_relu_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); - csky_vdsp2_relu_q7(input_data, size); // FIXME: unified func name - csi_relu_q7? + int size = csinn_tensor_size(input); + csky_vdsp2_relu_q7(input_data, size); // FIXME: unified func name - csinn_relu_q7? 
output->data = input_data; return CSINN_TRUE; } -int csi_i805_relu_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_relu_q15(input_data, size); output->data = input_data; return CSINN_TRUE; } - -int csi_i805_relu_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { // compute out multiplier and shift for scale_in/scale_out float real_multiplier = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_multiplier, &output->qinfo->multiplier, &output->qinfo->shift); - params->base.bc = csi_i805_relu_u8; + shl_quantize_multiplier(real_multiplier, &output->qinfo->multiplier, &output->qinfo->shift); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_relu_u8; return CSINN_TRUE; } -int csi_i805_relu_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { uint8_t *input_data = (uint8_t *)input->data; - int32_t size = csi_tensor_size(input); + int32_t size = csinn_tensor_size(input); - csi_i805_relu_opt_u8(input_data, size, input->qinfo->zero_point, output->qinfo->multiplier, output->qinfo->shift); + shl_i805_relu_opt_u8(input_data, size, input->qinfo->zero_point, output->qinfo->multiplier, + output->qinfo->shift); output->data = input_data; return CSINN_TRUE; } diff --git a/source/i805_opt/relu6.c b/source/i805_opt/relu6.c index d9f215bb..03125010 100644 --- a/source/i805_opt/relu6.c +++ b/source/i805_opt/relu6.c @@ -16,30 +16,30 @@ * limitations 
under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_relu6_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu6_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { // compute out multiplier and shift for scale_in/scale_out float real_scale = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); - params->base.bc = csi_i805_relu6_u8; + shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_relu6_u8; return CSINN_TRUE; } -int csi_i805_relu6_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu6_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { uint8_t *input_data = (uint8_t *)input->data; - int32_t size = csi_tensor_size(input); + int32_t size = csinn_tensor_size(input); - csi_i805_relu6_opt_u8(input_data, size, input->qinfo->zero_point, output->qinfo->multiplier, output->qinfo->shift); + shl_i805_relu6_opt_u8(input_data, size, input->qinfo->zero_point, output->qinfo->multiplier, + output->qinfo->shift); output->data = input_data; return CSINN_TRUE; } \ No newline at end of file diff --git a/source/i805_opt/reshape.c b/source/i805_opt/reshape.c index 1dd23cdd..c1412cbe 100644 --- a/source/i805_opt/reshape.c +++ b/source/i805_opt/reshape.c @@ -16,20 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_reshape_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int shl_i805_reshape_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t size = csi_tensor_size(input); + int32_t size = csinn_tensor_size(input); if (output_data != input_data) { - csi_i805_reshape_opt_u8(input_data, output_data, size); + shl_i805_reshape_opt_u8(input_data, output_data, size); } return CSINN_TRUE; } diff --git a/source/i805_opt/reshape/csi_i805_reshape_8.S b/source/i805_opt/reshape/shl_i805_reshape_8.S similarity index 82% rename from source/i805_opt/reshape/csi_i805_reshape_8.S rename to source/i805_opt/reshape/shl_i805_reshape_8.S index cc8d27e8..9b91d0a4 100644 --- a/source/i805_opt/reshape/csi_i805_reshape_8.S +++ b/source/i805_opt/reshape/shl_i805_reshape_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_reshape_8.S + * @file shl_i805_reshape_8.S * @brief uint8 reshape/memcpy layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_reshape_opt_u8(uint8_t * input_data, + void shl_i805_reshape_opt_u8(uint8_t * input_data, uint8_t * output_data int32_t size) @@ -40,14 +40,14 @@ a2: tensor size */ - .file "csi_i805_reshape_8.S" - .section .text.csi_i805_reshape_opt_u8,"ax",@progbits + .file "shl_i805_reshape_8.S" + .section .text.shl_i805_reshape_opt_u8,"ax",@progbits .align 2 - .global csi_i805_reshape_opt_u8 - .type csi_i805_reshape_opt_u8, @function + .global shl_i805_reshape_opt_u8 + .type shl_i805_reshape_opt_u8, @function -csi_i805_reshape_opt_u8: +shl_i805_reshape_opt_u8: lsri t0, a2, 6 // t0 = size / 64 bez t0, .TAIL_64 @@ -75,4 +75,4 @@ csi_i805_reshape_opt_u8: .END: rts - .size csi_i805_reshape_opt_u8, .-csi_i805_reshape_opt_u8 + .size shl_i805_reshape_opt_u8, .-shl_i805_reshape_opt_u8 diff --git a/source/i805_opt/setup.c b/source/i805_opt/setup.c index 4eea511b..92db3005 100644 --- a/source/i805_opt/setup.c +++ b/source/i805_opt/setup.c @@ -16,111 +16,72 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "shl_i805.h" -static void *setup_init_map() +static void *setup_cb_map() { - static void* init_map[CSINN_OP_AND_UTILS_SIZE][2]; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE][2]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE * 2); + /* q7 dtype */ - // init_map[CSINN_OP_AVGPOOL2D][0] = csi_i805_avgpool2d_init_q7; - init_map[CSINN_OP_ADD][0] = csi_i805_add_init_u8; - init_map[CSINN_OP_CONV2D][0] = csi_i805_conv2d_init_u8; - init_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_i805_depthwise_conv2d_init_u8; - init_map[CSINN_OP_FULLYCONNECTED][0] = csi_i805_fullyconnected_init_u8; - init_map[CSINN_OP_MAXPOOL2D][0] = csi_i805_maxpool2d_init_q7; - init_map[CSINN_OP_MUL][0] = csi_i805_mul_init_u8; - init_map[CSINN_OP_RELU][0] = csi_i805_relu_init_u8; - init_map[CSINN_OP_RELU6][0] = csi_i805_relu6_init_u8; + cb_map[CSINN_OP_ADD][0].init = shl_i805_add_init_u8; + cb_map[CSINN_OP_CONV2D][0].init = shl_i805_conv2d_init_u8; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][0].init = shl_i805_depthwise_conv2d_init_u8; + cb_map[CSINN_OP_FULLYCONNECTED][0].init = shl_i805_fullyconnected_init_u8; + cb_map[CSINN_OP_MAXPOOL2D][0].init = shl_i805_maxpool2d_init_q7; + cb_map[CSINN_OP_MUL][0].init = shl_i805_mul_init_u8; + cb_map[CSINN_OP_RELU][0].init = shl_i805_relu_init_u8; + cb_map[CSINN_OP_RELU6][0].init = shl_i805_relu6_init_u8; + + cb_map[CSINN_OP_ADD][0].exec = shl_i805_add_u8; + cb_map[CSINN_OP_CONV2D][0].exec = shl_i805_conv2d_u8; + + cb_map[CSINN_OP_DEPTHWISE_CONV2D][0].exec = shl_i805_depthwise_conv2d_u8; + cb_map[CSINN_OP_FULLYCONNECTED][0].exec = shl_i805_fullyconnected_u8; + cb_map[CSINN_OP_MUL][0].exec = shl_i805_mul_u8; + cb_map[CSINN_OP_RELU][0].exec = shl_i805_relu_u8; + cb_map[CSINN_OP_RELU6][0].exec = shl_i805_relu6_u8; + cb_map[CSINN_OP_RESHAPE][0].exec = shl_i805_reshape_u8; + cb_map[CSINN_OP_SIGMOID][0].exec = shl_i805_sigmoid_q7; + 
cb_map[CSINN_OP_TANH][0].exec = shl_i805_tanh_q7; /* q15 dtype */ - init_map[CSINN_OP_CONV2D][1] = csi_i805_conv2d_init_q15; + cb_map[CSINN_OP_CONV2D][1].init = shl_i805_conv2d_init_q15; - return init_map; + cb_map[CSINN_OP_FULLYCONNECTED][1].exec = shl_i805_fullyconnected_q15; + cb_map[CSINN_OP_RELU][1].exec = shl_i805_relu_q15; + cb_map[CSINN_OP_SIGMOID][1].exec = shl_i805_sigmoid_q15; + cb_map[CSINN_OP_SOFTMAX][1].exec = shl_i805_softmax_q15; + cb_map[CSINN_OP_TANH][1].exec = shl_i805_tanh_q15; + + return cb_map; } -static int get_init_map_index(int op, int dtype) +static int get_cb_map_index(int op, int dtype) { switch (dtype) { - case CSINN_DTYPE_UINT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; + case CSINN_DTYPE_UINT8: + return op * 2; + break; + case CSINN_DTYPE_INT16: + return op * 2 + 1; + break; + default: + return CSINN_UNSUPPORT_DTYPE; } } -void *csi_init_map_i805(int op, int dtype) +static struct csinn_callback *__cb_map_table_i805; +struct csinn_callback *__attribute__((weak)) shl_cb_map_i805(int op, int dtype) { - void **init_map_table = setup_init_map(); - int idx = get_init_map_index(op, dtype); - if (idx >= 0) { - return init_map_table[idx]; - } else { - return NULL; - } + return &__cb_map_table_i805[get_cb_map_index(op, dtype)]; } - -static void *setup_bc_map() +void shl_target_init_i805() { - static void* bc_map[CSINN_OP_AND_UTILS_SIZE][2]; - - /* q7 dtype */ - bc_map[CSINN_OP_ADD][0] = csi_i805_add_u8; - bc_map[CSINN_OP_AVGPOOL2D][0] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_CONV2D][0] = csi_i805_conv2d_u8; - // bc_map[CSINN_OP_CONV2D][0] = csi_ref_conv2d_quant; - - bc_map[CSINN_OP_CLIP][0] = csi_ref_clip_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_i805_depthwise_conv2d_u8; - bc_map[CSINN_OP_FULLYCONNECTED][0] = csi_i805_fullyconnected_u8; - bc_map[CSINN_OP_MAXPOOL2D][0] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_MUL][0] = csi_i805_mul_u8; - 
bc_map[CSINN_OP_RELU][0] = csi_i805_relu_u8; - bc_map[CSINN_OP_RELU6][0] = csi_i805_relu6_u8; - bc_map[CSINN_OP_RESHAPE][0] = csi_i805_reshape_u8; - bc_map[CSINN_OP_SQUEEZE][0] = csi_ref_squeeze; - bc_map[CSINN_OP_SIGMOID][0] = csi_i805_sigmoid_q7; - bc_map[CSINN_OP_SOFTMAX][0] = csi_ref_softmax_quant; - bc_map[CSINN_OP_TANH][0] = csi_i805_tanh_q7; - - /* q15 dtype */ - bc_map[CSINN_OP_CONV2D][1] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][1] = csi_i805_fullyconnected_q15; - bc_map[CSINN_OP_RELU][1] = csi_i805_relu_q15; - bc_map[CSINN_OP_SIGMOID][1] = csi_i805_sigmoid_q15; - bc_map[CSINN_OP_SOFTMAX][1] = csi_i805_softmax_q15; - bc_map[CSINN_OP_TANH][1] = csi_i805_tanh_q15; - - return bc_map; -} - -static int get_bc_map_index(int op, int dtype) -{ - switch (dtype) { - case CSINN_DTYPE_UINT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; - } -} - -void *__attribute__((weak)) csi_bc_map_i805(int op, int dtype) -{ - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + __cb_map_table_i805 = setup_cb_map(); + shl_register_runtime_callback(CSINN_I805, NULL); + shl_register_op_callback(CSINN_I805, shl_cb_map_i805); } diff --git a/source/i805_opt/sigmoid.c b/source/i805_opt/sigmoid.c index 961b4c98..ace798d7 100644 --- a/source/i805_opt/sigmoid.c +++ b/source/i805_opt/sigmoid.c @@ -16,36 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_sigmoid_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_i805_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_nn_activations_direct_q7(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; } -int csi_i805_sigmoid_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_i805_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_nn_activations_direct_q15(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; diff --git a/source/i805_opt/softmax.c b/source/i805_opt/softmax.c index 37041e95..56e3f270 100644 --- a/source/i805_opt/softmax.c +++ b/source/i805_opt/softmax.c @@ -16,29 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_softmax_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_i805_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_softmax_q7(input_data, size, output_data); return CSINN_TRUE; } -int csi_i805_softmax_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_i805_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *output_data = (q15_t *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_softmax_q15(input_data, size, output_data); return CSINN_TRUE; } diff --git a/source/i805_opt/softmax/csi_xt800v_softmax_q15.S b/source/i805_opt/softmax/shl_xt800v_softmax_q15.S similarity index 93% rename from source/i805_opt/softmax/csi_xt800v_softmax_q15.S rename to source/i805_opt/softmax/shl_xt800v_softmax_q15.S index ac6e5e13..c7c5b3b1 100644 --- a/source/i805_opt/softmax/csi_xt800v_softmax_q15.S +++ b/source/i805_opt/softmax/shl_xt800v_softmax_q15.S @@ -17,25 +17,25 @@ */ /****************************************************************************** - * @file csi_xt800v_softmax_q15.S + * @file shl_xt800v_softmax_q15.S * @brief Pooling functions implementations. * @version V1.0 * @date 01. 
June 2018 ******************************************************************************/ /* - * void csi_xt800v_softmax_q15(const q15_t * vec_in, + * void shl_xt800v_softmax_q15(const q15_t * vec_in, * const uint16_t dim_vec, * q15_t * p_out) */ - .file "csi_xt800v_softmax_q15.S" - .section .text.csi_xt800v_softmax_q15,"ax",@progbits + .file "shl_xt800v_softmax_q15.S" + .section .text.shl_xt800v_softmax_q15,"ax",@progbits .align 2 - .global csi_xt800v_softmax_q15 - .type csi_xt800v_softmax_q15, @function + .global shl_xt800v_softmax_q15 + .type shl_xt800v_softmax_q15, @function -csi_xt800v_softmax_q15: +shl_xt800v_softmax_q15: push l0, l1, l2 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -272,8 +272,7 @@ csi_xt800v_softmax_q15: vldmu.8 vr12-vr14, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2 - .size csi_xt800v_softmax_q15, .-csi_xt800v_softmax_q15 -.weak csi_softmax_q15 -.set csi_softmax_q15, csi_xt800v_softmax_q15 + .size shl_xt800v_softmax_q15, .-shl_xt800v_softmax_q15 + .weak csky_vdsp2_softmax_q15 -.set csky_vdsp2_softmax_q15, csi_xt800v_softmax_q15 +.set csky_vdsp2_softmax_q15, shl_xt800v_softmax_q15 diff --git a/source/i805_opt/softmax/csi_xt800v_softmax_q7.S b/source/i805_opt/softmax/shl_xt800v_softmax_q7.S similarity index 92% rename from source/i805_opt/softmax/csi_xt800v_softmax_q7.S rename to source/i805_opt/softmax/shl_xt800v_softmax_q7.S index 6e591d0b..6ad38771 100644 --- a/source/i805_opt/softmax/csi_xt800v_softmax_q7.S +++ b/source/i805_opt/softmax/shl_xt800v_softmax_q7.S @@ -17,25 +17,25 @@ */ /****************************************************************************** - * @file csi_xt800v_softmax_q7.S + * @file shl_xt800v_softmax_q7.S * @brief Pooling functions implementations. * @version V1.0 * @date 04. 
June 2018 ******************************************************************************/ /* - * void csi_xt800v_softmax_q7(const q7_t * vec_in, + * void shl_xt800v_softmax_q7(const q7_t * vec_in, * const uint16_t dim_vec, * q7_t * p_out) */ - .file "csi_xt800v_softmax_q7.S" - .section .text.csi_xt800v_softmax_q7,"ax",@progbits + .file "shl_xt800v_softmax_q7.S" + .section .text.shl_xt800v_softmax_q7,"ax",@progbits .align 2 - .global csi_xt800v_softmax_q7 - .type csi_xt800v_softmax_q7, @function + .global shl_xt800v_softmax_q7 + .type shl_xt800v_softmax_q7, @function -csi_xt800v_softmax_q7: +shl_xt800v_softmax_q7: push l0, l1, l2 subi sp, sp, 32 vstm.8 vr8-vr9, (sp) @@ -225,8 +225,7 @@ csi_xt800v_softmax_q7: .L18: vldmu.8 vr8-vr9, (sp) pop l0, l1, l2 - .size csi_xt800v_softmax_q7, .-csi_xt800v_softmax_q7 -.weak csi_softmax_q7 -.set csi_softmax_q7, csi_xt800v_softmax_q7 + .size shl_xt800v_softmax_q7, .-shl_xt800v_softmax_q7 + .weak csky_vdsp2_softmax_q7 -.set csky_vdsp2_softmax_q7, csi_xt800v_softmax_q7 +.set csky_vdsp2_softmax_q7, shl_xt800v_softmax_q7 diff --git a/source/i805_opt/tanh.c b/source/i805_opt/tanh.c index e550b861..b0c0bbd4 100644 --- a/source/i805_opt/tanh.c +++ b/source/i805_opt/tanh.c @@ -16,36 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_tanh_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_i805_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 
3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_nn_activations_direct_q7(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; } -int csi_i805_tanh_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_i805_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_nn_activations_direct_q15(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; diff --git a/source/i805_ref/activation/csi_nn_activations_q15.c b/source/i805_ref/activation/csi_nn_activations_q15.c deleted file mode 100644 index f7da936f..00000000 --- a/source/i805_ref/activation/csi_nn_activations_q15.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_nn_activations_q15.c - * Description: Q15 neural network activation function using direct table look-up - * - * -------------------------------------------------------------------- */ - -#include "csi_nn_tables.h" -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Acti - * @{ - */ - - /** - * @brief Q15 neural network activation function using direct table look-up - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 - * @param[in] type type of activation functions - * @return none. - * - * @details - * - * This is the direct table look-up approach. - * - * Assume here the integer part of the fixed-point is <= 3. - * More than 3 just not making much sense, makes no difference with - * saturation followed by any of these activation functions. - */ - -void csi_nn_activations_direct_q15(q15_t * data, uint16_t size, - uint16_t int_width, - csi_nn_activation_type type) -{ - uint16_t i = size; - q15_t *pIn = data; - q15_t *pOut = data; - uint16_t shift_size = 8 + 3 - int_width; - uint32_t bit_mask = 0x7FF >> int_width; - uint32_t full_frac = bit_mask + 1; - const q15_t *lookup_table; - - switch (type) - { - case CSKY_SIGMOID: - lookup_table = sigmoidTable_q15; - break; - case CSKY_TANH: - default: - lookup_table = tanhTable_q15; - break; - } - - while (i) - { - q15_t out; - q15_t in = *pIn++; - q15_t frac = (uint32_t) in & bit_mask; - q15_t value = lookup_table[(uint8_t)__SSAT(in >> shift_size, 8)]; - q15_t value2 = lookup_table[(uint8_t)__SSAT(1 + (in >> shift_size), 8)]; - - /* doing the interpolation here for better accuracy */ - out = ((q31_t)(full_frac - frac) * value + (q31_t) value2 * frac) >> - shift_size; - - *pOut++ = out; - i--; - } - -} - -/** - * @} end of Acti group - */ diff --git 
a/source/i805_ref/activation/csi_nn_activations_q7.c b/source/i805_ref/activation/csi_nn_activations_q7.c deleted file mode 100644 index 919c4ea5..00000000 --- a/source/i805_ref/activation/csi_nn_activations_q7.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nn_activations_q7.c - * Description: Q7 neural network activation function using direct table look-up - * - * -------------------------------------------------------------------- */ - -#include "csi_nn_tables.h" -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Acti - * @{ - */ - - /** - * @brief Q7 neural network activation function using direct table look-up - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 - * @param[in] type type of activation functions - * @return none. - * - * @details - * - * This is the direct table look-up approach. - * - * Assume here the integer part of the fixed-point is <= 3. - * More than 3 just not making much sense, makes no difference with - * saturation followed by any of these activation functions. 
- */ - -void csi_nn_activations_direct_q7(q7_t * data, uint16_t size, - uint16_t int_width, - csi_nn_activation_type type) -{ - uint16_t i = size; - q7_t *pIn = data; - q7_t *pOut = data; - q7_t in; - q7_t out; - uint16_t shift_size = 3 - int_width; - const q7_t *lookup_table; - switch (type) - { - case CSKY_SIGMOID: - lookup_table = sigmoidTable_q7; - break; - case CSKY_TANH: - default: - lookup_table = tanhTable_q7; - break; - } - while (i) - { - in = *pIn++; - out = lookup_table[(uint8_t) in >> shift_size]; - *pOut++ = out; - i--; - } -} - -/** - * @} end of Acti group - */ diff --git a/source/i805_ref/activation/csi_relu_q15.c b/source/i805_ref/activation/csi_relu_q15.c deleted file mode 100644 index bbe8ae45..00000000 --- a/source/i805_ref/activation/csi_relu_q15.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_relu_q15.c - * Description: Q15 version of ReLU - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Acti - * @{ - */ - - /** - * @brief Q15 RELU function - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @return none. 
- * - * @details - * - * Optimized relu with QSUB instructions. - * - */ - -void csi_relu_q15(q15_t * data, uint16_t size) -{ - -#if defined (CSI_MATH_DSP) - - uint16_t i = size >> 1; - q15_t *pIn = data; - q15_t *pOut = data; - q31_t in; - q31_t buf; - q31_t mask; - - while (i) - { - in = *__SIMD32(pIn)++; - - /* extract the first bit */ - buf = __ROR(in & 0x80008000, 15); - - /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ - mask = __QSUB16(0x00000000, buf); - - *__SIMD32(pOut)++ = in & (~mask); - i--; - } - - if (size & 0x1) - { - if (*pIn < 0) - { - *pIn = 0; - } - pIn++; - } -#else - uint16_t i; - - for (i = 0; i < size; i++) - { - if (data[i] < 0) - data[i] = 0; - } - -#endif /* CSI_MATH_DSP */ - -} - -/** - * @} end of Acti group - */ diff --git a/source/i805_ref/activation/csi_relu_q7.c b/source/i805_ref/activation/csi_relu_q7.c deleted file mode 100644 index 8e6f2a9c..00000000 --- a/source/i805_ref/activation/csi_relu_q7.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_relu_q7.c - * Description: Q7 version of ReLU - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Acti - * @{ - */ - - /** - * @brief Q7 RELU function - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @return none. - * - * @details - * - * Optimized relu with QSUB instructions. - * - */ - -void csi_relu_q7(q7_t * data, uint16_t size) -{ - -#if defined (CSI_MATH_DSP) - - uint16_t i = size >> 2; - q7_t *pIn = data; - q7_t *pOut = data; - q31_t in; - q31_t buf; - q31_t mask; - - while (i) - { - in = *__SIMD32(pIn)++; - - /* extract the first bit */ - buf = __ROR(in & 0x80808080, 7); - - /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ - mask = __QSUB8(0x00000000, buf); - - *__SIMD32(pOut)++ = in & (~mask); - i--; - } - - i = size & 0x3; - while (i) - { - if (*pIn < 0) - { - *pIn = 0; - } - pIn++; - i--; - } - -#else - - uint16_t i; - - for (i = 0; i < size; i++) - { - if (data[i] < 0) - data[i] = 0; - } - -#endif /* CSI_MATH_DSP */ - -} - -/** - * @} end of Acti group - */ diff --git a/source/i805_ref/activation/shl_activations_q15.c b/source/i805_ref/activation/shl_activations_q15.c new file mode 100644 index 00000000..4b6fe5d8 --- /dev/null +++ b/source/i805_ref/activation/shl_activations_q15.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_activations_q15.c + * Description: Q15 neural network activation function using direct table look-up + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/** + * @brief Q15 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + * + * @details + * + * This is the direct table look-up approach. + * + * Assume here the integer part of the fixed-point is <= 3. + * More than 3 just not making much sense, makes no difference with + * saturation followed by any of these activation functions. 
+ */ + +void shl_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, + csi_nn_activation_type type) +{ + uint16_t i = size; + q15_t *pIn = data; + q15_t *pOut = data; + uint16_t shift_size = 8 + 3 - int_width; + uint32_t bit_mask = 0x7FF >> int_width; + uint32_t full_frac = bit_mask + 1; + const q15_t *lookup_table; + + switch (type) { + case CSKY_SIGMOID: + lookup_table = sigmoidTable_q15; + break; + case CSKY_TANH: + default: + lookup_table = tanhTable_q15; + break; + } + + while (i) { + q15_t out; + q15_t in = *pIn++; + q15_t frac = (uint32_t)in & bit_mask; + q15_t value = lookup_table[(uint8_t)__SSAT(in >> shift_size, 8)]; + q15_t value2 = lookup_table[(uint8_t)__SSAT(1 + (in >> shift_size), 8)]; + + /* doing the interpolation here for better accuracy */ + out = ((q31_t)(full_frac - frac) * value + (q31_t)value2 * frac) >> shift_size; + + *pOut++ = out; + i--; + } +} + +/** + * @} end of Acti group + */ diff --git a/source/i805_ref/activation/shl_activations_q7.c b/source/i805_ref/activation/shl_activations_q7.c new file mode 100644 index 00000000..0c4a9e0e --- /dev/null +++ b/source/i805_ref/activation/shl_activations_q7.c @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Title: shl_activations_q7.c + * Description: Q7 neural network activation function using direct table look-up + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/** + * @brief Q7 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + * + * @details + * + * This is the direct table look-up approach. + * + * Assume here the integer part of the fixed-point is <= 3. + * More than 3 just not making much sense, makes no difference with + * saturation followed by any of these activation functions. + */ + +void shl_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, + csi_nn_activation_type type) +{ + uint16_t i = size; + q7_t *pIn = data; + q7_t *pOut = data; + q7_t in; + q7_t out; + uint16_t shift_size = 3 - int_width; + const q7_t *lookup_table; + switch (type) { + case CSKY_SIGMOID: + lookup_table = sigmoidTable_q7; + break; + case CSKY_TANH: + default: + lookup_table = tanhTable_q7; + break; + } + while (i) { + in = *pIn++; + out = lookup_table[(uint8_t)in >> shift_size]; + *pOut++ = out; + i--; + } +} + +/** + * @} end of Acti group + */ diff --git a/source/i805_ref/activation/shl_relu_q15.c b/source/i805_ref/activation/shl_relu_q15.c new file mode 100644 index 00000000..5860426c --- /dev/null +++ b/source/i805_ref/activation/shl_relu_q15.c @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_relu_q15.c + * Description: Q15 version of ReLU + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q15 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + * + * @details + * + * Optimized relu with QSUB instructions. + * + */ + +void shl_relu_q15(q15_t* data, uint16_t size) +{ + uint16_t i; + + for (i = 0; i < size; i++) { + if (data[i] < 0) data[i] = 0; + } +} diff --git a/source/i805_ref/activation/shl_relu_q7.c b/source/i805_ref/activation/shl_relu_q7.c new file mode 100644 index 00000000..89511abb --- /dev/null +++ b/source/i805_ref/activation/shl_relu_q7.c @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_relu_q7.c + * Description: Q7 version of ReLU + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + * + * @details + * + * Optimized relu with QSUB instructions. + * + */ + +void shl_relu_q7(q7_t* data, uint16_t size) +{ + uint16_t i; + + for (i = 0; i < size; i++) { + if (data[i] < 0) data[i] = 0; + } +} diff --git a/source/i805_ref/avgpool.c b/source/i805_ref/avgpool.c index 10b1019d..7c8e2bc7 100644 --- a/source/i805_ref/avgpool.c +++ b/source/i805_ref/avgpool.c @@ -16,19 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_ref_i805.h" +/* CSI-NN2 version 2.0.x */ +#include "i805_ref_function.h" +#include "shl_ref_i805.h" /* constraint: 1.input tensor layout: NHWC 2. pad_left = pad_right; pad_top = pad_down FIXME: count_include_pad */ -static int csi_ref_i805_avgpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_i805_ref_avgpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; @@ -47,31 +46,34 @@ static int csi_ref_i805_avgpool2d_q7(struct csi_tensor *input, uint16_t stride_h = params->stride_height; uint16_t stride_w = params->stride_width; - uint16_t pad_x = params->pad_left; // i.e. pad_x = params->pad_right - uint16_t pad_y = params->pad_top; // i.e. pad_y = params->pad_down + uint16_t pad_x = params->pad_left; // i.e. pad_x = params->pad_right + uint16_t pad_y = params->pad_top; // i.e. 
pad_y = params->pad_down q7_t buffer_tmp[out_h * out_w * in_c]; // buffer_size = out_h * out_w * channel - if ( (in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w) ) { - csi_avepool_q7_HWC(input_data, in_h, in_c, kernel_h, pad_y, stride_h, out_h, - buffer_tmp, output_data); + if ((in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w)) { + shl_avepool_q7_HWC(input_data, in_h, in_c, kernel_h, pad_y, stride_h, out_h, buffer_tmp, + output_data); } else { - csi_avepool_q7_HWC_nonsquare(input_data, in_w, in_h, in_c, kernel_w, kernel_h, - pad_x, pad_y, stride_w, stride_h, out_w, out_h, - buffer_tmp, output_data, output->qinfo->shift); + shl_avepool_q7_HWC_nonsquare(input_data, in_w, in_h, in_c, kernel_w, kernel_h, pad_x, pad_y, + stride_w, stride_h, out_w, out_h, buffer_tmp, output_data, + output->qinfo->shift); } return CSINN_TRUE; } -int csi_ref_i805_avgpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_ref_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { - csi_debug_warning("avgpool q7 unsupport asymmetric padddings on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_avgpool2d_quant; // FIXME: csi_ref_avgpool2d_quant may be not applicable to i805 + struct csinn_callback *cb = params->base.cb; + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { + shl_debug_warning( + "avgpool q7 unsupport asymmetric padddings on ref_i805, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // FIXME: shl_ref_avgpool2d_quant may be not + // applicable to i805 } else { - params->base.bc = csi_ref_i805_avgpool2d_q7; + cb->exec = shl_i805_ref_avgpool2d_q7; } return CSINN_TRUE; -} +} diff --git a/source/i805_ref/convolution.c 
b/source/i805_ref/convolution.c index de0b3ec2..9be3ae83 100644 --- a/source/i805_ref/convolution.c +++ b/source/i805_ref/convolution.c @@ -16,21 +16,19 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -static int csi_ref_i805_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_ref_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -51,80 +49,77 @@ static int csi_ref_i805_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; // e.g. pad_x = params->pad_right uint16_t pad_y = params->pad_top; // e.g. 
pad_y = params->pad_down - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_c % 4 == 0) && (out_c % 2 == 0) ) { - if ( (kernel_h == 1) && (kernel_w == 1) ) { - csi_convolve_1x1_HWC_q7_fast(input_data, in_w, in_h, in_c, kernel_data, out_c, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + if ((in_c % 4 == 0) && (out_c % 2 == 0)) { + if ((kernel_h == 1) && (kernel_w == 1)) { + shl_convolve_1x1_HWC_q7_fast(input_data, in_w, in_h, in_c, kernel_data, out_c, + bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } else { - csi_convolve_HWC_q7_fast_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, + shl_convolve_HWC_q7_fast_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, kernel_w, kernel_h, pad_x, pad_y, stride_w, stride_h, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } } else if (in_c == 3) { - csi_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + shl_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, pad_y, stride_h, + bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, + out_h, buffer_tmp); } else { - csi_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + shl_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, pad_y, + stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_h, buffer_tmp); } return CSINN_TRUE; } - 
- -static int csi_ref_i805_conv2d_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_ref_conv2d_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q15_t *input_data = (q15_t *)input->data; - q15_t *kernel_data = (q15_t *)kernel->data; - q15_t *bias_data = (q15_t *)bias->data; - q15_t *output_data = (q15_t *)output->data; + q15_t *input_data = (q15_t *)input->data; + q15_t *kernel_data = (q15_t *)kernel->data; + q15_t *bias_data = (q15_t *)bias->data; + q15_t *output_data = (q15_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] uint16_t out_c = output->dim[3]; - uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; - uint16_t stride = params->stride_height; // e.g. stride = params->stride_width - uint16_t padding = params->pad_top; // e.g. padding = params->down = params->left = params->right - - if ( (in_c % 2 == 0) && (out_c % 2 == 0) ) { - q15_t buffer_tmp[2 * in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csi_convolve_HWC_q15_fast(input_data, in_hw, in_c, kernel_data, out_c, - kernel_size, padding, stride, bias_data, bias->qinfo->shift, - output->qinfo->shift, output_data, out_hw, buffer_tmp); + uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; + uint16_t stride = params->stride_height; // e.g. stride = params->stride_width + uint16_t padding = + params->pad_top; // e.g. 
padding = params->down = params->left = params->right + + if ((in_c % 2 == 0) && (out_c % 2 == 0)) { + q15_t buffer_tmp[2 * in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + shl_convolve_HWC_q15_fast(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, padding, + stride, bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_hw, buffer_tmp); } else { - q15_t buffer_tmp[in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csi_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, - kernel_size, padding, stride, bias_data, bias->qinfo->shift, + q15_t buffer_tmp[in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + shl_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, + padding, stride, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, out_hw, buffer_tmp); } return CSINN_TRUE; } - -static int csi_ref_i805_depthwise_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_ref_depthwise_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -144,61 +139,63 @@ static int csi_ref_i805_depthwise_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; uint16_t pad_y = params->pad_top; - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = 
in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w) ) { - csi_depthwise_separable_conv_HWC_q7(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + if ((in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w)) { + shl_depthwise_separable_conv_HWC_q7(input_data, in_h, in_c, kernel_data, out_c, kernel_h, + pad_y, stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } else { - csi_depthwise_separable_conv_HWC_q7_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, - kernel_w, kernel_h, pad_x, pad_y, stride_h, stride_w, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + shl_depthwise_separable_conv_HWC_q7_nonsquare( + input_data, in_w, in_h, in_c, kernel_data, out_c, kernel_w, kernel_h, pad_x, pad_y, + stride_h, stride_w, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, + out_w, out_h, buffer_tmp); } return CSINN_TRUE; } -int csi_ref_i805_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_ref_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } - if ( (input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0) ) { - if ( (input->dim[1] != input->dim[2]) || 
(kernel->dim[2] != kernel->dim[3]) || - (params->pad_left != params->pad_top) || (params->stride_height != params->stride_width) ) { + if ((input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0)) { + if ((input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || + (params->pad_left != params->pad_top) || + (params->stride_height != params->stride_width)) { flag |= 0x02; } } if (flag > 0) { - csi_debug_warning("conv2d q7 is not optimized to achieve under this condition on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q7 is not optimized to achieve under this condition on ref_i805, call " + "reference func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_ref_i805_conv2d_q7; + cb->exec = shl_i805_ref_conv2d_q7; } return CSINN_TRUE; } -int csi_ref_i805_conv2d_init_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_ref_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { - flag |= 0x02; + flag |= 0x02; } if (kernel->dim[2] != kernel->dim[3]) { flag |= 0x04; @@ -207,29 +204,32 @@ int csi_ref_i805_conv2d_init_q15(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("conv2d q15 is not optimized to achieve under this condition on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + 
shl_debug_warning( + "conv2d q15 is not optimized to achieve under this condition on ref_i805, call " + "reference func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_ref_i805_conv2d_q15; + cb->exec = shl_i805_ref_conv2d_q15; } return CSINN_TRUE; } -int csi_ref_i805_depthwise_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_ref_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } if (flag > 0) { - csi_debug_warning("depthwise_conv2d q7 is not optimized to achieve under this condition on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_depthwise_conv2d_quant; + shl_debug_warning( + "depthwise_conv2d q7 is not optimized to achieve under this condition on ref_i805, " + "call reference func replaced.\n"); + cb->exec = shl_ref_depthwise_conv2d_quant; } else { - params->base.bc = csi_ref_i805_depthwise_conv2d_q7; + cb->exec = shl_i805_ref_depthwise_conv2d_q7; } return CSINN_TRUE; } diff --git a/source/i805_ref/convolution/csi_convolve_1x1_HWC_q7_fast.c b/source/i805_ref/convolution/csi_convolve_1x1_HWC_q7_fast.c deleted file mode 100644 index f39ea334..00000000 --- a/source/i805_ref/convolution/csi_convolve_1x1_HWC_q7_fast.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_1x1_HWC_q7_fast_nonsquare.c - * Description: Fast Q7 version of 1x1 convolution (non-square shape) - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - -/** - * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. - * - * This function is optimized for convolution with 1x1 kernel size. 
- * It can be used for the second half of MobileNets [1] after depthwise - * separable convolution. - * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - * - * [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications - * https://arxiv.org/abs/1704.04861 - */ - -void csi_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x; - int16_t i_ch_out; - - /* ----------------------- - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - /* This part implements the im2col function */ - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in + - (i_out_y * dim_im_in_x + i_out_x) - * ch_im_in, pBuffer, - ch_im_in); - pBuffer += ch_im_in; - - if (pBuffer == bufferA + 2 * ch_im_in) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, - ch_im_in, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* check if there is left-over for compute */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) - { - q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + - 
NN_ROUND(out_shift); - q15_t *pB = bufferA; - /* basically each time it process 4 entries */ - uint16_t colCnt = ch_im_in >> 2; - - while (colCnt) - { - - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (const q7_t *)read_and_pad_reordered((void *)pA, &inA1, - &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = ch_im_in & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut = (q7_t) __SSAT((sum >> out_shift), 8); - pOut++; - - } - - } - -#else - - int i, j, k, l; - int conv_out; - int in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out_y; j++) - { - for (k = 0; k < dim_im_out_x; k++) - { - conv_out = ((q31_t)(bias[i]) << bias_shift) - + NN_ROUND(out_shift); - // if-for implementation - in_row = j; - in_col = k; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y - && in_col < dim_im_in_x) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in_x - + in_col) * ch_im_in + l] * - wt[i * ch_im_in + l]; - } - } - Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q15_basic.c b/source/i805_ref/convolution/csi_convolve_HWC_q15_basic.c deleted file mode 100644 index 60038362..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q15_basic.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q15_basic.c - * Description: Q15 version of convolution - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Basic Q15 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * bufferA size: ch_im_in*dim_kernel*dim_kernel - * - * This basic version is designed to work for any input tensor and weight - * dimension. 
- */ - -void -csi_convolve_HWC_q15_basic(const q15_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q15_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q15_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - uint16_t im2col_out_pixel_index = 0; - q15_t *pBuffer = bufferA; - q15_t *pOut = Im_out; - q15_t *im_buffer = bufferA; - const q15_t *pA; - int i; - - /* This part implements the im2col function */ - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* Filling 0 for out-of-bound paddings */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - memcpy(pBuffer, (q15_t *) Im_in - + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, - sizeof(q15_t)*ch_im_in); - } - pBuffer += ch_im_in; - } - } - - pA = wt; - for (i = 0; i < ch_im_out; i++) - { - q31_t sum = ((q31_t)bias[i] << bias_shift) - + NN_ROUND(out_shift); - q15_t *pB = im_buffer; - uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; - while (colCnt) - { - q31_t inA1 = *__SIMD32(pA)++; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inA2 = *__SIMD32(pA)++; - q31_t inB2 = *__SIMD32(pB)++; - - sum = __SMLAD(inA1, inB1, sum); - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; - while (colCnt) - { - q15_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut = (q15_t) __SSAT((sum >> out_shift), 16); 
- pOut++; - } - - /* counter reset */ - pBuffer = im_buffer; - im2col_out_pixel_index++; - } - } - -#else - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - in_row = stride * j + m - padding; - in_col = stride * k + n - padding; - if (in_row >= 0 && in_col >= 0 - && in_row < dim_im_in && in_col < dim_im_in) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + l] * - wt[i * ch_im_in * dim_kernel * dim_kernel - + (m * dim_kernel + n) * ch_im_in + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q15_t) __SSAT((conv_out >> out_shift), 16); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q15_fast.c b/source/i805_ref/convolution/csi_convolve_HWC_q15_fast.c deleted file mode 100644 index 835b4854..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q15_fast.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q15_fast.c - * Description: Fast Q15 version of convolution - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Fast Q15 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. 
- * - * @details - * - * Buffer size: - * - * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel - * - * Input dimension constraints: - * - * ch_im_in is multiple of 2 - * - * ch_im_out is multipe of 2 - * - */ - -void -csi_convolve_HWC_q15_fast(const q15_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q15_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q15_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - q15_t *pBuffer = bufferA; - q15_t *im_buffer = bufferA; - q15_t *pOut = Im_out; - - if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - /* This part implements the im2col function */ - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - memcpy(pBuffer, (q15_t *) Im_in - + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, - sizeof(q15_t)*ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (i_out_x & 0x1) - { - int i; - /* initialize the matrix pointers for A */ - const q15_t *pA = wt; - - /* set up the second output pointers */ - q15_t *pOut2 = pOut + ch_im_out; - - /* this loop over rows in A */ - for (i = 0; i < ch_im_out; i += 2) - { - /* setup pointers for B */ - q15_t *pB = im_buffer; - const q15_t *pB2 = pB + ch_im_in * dim_kernel * dim_kernel; - - /* aling the second pointer for A */ - const q15_t *pA2 
= pA + ch_im_in * dim_kernel * dim_kernel; - - /* init the sum with bias */ - q31_t sum = ((q31_t)bias[i] << bias_shift) - + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)bias[i] << bias_shift) - + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) - + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) - + NN_ROUND(out_shift); - - uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 1; - /* accumulate over the vector */ - while (colCnt) - { - q31_t inA1 = *__SIMD32(pA)++; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inA2 = *__SIMD32(pA2)++; - q31_t inB2 = *__SIMD32(pB2)++; - - sum = __SMLAD(inA1, inB1, sum); - sum2 = __SMLAD(inA1, inB2, sum2); - sum3 = __SMLAD(inA2, inB1, sum3); - sum4 = __SMLAD(inA2, inB2, sum4); - - colCnt--; - } /* while over colCnt */ - colCnt = ch_im_in * dim_kernel * dim_kernel & 0x1; - while (colCnt) - { - q15_t inA1 = *pA++; - q15_t inB1 = *pB++; - q15_t inA2 = *pA2++; - q15_t inB2 = *pB2++; - - sum += inA1 * inB1; - sum2 += inA1 * inB2; - sum3 += inA2 * inB1; - sum4 += inA2 * inB2; - colCnt--; - } /* while over colCnt */ - *pOut++ = (q15_t) __SSAT(sum >> out_shift, 16); - *pOut++ = (q15_t) __SSAT(sum3 >> out_shift, 16); - *pOut2++ = (q15_t) __SSAT(sum2 >> out_shift, 16); - *pOut2++ = (q15_t) __SSAT(sum4 >> out_shift, 16); - - /* skip the row computed with A2 */ - pA += ch_im_in * dim_kernel * dim_kernel; - } /* for over ch_im_out */ - - pOut += ch_im_out; - /* counter reset */ - pBuffer = im_buffer; - } - } - } - -#else - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - in_row = stride * j + m - 
padding; - in_col = stride * k + n - padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + l] * - wt[i * ch_im_in * dim_kernel * dim_kernel - + (m * dim_kernel + n) * ch_im_in + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q15_t) __SSAT((conv_out >> out_shift), 16); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q7_RGB.c b/source/i805_ref/convolution/csi_convolve_HWC_q7_RGB.c deleted file mode 100644 index c1e5f7aa..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q7_RGB.c +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q7_RGB.c - * Description: Q7 version of convolution for RGB image - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Q7 convolution function for RGB image - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. - * - * @details - * - * Buffer size: - * - * bufferA size: 2*3*dim_kernel*dim_kernel - * - * Input dimension constraints: - * - * ch_im_in equals 3 - * - * This kernel is written exclusively for convolution with ch_im_in - * equals 3. This applies on the first layer of CNNs which has input - * image with RGB format. 
- */ - -void -csi_convolve_HWC_q7_RGB(const q7_t * Im_in, - const uint16_t dim_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - /* - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - // This part implements the im2col function - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* Equivalent to csi_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */ - *__SIMD32(pBuffer) = 0x0; - *(pBuffer + 2) = 0; - pBuffer += 3; - } else - { - /* - * Equivalent to: - * csi_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3); - */ - - const q7_t *pPixel = Im_in + - (i_ker_y * dim_im_in + i_ker_x) * 3; - q31_t buf = *__SIMD32(pPixel); - - union csi_nnword top; - union csi_nnword bottom; - - top.word = __SXTB16(buf); - bottom.word = __SXTB16(__ROR(buf, 8)); - -#ifndef CSI_MATH_BIG_ENDIAN - /* - * little-endian, | omit | 3rd | 2nd | 1st | - * MSB LSB - * top | 3rd | 1st |; bottom | omit | 2nd | - * - * version 1, need to swap 2nd and 3rd weight - * *__SIMD32(pBuffer) = top.word; - * *(pBuffer+2) = bottom.half_words[0]; - * - * version 2, no weight shuffling required - */ - *pBuffer++ = top.half_words[0]; - 
*__SIMD32(pBuffer) = __PKHBT(bottom.word, top.word, 0); -#else - /* - * big-endian, | 1st | 2nd | 3rd | omit | - * MSB LSB - * top | 2nd | omit |; bottom | 1st | 3rd | - * - * version 1, need to swap 2nd and 3rd weight - * *__SIMD32(pBuffer) = bottom.word; - * *(pBuffer+2) = top.half_words[1]; - * - * version 2, no weight shuffling required - */ - *pBuffer++ = bottom.half_words[0]; - *__SIMD32(pBuffer) = __PKHTB(top.word, bottom.word, 0); -#endif - pBuffer += 2; - } - } - } - - if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15(wt, bufferA, - ch_im_out, - 3 * dim_kernel * dim_kernel, - bias_shift, out_shift, - bias, pOut); - - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* left-over because odd number of output pixels */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - int i; - - for (i = 0; i < ch_im_out; i++) - { - q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - q15_t *pB = bufferA; - /* basically each time it process 4 entries */ - uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2; - - while (colCnt) - { - - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = 3 * dim_kernel * dim_kernel & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - } - } -#else - - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - /* if-for implementation */ - in_row = stride * j + m - padding; - in_col = stride * k + n - 
padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - for (l = 0; l < 3; l++) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * 3 + l] * wt[i * 3 - * dim_kernel * dim_kernel + (m * dim_kernel - + n) * 3 + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q7_basic.c b/source/i805_ref/convolution/csi_convolve_HWC_q7_basic.c deleted file mode 100644 index 81f1d03b..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q7_basic.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q7_basic.c - * Description: Q7 version of convolution - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Basic Q7 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel - * - * This basic version is designed to work for any input tensor and weight - * dimension. 
- */ - -void -csi_convolve_HWC_q7_basic(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - /* - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - /* This part implements the im2col function */ - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* Filling 0 for out-of-bound paddings */ - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - /* Copying the pixel data to column */ - csi_q7_to_q15_no_shift((q7_t *)Im_in - + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - /* Computation is filed for every 2 columns */ - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15(wt, bufferA, - ch_im_out, - ch_im_in * - dim_kernel * dim_kernel, - bias_shift, out_shift, - bias, pOut); - - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* left-over because odd number of output pixels */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - int i; - - for (i = 0; i < ch_im_out; i++) - { - /* Load the accumulator 
with bias first */ - q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - - /* Point to the beging of the im2col buffer */ - q15_t *pB = bufferA; - - /* Each time it process 4 entries */ - uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; - - while (colCnt) - { - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - } - } -#else - - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - // if-for implementation - in_row = stride * j + m - padding; - in_col = stride * k + n - padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + l] * wt[i * ch_im_in - * dim_kernel * dim_kernel + (m * dim_kernel - + n) * ch_im_in + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q7_fast.c b/source/i805_ref/convolution/csi_convolve_HWC_q7_fast.c deleted file mode 100644 index 91c24c3d..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q7_fast.c +++ /dev/null @@ -1,428 +0,0 @@ -/* - * Copyright (C) 2016-2022 
T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q7_fast.c - * Description: Fast Q7 version of convolution - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Fast Q7 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. 
- * - * @details - * - * Buffer size: - * - * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel - * - * Input dimension constraints: - * - * ch_im_in is multiple of 4 ( because of the SIMD32 read and swap ) - * - * ch_im_out is multipe of 2 ( bacause 2x2 mat_mult kernel ) - * - * The im2col converts the Q7 tensor input into Q15 column, which is stored in - * bufferA. There is reordering happenning during this im2col process with - * csi_q7_to_q15_reordered_no_shift. For every four elements, the second and - * third elements are swapped. - * - * The computation kernel csi_nn_mat_mult_kernel_q7_q15_reordered does the - * GEMM computation with the reordered columns. - * - * To speed-up the determination of the padding condition, we split the - * computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}. - * This reduces the total number of boundary condition checks and improves - * the data copying performance. - */ - -void -csi_convolve_HWC_q7_fast(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - /* - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - /* - * Here we split the entire matrix into three regions depending on the padding situation - * Top: i_out_y from 0 to padding - 1 - * Middle: i_out_y from padding to dim_im_out-padding-1 - * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 - */ - - 
/* top part */ - for (i_out_y = 0; i_out_y < padding; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift - ((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) - * ch_im_in, pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* middle part, here we also divide the x into left, mid and right */ - for (; i_out_y < dim_im_out - padding; i_out_y++) - { - - /* left part */ - for (i_out_x = 0; i_out_x < padding; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift - ((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) - * ch_im_in, pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * 
dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - - /* mid part */ - for (; i_out_x < dim_im_out - padding; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * - dim_im_in + - i_out_x * - stride - padding) * ch_im_in, - pBuffer, - ch_im_in * dim_kernel); - pBuffer += ch_im_in * dim_kernel; - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - - /* right part */ - for (; i_out_x < dim_im_out; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift - ((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) - * ch_im_in, pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - for (; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y 
* stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift - ((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) - * ch_im_in, pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* check if there is left-over for compute */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - int i; - - for (i = 0; i < ch_im_out; i++) - { - q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - q15_t *pB = bufferA; - /* each time it process 4 entries */ - uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; - - while (colCnt) - { - - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (q7_t *) read_and_pad_reordered((void *)pA, &inA1, &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut = (q7_t) __SSAT((sum >> out_shift), 8); - pOut++; - - } - - } -#else - - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = (bias[i] << 
bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - // if-for implementation - in_row = stride * j + m - padding; - in_col = stride * k + n - padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += - Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + l] * wt[i * ch_im_in - * dim_kernel * dim_kernel + (m * dim_kernel - + n) * ch_im_in + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q7_fast_nonsquare.c b/source/i805_ref/convolution/csi_convolve_HWC_q7_fast_nonsquare.c deleted file mode 100644 index a3b0a6b6..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q7_fast_nonsquare.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q7_fast_nonsquare.c - * Description: Fast Q7 version of convolution (non-sqaure shape) - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - -/** - * @brief Fast Q7 convolution function (non-sqaure shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding size x - * @param[in] padding_y padding size y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - */ - -void csi_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel_x, - const uint16_t dim_kernel_y, - const uint16_t padding_x, - const uint16_t padding_y, - const uint16_t stride_x, - const uint16_t stride_y, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - /* ----------------------- - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - /* - * Here we split the entire matrix into three regions depending on the padding situation - * Top: i_out_y from 0 to padding - 1 - * Middle: i_out_y from padding to dim_im_out-padding-1 - * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 - */ - - /* top part */ - for (i_out_y = 0; i_out_y < padding_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in_y - || i_ker_x < 0 || i_ker_x >= dim_im_in_x) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - 
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* middle part, here we also divide the x into left, mid and right */ - for (; i_out_y < dim_im_out_y - padding_y; i_out_y++) - { - - /* left part */ - for (i_out_x = 0; i_out_x < padding_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - - /* mid part */ - for (; i_out_x < dim_im_out_x - padding_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in + - (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) - * ch_im_in, pBuffer, ch_im_in * 
dim_kernel_x); - pBuffer += ch_im_in * dim_kernel_x; - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - - /* right part */ - for (; i_out_x < dim_im_out_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - for (; i_out_y < dim_im_out_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in_y - || i_ker_x < 0 || i_ker_x >= dim_im_in_x) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * dim_im_in_x + i_ker_x) * 
ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* check if there is left-over for compute */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - int i; - for (i = 0; i < ch_im_out; i++) - { - q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - q15_t *pB = bufferA; - /* basically each time it process 4 entries */ - uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2; - - while (colCnt) - { - - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (const q7_t *)read_and_pad_reordered( - (void *)pA, &inA1, &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut = (q7_t) __SSAT((sum >> out_shift), 8); - pOut++; - - } - - } - -#else - int i, j, k, l, m, n; - int conv_out; - int in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out_y; j++) - { - for (k = 0; k < dim_im_out_x; k++) - { - conv_out = ((q31_t)(bias[i]) << bias_shift) - + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel_y; m++) - { - for (n = 0; n < dim_kernel_x; n++) - { - /* if-for implementation */ - in_row = stride_y * j + m - padding_y; - in_col = stride_x * k + n - padding_x; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y - && in_col < dim_im_in_x) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in_x - + in_col) * ch_im_in + l] * - wt[i * ch_im_in * dim_kernel_y - * dim_kernel_x + (m * dim_kernel_x + n) - * ch_im_in + l]; - } - } - } - } - Im_out[i + 
(j * dim_im_out_x + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7.c b/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7.c deleted file mode 100644 index 8df5e394..00000000 --- a/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7.c +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_depthwise_separable_conv_HWC_q7.c - * Description: Q7 depthwise separable convolution function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - -/** - * @brief Q7 depthwise separable convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. 
- * - * @details - * - * Buffer size: - * - * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel - * - * Input dimension constraints: - * - * ch_im_in equals ch_im_out - * - * Implementation: - * There are 3 nested loop here: - * Inner loop: calculate each output value with MAC instruction over an accumulator - * Mid loop: loop over different output channel - * Outer loop: loop over different output (x, y) - */ - -void csi_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x; - int16_t i_ker_y, i_ker_x; - q7_t *colBuffer = (q7_t *) bufferA; - q7_t *pBuffer = colBuffer; - const q7_t *pBias = bias; - q7_t *pOut = Im_out; - uint16_t rowCnt; - uint16_t row_shift; - - /* do some checking here, basically ch_im_in == ch_im_out */ - if (ch_im_in != ch_im_out) - { - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - /* we first do im2col here */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q7(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, ch_im_in); - } else - { - /* csi_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */ - memcpy(pBuffer, (q7_t *) Im_in + (i_ker_y * dim_im_in - + i_ker_x) * ch_im_in, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - /* we will do the computation here for each channel 
*/ - rowCnt = ch_im_out >> 2; - row_shift = 0; - pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - - uint16_t colCnt = (dim_kernel * dim_kernel) >> 1; - q7_t *pB = colBuffer + row_shift; - const q7_t *pA = wt + row_shift; - row_shift += 4; - - while (colCnt) - { - q31_t inA1, inA2, inB1, inB2, opA, opB; - - inB1 = *__SIMD32(pB); - pB += ch_im_in; - opB = *__SIMD32(pB); - pB += ch_im_in; - inB2 = __PKHTB(opB, inB1, 16); - inB1 = __PKHBT(inB1, opB, 16); - inA1 = *__SIMD32(pA); - pA += ch_im_in; - opB = *__SIMD32(pA); - pA += ch_im_in; - inA2 = __PKHTB(opB, inA1, 16); - inA1 = __PKHBT(inA1, opB, 16); - opA = __SXTB16(inA1); - opB = __SXTB16(inB1); - sum = __SMLAD(opA, opB, sum); - opA = __SXTB16(__ROR(inA1, 8)); - opB = __SXTB16(__ROR(inB1, 8)); - sum2 = __SMLAD(opA, opB, sum2); - opA = __SXTB16(inA2); - opB = __SXTB16(inB2); - sum3 = __SMLAD(opA, opB, sum3); - opA = __SXTB16(__ROR(inA2, 8)); - opB = __SXTB16(__ROR(inB2, 8)); - sum4 = __SMLAD(opA, opB, sum4); - colCnt--; - } - - colCnt = (dim_kernel * dim_kernel) & 0x1; - while (colCnt) - { - union csi_nnword inA, inB; - inA.word = *__SIMD32(pA); - pA += ch_im_in; - inB.word = *__SIMD32(pB); - pB += ch_im_in; - sum += inA.bytes[0] * inB.bytes[0]; - sum2 += inA.bytes[1] * inB.bytes[1]; - sum3 += inA.bytes[2] * inB.bytes[2]; - sum4 += inA.bytes[3] * inB.bytes[3]; - colCnt--; - } - - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - rowCnt--; - } - - rowCnt = ch_im_out & 0x3; - while (rowCnt) - { - q7_t *pB = colBuffer + row_shift; - const q7_t *pA = wt + row_shift; - q31_t sum = 
((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - uint16_t colCnt = (dim_kernel * dim_kernel); - - row_shift += 1; - - while (colCnt) - { - q7_t A1 = *pA; - q7_t B1 = *pB; - pA += ch_im_in; - pB += ch_im_in; - sum += A1 * B1; - - colCnt--; - } - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - rowCnt--; - } - - /* clear counter and pointers */ - pBuffer = colBuffer; - } - } - -#else - int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y; - int conv_out; - - /* do some checking here, basically ch_im_in == ch_im_out */ - if (ch_im_in != ch_im_out) - { - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) - { - // for each output - conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) - + NN_ROUND(out_shift); - for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++) - { - for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++) - { - int in_row = stride * i_out_y + i_ker_y - padding; - int in_col = stride * i_out_x + i_ker_x - padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + i_ch_out] - * wt[(i_ker_y * dim_kernel + i_ker_x) - * ch_im_out + i_ch_out]; - } - } - } - Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] - = (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; - -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7_nonsquare.c b/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7_nonsquare.c deleted file mode 100644 index 4a491fa9..00000000 --- a/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7_nonsquare.c +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_depthwise_separable_conv_HWC_q7_nonsquare.c - * Description: Q7 depthwise separable convolution function (non-square shape) - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - -/** - * @brief Q7 depthwise separable convolution function (non-square shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding sizes x - * @param[in] padding_y padding sizes y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * 
@param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. - * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 2 - * ch_im_out is multiple of 2 - */ - -void -csi_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel_x, - const uint16_t dim_kernel_y, - const uint16_t padding_x, - const uint16_t padding_y, - const uint16_t stride_x, - const uint16_t stride_y, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - -/* - * Implementation: - * There are 3 nested loop here: - * Inner loop: calculate each output value with MAC instruction over an accumulator - * Mid loop: loop over different output channel - * Outer loop: loop over different output (x, y) - * - */ - - int16_t i_out_y, i_out_x; - int16_t i_ker_y, i_ker_x; - q7_t *colBuffer = (q7_t *) bufferA; - q7_t *pBuffer = colBuffer; - const q7_t *pBias = bias; - q7_t *pOut = Im_out; - uint16_t rowCnt; - uint16_t row_shift; - - /* do some checking here, basically ch_im_in == ch_im_out */ - if (ch_im_in != ch_im_out) - { - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - /* we first do im2col here */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in_y - || i_ker_x < 0 || 
i_ker_x >= dim_im_in_x) - { - /* csi_fill_q7(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, ch_im_in); - } else - { - /* csi_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */ - memcpy(pBuffer, (q7_t *) Im_in + - (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, - ch_im_in); - } - pBuffer += ch_im_in; - } - } - - /* we will do the computation here for each channel */ - rowCnt = ch_im_out >> 2; - row_shift = 0; - pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - - uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1; - q7_t *pB = colBuffer + row_shift; - const q7_t *pA = wt + row_shift; - row_shift += 4; - - while (colCnt) - { - q31_t inA1, inA2, inB1, inB2, opA, opB; - - inB1 = *__SIMD32(pB); - pB += ch_im_in; - opB = *__SIMD32(pB); - pB += ch_im_in; - inB2 = __PKHTB(opB, inB1, 16); - inB1 = __PKHBT(inB1, opB, 16); - inA1 = *__SIMD32(pA); - pA += ch_im_in; - opB = *__SIMD32(pA); - pA += ch_im_in; - inA2 = __PKHTB(opB, inA1, 16); - inA1 = __PKHBT(inA1, opB, 16); - opA = __SXTB16(inA1); - opB = __SXTB16(inB1); - sum = __SMLAD(opA, opB, sum); - opA = __SXTB16(__ROR(inA1, 8)); - opB = __SXTB16(__ROR(inB1, 8)); - sum2 = __SMLAD(opA, opB, sum2); - opA = __SXTB16(inA2); - opB = __SXTB16(inB2); - sum3 = __SMLAD(opA, opB, sum3); - opA = __SXTB16(__ROR(inA2, 8)); - opB = __SXTB16(__ROR(inB2, 8)); - sum4 = __SMLAD(opA, opB, sum4); - colCnt--; - } - - colCnt = (dim_kernel_x * dim_kernel_y) & 0x1; - while (colCnt) - { - union csi_nnword inA, inB; - inA.word = *__SIMD32(pA); - pA += ch_im_in; - inB.word = *__SIMD32(pB); - pB += ch_im_in; - sum += inA.bytes[0] * inB.bytes[0]; - sum2 += inA.bytes[1] * inB.bytes[1]; - sum3 += inA.bytes[2] * inB.bytes[2]; - sum4 += 
inA.bytes[3] * inB.bytes[3]; - colCnt--; - } - - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - rowCnt--; - } - - rowCnt = ch_im_out & 0x3; - while (rowCnt) - { - q7_t *pB = colBuffer + row_shift; - const q7_t *pA = wt + row_shift; - q31_t sum = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - uint16_t colCnt = (dim_kernel_x * dim_kernel_y); - - row_shift += 1; - - while (colCnt) - { - q7_t A1 = *pA; - q7_t B1 = *pB; - pA += ch_im_in; - pB += ch_im_in; - sum += A1 * B1; - - colCnt--; - } - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - rowCnt--; - } - - // clear counter and pointers - pBuffer = colBuffer; - } - } - -#else - int i_out_y, i_out_x, i_ch_out; - int i_ker_y, i_ker_x; - - /* do some checking here, basically ch_im_in == ch_im_out */ - if (ch_im_in != ch_im_out) - { - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) - { - // for each output - int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) - + NN_ROUND(out_shift); - for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++) - { - for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++) - { - int in_row = stride_y * i_out_y + i_ker_y - padding_y; - int in_col = stride_x * i_out_x + i_ker_x - padding_x; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y - && in_col < dim_im_in_x) - { - conv_out += Im_in[(in_row * dim_im_in_x + in_col) - * ch_im_in + i_ch_out] * - wt[(i_ker_y * dim_kernel_x + i_ker_x) - * ch_im_out + i_ch_out]; - } - } - } - Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out - + i_ch_out] = (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - - /* Return to application */ - return; - -} - -/** - * @} end of NNConv group - */ diff --git 
a/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15.c b/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15.c deleted file mode 100644 index 5e2df5ec..00000000 --- a/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nn_mat_mult_kernel_q7_q15.c - * Description: Matrix-multiplication function for convolution - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - - /** - * @brief Matrix-multiplication function for convolution - * @param[in] pA pointer to operand A - * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - * @param[in] ch_im_out numRow of A - * @param[in] numCol_A numCol of A - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias the bias - * @param[in,out] pOut pointer to output - * @return The function returns the incremented output pointer - * - * @details - * - * This function does the matrix multiplication with weight matrix - * and 2 columns from im2col. 
- */ - -q7_t *csi_nn_mat_mult_kernel_q7_q15(const q7_t * pA, - const q15_t * pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut) -{ -#if defined (CSI_MATH_DSP) - /* set up the second output pointers */ - q7_t *pOut2 = pOut + ch_im_out; - const q7_t *pBias = bias; - - uint16_t rowCnt = ch_im_out >> 1; - /* this loop over rows in A */ - while (rowCnt) - { - /* setup pointers for B */ - const q15_t *pB = pInBuffer; - const q15_t *pB2 = pB + numCol_A; - - /* align the second pointer for A */ - const q7_t *pA2 = pA + numCol_A; - - /* init the sum with bias */ - q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = numCol_A >> 2; - /* accumulate over the vector */ - while (colCnt) - { - q31_t inA11, inA12, inA21, inA22; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inB2 = *__SIMD32(pB2)++; - - pA = (q7_t *) read_and_pad((void *)pA, &inA11, &inA12); - pA2 = (q7_t *) read_and_pad((void *)pA2, &inA21, &inA22); - - sum = __SMLAD(inA11, inB1, sum); - sum2 = __SMLAD(inA11, inB2, sum2); - sum3 = __SMLAD(inA21, inB1, sum3); - sum4 = __SMLAD(inA21, inB2, sum4); - - inB1 = *__SIMD32(pB)++; - inB2 = *__SIMD32(pB2)++; - - sum = __SMLAD(inA12, inB1, sum); - sum2 = __SMLAD(inA12, inB2, sum2); - sum3 = __SMLAD(inA22, inB1, sum3); - sum4 = __SMLAD(inA22, inB2, sum4); - - colCnt--; - } /* while over colCnt */ - colCnt = numCol_A & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - q7_t inA2 = *pA2++; - q15_t inB2 = *pB2++; - - sum += inA1 * inB1; - sum2 += inA1 * inB2; - sum3 += inA2 * inB1; - sum4 += inA2 * inB2; - colCnt--; - } /* while over colCnt */ - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut++ = (q7_t) 
__SSAT((sum3 >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - /* skip the row computed with A2 */ - pA += numCol_A; - rowCnt--; - } /* for over ch_im_out */ - - /* compute left-over row if any */ - if (ch_im_out & 0x1) - { - /* setup pointers for B */ - const q15_t *pB = pInBuffer; - const q15_t *pB2 = pB + numCol_A; - - /* load the bias */ - q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = numCol_A >> 2; - while (colCnt) - { - q31_t inA11, inA12; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inB2 = *__SIMD32(pB2)++; - - pA = (q7_t *) read_and_pad((void *)pA, &inA11, &inA12); - - sum = __SMLAD(inA11, inB1, sum); - sum2 = __SMLAD(inA11, inB2, sum2); - - inB1 = *__SIMD32(pB)++; - inB2 = *__SIMD32(pB2)++; - sum = __SMLAD(inA12, inB1, sum); - sum2 = __SMLAD(inA12, inB2, sum2); - - colCnt--; - } - colCnt = numCol_A & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - q15_t inB2 = *pB2++; - - sum += inA1 * inB1; - sum2 += inA1 * inB2; - colCnt--; - } - - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - } - - pOut += ch_im_out; - - /* return the new output pointer with offset */ - return pOut; -#else - /* To be completed */ - return NULL; -#endif /* CSI_MATH_DSP */ - -} diff --git a/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15_reordered.c b/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15_reordered.c deleted file mode 100644 index 38a8090c..00000000 --- a/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15_reordered.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nn_mat_mult_kernel_q7_q15_reordered.c - * Description: Matrix-multiplication function for convolution with reordered columns - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - - /** - * @brief Matrix-multiplication function for convolution with reordered columns - * @param[in] pA pointer to operand A - * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - * @param[in] ch_im_out numRow of A - * @param[in] numCol_A numCol of A - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias the bias - * @param[in,out] pOut pointer to output - * @return The function returns the incremented output pointer - * - * @details - * - * This function assumes that data in pInBuffer are reordered - */ - -q7_t *csi_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA, - const q15_t * pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - /* set up the second output pointers */ - q7_t *pOut2 = pOut + ch_im_out; - int i; - - /* this loop over rows in A */ - for (i = 0; i < ch_im_out; i += 2) - { - /* setup 
pointers for B */ - const q15_t *pB = pInBuffer; - const q15_t *pB2 = pB + numCol_A; - - /* align the second pointer for A */ - const q7_t *pA2 = pA + numCol_A; - - /* init the sum with bias */ - q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = numCol_A >> 2; - /* accumulate over the vector */ - while (colCnt) - { - q31_t inA11, inA12, inA21, inA22; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inB2 = *__SIMD32(pB2)++; - - pA = (q7_t *) read_and_pad_reordered((void *)pA, &inA11, &inA12); - pA2 = (q7_t *) read_and_pad_reordered((void *)pA2, &inA21, &inA22); - - sum = __SMLAD(inA11, inB1, sum); - sum2 = __SMLAD(inA11, inB2, sum2); - sum3 = __SMLAD(inA21, inB1, sum3); - sum4 = __SMLAD(inA21, inB2, sum4); - - inB1 = *__SIMD32(pB)++; - inB2 = *__SIMD32(pB2)++; - - sum = __SMLAD(inA12, inB1, sum); - sum2 = __SMLAD(inA12, inB2, sum2); - sum3 = __SMLAD(inA22, inB1, sum3); - sum4 = __SMLAD(inA22, inB2, sum4); - - colCnt--; - } /* while over colCnt */ - colCnt = numCol_A & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - q7_t inA2 = *pA2++; - q15_t inB2 = *pB2++; - - sum += inA1 * inB1; - sum2 += inA1 * inB2; - sum3 += inA2 * inB1; - sum4 += inA2 * inB2; - colCnt--; - } /* while over colCnt */ - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - /* skip the row computed with A2 */ - pA += numCol_A; - } /* for over ch_im_out */ - - pOut += ch_im_out; - - /* return the new output pointer with offset */ - return pOut; -#else - /* To be completed */ - return NULL; -#endif /* CSI_MATH_DSP */ -} diff --git 
a/source/i805_ref/convolution/shl_convolve_1x1_HWC_q7_fast.c b/source/i805_ref/convolution/shl_convolve_1x1_HWC_q7_fast.c new file mode 100644 index 00000000..0fa44da2 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_1x1_HWC_q7_fast.c @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_1x1_HWC_q7_fast.c + * Description: Fast Q7 version of 1x1 convolution (non-square shape) + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Fast Q7 version of 1x1 convolution (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA
pointer to buffer space for input + * @return None: the function is declared void and produces no status code. + * This implementation performs no size checking. + * (The ch_im_in and ch_im_out constraints below are not verified here.) + * + * This function is optimized for convolution with 1x1 kernel size. + * It can be used for the second half of MobileNets [1] after depthwise + * separable convolution. + * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + * + * [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications + * https://arxiv.org/abs/1704.04861 + */ + +void shl_convolve_1x1_HWC_q7_fast(const q7_t* Im_in, const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, const uint16_t ch_im_in, + const q7_t* wt, const uint16_t ch_im_out, const q7_t* bias, + const uint16_t bias_shift, const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, + q15_t* bufferA) +{ + int i, j, k, l; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out_y; j++) { + for (k = 0; k < dim_im_out_x; k++) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + // 1x1 kernel: the input pixel read is the output pixel (j, k) + in_row = j; + in_col = k; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in + l]; + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q15_basic.c b/source/i805_ref/convolution/shl_convolve_HWC_q15_basic.c new file mode 100644 index 00000000..fbe2718c --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q15_basic.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q15_basic.c + * Description: Q15 version of convolution + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns CSI_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * bufferA size: ch_im_in*dim_kernel*dim_kernel + * + * This basic version is designed to work for any input tensor and weight + * dimension. 
+ */ + +void shl_convolve_HWC_q15_basic(const q15_t* Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t* wt, const uint16_t ch_im_out, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const q15_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t* Im_out, const uint16_t dim_im_out, + q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + int in_row, in_col; /* plain int: a signed char would wrap for coordinates above 127 */ + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + in_row = stride * j + m - padding; /* negative inside the top padding border */ + in_col = stride * k + n - padding; /* negative inside the left padding border */ + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q15_fast.c b/source/i805_ref/convolution/shl_convolve_HWC_q15_fast.c new file mode 100644 index 00000000..39089e78 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q15_fast.c @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q15_fast.c + * Description: Fast Q15 version of convolution + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Fast Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size + * checking. 
+ * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * Input dimension constraints: + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multiple of 2 + * + */ + +void shl_convolve_HWC_q15_fast(const q15_t* Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t* wt, const uint16_t ch_im_out, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const q15_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t* Im_out, const uint16_t dim_im_out, + q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + int in_row, in_col; /* plain int: a signed char would wrap for coordinates above 127 */ + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) { + /* channel counts violate the constraints above: return without writing Im_out */ + return; + } + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + in_row = stride * j + m - padding; /* negative inside the top padding border */ + in_col = stride * k + n - padding; /* negative inside the left padding border */ + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q7_RGB.c b/source/i805_ref/convolution/shl_convolve_HWC_q7_RGB.c new file mode 100644 index 00000000..2cb834a6 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q7_RGB.c @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q7_RGB.c + * Description: Q7 version of convolution for RGB image + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 convolution function for RGB image + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size + * checking. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*3*dim_kernel*dim_kernel + * + * Input dimension constraints: + * + * ch_im_in equals 3 + * + * This kernel is written exclusively for convolution with ch_im_in + * equals 3. 
This applies on the first layer of CNNs which has input + * image with RGB format. + */ + +void shl_convolve_HWC_q7_RGB(const q7_t* Im_in, const uint16_t dim_im_in, const q7_t* wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t* bias, + const uint16_t bias_shift, const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out, q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + int in_row, in_col; /* plain int: a signed char would wrap for coordinates above 127 */ + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + /* input coordinate for this kernel tap; out-of-range taps are skipped */ + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < 3; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * 3 + l] * + wt[i * 3 * dim_kernel * dim_kernel + + (m * dim_kernel + n) * 3 + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q7_basic.c b/source/i805_ref/convolution/shl_convolve_HWC_q7_basic.c new file mode 100644 index 00000000..b7e0d605 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q7_basic.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q7_basic.c + * Description: Q7 version of convolution + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Basic Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns CSI_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * This basic version is designed to work for any input tensor and weight + * dimension. 
+ */ + +void shl_convolve_HWC_q7_basic(const q7_t* Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const q7_t* wt, const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t* bias, + const uint16_t bias_shift, const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out, q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + int in_row, in_col; /* plain int: a signed char would wrap for coordinates above 127 */ + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + // input coordinate for this kernel tap; out-of-range taps are skipped + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q7_fast.c b/source/i805_ref/convolution/shl_convolve_HWC_q7_fast.c new file mode 100644 index 00000000..b3eee998 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q7_fast.c @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q7_fast.c + * Description: Fast Q7 version of convolution + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Fast Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size + * checking. 
+ * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * Input dimension constraints: + * + * ch_im_in is multiple of 4 ( because of the SIMD32 read and swap ) + * + * ch_im_out is multiple of 2 ( because 2x2 mat_mult kernel ) + * + * To speed-up the determination of the padding condition, we split the + * computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}. + * This reduces the total number of boundary condition checks and improves + * the data copying performance. + */ + +void shl_convolve_HWC_q7_fast(const q7_t* Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const q7_t* wt, const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t* bias, + const uint16_t bias_shift, const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out, q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + int in_row, in_col; /* plain int: a signed char would wrap for coordinates above 127 */ + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) { + /* channel counts violate the constraints above: return without writing Im_out */ + return; + } + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + // input coordinate for this kernel tap; out-of-range taps are skipped + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q7_fast_nonsquare.c b/source/i805_ref/convolution/shl_convolve_HWC_q7_fast_nonsquare.c
new file mode 100644 index 00000000..047d550c --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q7_fast_nonsquare.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q7_fast_nonsquare.c + * Description: Fast Q7 version of convolution (non-sqaure shape) + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Fast Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor 
+ * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @return None: the function is declared void and produces no status code. + * This implementation performs no size checking. + * (The ch_im_in and ch_im_out constraints below are not verified here.) + * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + +void shl_convolve_HWC_q7_fast_nonsquare( + const q7_t* Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, + const uint16_t ch_im_in, const q7_t* wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, + const uint16_t stride_x, const uint16_t stride_y, const q7_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t* Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t* bufferA) +{ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out_y; j++) { + for (k = 0; k < dim_im_out_x; k++) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) { + for (n = 0; n < dim_kernel_x; n++) { + /* input coordinate for this kernel tap; out-of-range taps are skipped */ + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && + in_col < dim_im_in_x) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + + (m * dim_kernel_x + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7.c b/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7.c new
file mode 100644 index 00000000..bf5835d4 --- /dev/null +++ b/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7.c @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_depthwise_separable_conv_HWC_q7.c + * Description: Q7 depthwise separable convolution function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 depthwise separable convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension (used for both rows and columns) + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input (not referenced by this implementation) + * @return Nothing (the function is declared void). When + * ch_im_in != ch_im_out the function returns before the
output loops run, + * so Im_out is left unwritten. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * Input dimension constraints: + * + * ch_im_in equals ch_im_out + * + * Implementation: + * There are three levels of nested loops here: + * Inner loops (i_ker_y, i_ker_x): accumulate one output value, skipping taps outside the input + * Mid loop (i_ch_out): loop over the output channels + * Outer loops (i_out_y, i_out_x): loop over the output (x, y) positions + */ + +void shl_depthwise_separable_conv_HWC_q7(const q7_t* Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t* wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out, q15_t* bufferA) +{ + int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y; + int conv_out; + + /* the kernel is indexed per channel, so ch_im_in must equal ch_im_out; bail out on mismatch */ + if (ch_im_in != ch_im_out) { + return; + } + + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) { + // for each output + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++) { + for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++) { + int in_row = stride * i_out_y + i_ker_y - padding; + int in_col = stride * i_out_x + i_ker_x - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7_nonsquare.c
b/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7_nonsquare.c new file mode 100644 index 00000000..43949203 --- /dev/null +++ b/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7_nonsquare.c @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_depthwise_separable_conv_HWC_q7_nonsquare.c + * Description: Q7 depthwise separable convolution function (non-square shape) + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 depthwise separable convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding sizes x + * @param[in] padding_y padding sizes y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of
left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input (not referenced by this implementation) + * @return Nothing (the function is declared void). When + * ch_im_in != ch_im_out the function returns before the output loops run, + * so Im_out is left unwritten. + * + * The code below is a plain nested-loop implementation; the only constraint + * it checks is ch_im_in == ch_im_out. Documented constraints (not checked here): + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + +void shl_depthwise_separable_conv_HWC_q7_nonsquare( + const q7_t* Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, + const uint16_t ch_im_in, const q7_t* wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, + const uint16_t stride_x, const uint16_t stride_y, const q7_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t* Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t* bufferA) +{ + int i_out_y, i_out_x, i_ch_out; + int i_ker_y, i_ker_x; + + /* the kernel is indexed per channel, so ch_im_in must equal ch_im_out; bail out on mismatch */ + if (ch_im_in != ch_im_out) { + return; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) { + // for each output + int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++) { + for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++) { + int in_row = stride_y * i_out_y + i_ker_y - padding_y; + int in_col = stride_x * i_out_x + i_ker_x - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && + in_col < dim_im_in_x) { + conv_out += + Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel_x +
i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15.c b/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15.c deleted file mode 100644 index f2e9d508..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_mat_q7_vec_q15.c - * Description: Mixed Q15-Q7 fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Mixed Q15-Q7 fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * Q7_Q15 version of the fully connected layer - * - * Weights are in q7_t and Activations are in q15_t - * - */ - -void -csi_fully_connected_mat_q7_vec_q15(const q15_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - - const q7_t *pB = pM; - const q7_t *pB2; - q15_t *pO = pOut; - const q7_t *pBias = bias; - const q15_t *pA = pV; - - uint16_t rowCnt = num_of_rows >> 1; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - pB2 = pB + dim_vec; - - while (colCnt) - { - q31_t inV, inM11, inM12, inM21, inM22; - pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12); - pB2 = (q7_t *) read_and_pad((void *)pB2, &inM21, &inM22); - - inV = *__SIMD32(pA)++; - - sum = __SMLAD(inV, inM11, sum); - sum2 = __SMLAD(inV, inM21, sum2); - - inV = *__SIMD32(pA)++; - - 
sum = __SMLAD(inV, inM12, sum); - sum2 = __SMLAD(inV, inM22, sum2); - - colCnt--; - } - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - q7_t inM2 = *pB2++; - - sum += inV * inM; - sum2 += inV * inM2; - colCnt--; - } /* while over colCnt */ - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16)); - - /*adjust the pointers and counters */ - pB += dim_vec; - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x1; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) - { - q31_t inV1, inV2, inM11, inM12; - - pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12); - - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM11, sum); - - inV2 = *__SIMD32(pA)++; - sum = __SMLAD(inV2, inM12, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - - rowCnt--; - } - -#else - int i, j; - - for (i = 0; i < num_of_rows; i++) - { - int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - for (j = 0; j < dim_vec; j++) - { - ip_out += pV[j] * pM[i * dim_vec + j]; - } - pOut[i] = (q15_t) __SSAT((ip_out >> out_shift), 16); - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15_opt.c b/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15_opt.c deleted file mode 100644 index 2df9659b..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15_opt.c +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_mat_q7_vec_q15_opt.c - * Description: Mixed Q15-Q7 opt fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Mixed Q15-Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * Q7_Q15 version of the fully connected layer - * - * Weights are in q7_t and Activations are in q15_t - * - * Limitation: x4 version requires weight reordering to work - * - * Here we use only one pointer to read 4 rows in the weight - * matrix. 
So if the original q7_t matrix looks like this: - * - * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | - * - * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | - * - * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | - * - * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | - * - * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | - * - * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | - * - * We operates on multiple-of-4 rows, so the first four rows becomes - * - * | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 | - * - * | a13 | a23 | a14 | a24 | a33 | a43 | a34 | a44 | - * - * | a15 | a25 | a16 | a26 | a35 | a45 | a36 | a46 | - * - * The column left over will be in-order. - * which is: - * | a17 | a27 | a37 | a47 | - * - * For the left-over rows, we do 1x1 computation, so the data remains - * as its original order. - * - * So the stored weight matrix looks like this: - * - * | a11 | a21 | a12 | a22 | a31 | a41 | - * - * | a32 | a42 | a13 | a23 | a14 | a24 | - * - * | a33 | a43 | a34 | a44 | a15 | a25 | - * - * | a16 | a26 | a35 | a45 | a36 | a46 | - * - * | a17 | a27 | a37 | a47 | a51 | a52 | - * - * | a53 | a54 | a55 | a56 | a57 | a61 | - * - * | a62 | a63 | a64 | a65 | a66 | a67 | - * - */ - -void -csi_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - - const q7_t *pB = pM; - q15_t *pO = pOut; - const q7_t *pBias = bias; - const q15_t *pA = pV; - - uint16_t rowCnt = num_of_rows >> 2; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 1; - - pA = pV; - - while (colCnt) - { - q31_t inM11, inM12, 
inM13, inM14; - q31_t inV; - - inV = *__SIMD32(pA)++; - inM11 = *__SIMD32(pB)++; - inM12 = __SXTB16(__ROR(inM11, 8)); - inM11 = __SXTB16(inM11); - sum = __SMLAD(inM11, inV, sum); - sum2 = __SMLAD(inM12, inV, sum2); - inM13 = *__SIMD32(pB)++; - inM14 = __SXTB16(__ROR(inM13, 8)); - inM13 = __SXTB16(inM13); - sum3 = __SMLAD(inM13, inV, sum3); - sum4 = __SMLAD(inM14, inV, sum4); - colCnt--; - } - - colCnt = dim_vec & 0x1; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - q7_t inM2 = *pB++; - q7_t inM3 = *pB++; - q7_t inM4 = *pB++; - - sum += inV * inM; - sum2 += inV * inM2; - sum3 += inV * inM3; - sum4 += inV * inM4; - colCnt--; - } /* while over colCnt */ - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum3 >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum4 >> out_shift), 16)); - - /* adjust the pointers and counters */ - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) - { - q31_t inV1, inV2, inM11, inM12; - - pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12); - - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM11, sum); - - inV2 = *__SIMD32(pA)++; - sum = __SMLAD(inV2, inM12, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - - rowCnt--; - } - -#else - uint16_t rowCnt = num_of_rows >> 2; - const q7_t *pB = pM; - const q15_t *pA; - q15_t *pO = pOut; - const q7_t *pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + 
NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 1; - - pA = pV; - - while (colCnt) - { - q15_t inA1 = *pA++; - q15_t inA2 = *pA++; - - q7_t inB1 = *pB++; - q7_t inB3 = *pB++; - q7_t inB2 = *pB++; - q7_t inB4 = *pB++; - - sum += inA1 * inB1 + inA2 * inB2; - sum2 += inA1 * inB3 + inA2 * inB4; - - inB1 = *pB++; - inB3 = *pB++; - inB2 = *pB++; - inB4 = *pB++; - - sum3 += inA1 * inB1 + inA2 * inB2; - sum4 += inA1 * inB3 + inA2 * inB4; - - colCnt--; - } - - colCnt = dim_vec & 0x1; - while (colCnt) - { - q15_t inA = *pA++; - q7_t inB = *pB++; - sum += inA * inB; - inB = *pB++; - sum2 += inA * inB; - inB = *pB++; - sum3 += inA * inB; - inB = *pB++; - sum4 += inA * inB; - - colCnt--; - } - *pO++ = (q15_t) __SSAT((sum >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum2 >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum3 >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum4 >> out_shift), 16); - - rowCnt--; - } - - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - int j; - - pA = pV; - for (j = 0; j < dim_vec; j++) - { - q15_t inA = *pA++; - q7_t inB = *pB++; - ip_out += inA * inB; - } - *pO++ = (q15_t) __SSAT((ip_out >> out_shift), 16); - - rowCnt--; - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_q15.c b/source/i805_ref/fully-connect/csi_fully_connected_q15.c deleted file mode 100644 index 64cd0d8b..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_q15.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_q15.c - * Description: Q15 basic fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Q15 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return The function returns CSI_MATH_SUCCESS - * - */ - -void -csi_fully_connected_q15(const q15_t * pV, - const q15_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - - const q15_t *pB = pM; - const q15_t *pB2 = pB + dim_vec; - q15_t *pO = pOut; - const q15_t *pA; - const q15_t *pBias = bias; - uint16_t rowCnt = num_of_rows >> 1; - - /* this loop loops over different output */ - while (rowCnt) { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - pB2 = pB + dim_vec; - - while 
(colCnt) - { - q31_t inV1, inM1, inM2; - inV1 = *__SIMD32(pA)++; - inM1 = *__SIMD32(pB)++; - sum = __SMLAD(inV1, inM1, sum); - inM2 = *__SIMD32(pB2)++; - sum2 = __SMLAD(inV1, inM2, sum2); - - inV1 = *__SIMD32(pA)++; - inM1 = *__SIMD32(pB)++; - sum = __SMLAD(inV1, inM1, sum); - inM2 = *__SIMD32(pB2)++; - sum2 = __SMLAD(inV1, inM2, sum2); - - colCnt--; - } - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q15_t inM = *pB++; - q15_t inM2 = *pB2++; - - sum += inV * inM; - sum2 += inV * inM2; - colCnt--; - } /* while over colCnt */ - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum2>> out_shift), 16)); - - /* adjust the pointers and counters */ - pB = pB + dim_vec; - rowCnt --; - } - - rowCnt = num_of_rows & 0x1; - - while (rowCnt) { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) { - q31_t inV1, inM1; - inV1 = *__SIMD32(pA)++; - inM1 = *__SIMD32(pB)++; - sum = __SMLAD(inV1, inM1, sum); - - inV1 = *__SIMD32(pA)++; - inM1 = *__SIMD32(pB)++; - sum = __SMLAD(inV1, inM1, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while(colCnt) { - q15_t inV = *pA++; - q15_t inM = *pB++; - - sum += inV * inM; - - colCnt--; - } - - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - - rowCnt --; - } - -#else - int i, j; - - for (i = 0; i < num_of_rows; i++) - { - int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - for (j = 0; j < dim_vec; j++) - { - ip_out += pV[j] * pM[i * dim_vec + j]; - } - pOut[i] = (q15_t) __SSAT((ip_out >> out_shift), 16); - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_q15_opt.c b/source/i805_ref/fully-connect/csi_fully_connected_q15_opt.c deleted file mode 100644 index cb0b24b6..00000000 --- 
a/source/i805_ref/fully-connect/csi_fully_connected_q15_opt.c +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_q15_opt.c - * Description: Q15 opt fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Q15 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return The function returns CSI_MATH_SUCCESS - * - * - * @details - * - * Here we use only one pointer to read 4 rows in the weight - * matrix. 
So if the original matrix looks like this: - * - * | a11 | a12 | a13 | - * - * | a21 | a22 | a23 | - * - * | a31 | a32 | a33 | - * - * | a41 | a42 | a43 | - * - * | a51 | a52 | a53 | - * - * | a61 | a62 | a63 | - * - * We operates on multiple-of-4 rows, so the first four rows becomes - * - * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | - * - * | a13 | a23 | a33 | a43 | - * - * Remaining rows are kept the same original order. - * - * So the stored weight matrix looks like this: - * - * - * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | - * - * | a13 | a23 | a33 | a43 | a51 | a52 | a53 | a61 | - * - * | a62 | a63 | - */ - -void -csi_fully_connected_q15_opt(const q15_t * pV, - const q15_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - - const q15_t *pB = pM; - q15_t *pO = pOut; - const q15_t *pBias = bias; - const q15_t *pA = pV; - - uint16_t rowCnt = num_of_rows >> 2; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 1; - - pA = pV; - - while (colCnt) - { - q31_t inM11, inM12, inM13, inM14; - q31_t inV; - - inV = *__SIMD32(pA)++; - inM11 = *__SIMD32(pB)++; - sum = __SMLAD(inV, inM11, sum); - inM12 = *__SIMD32(pB)++; - sum2 = __SMLAD(inV, inM12, sum2); - inM13 = *__SIMD32(pB)++; - sum3 = __SMLAD(inV, inM13, sum3); - inM14 = *__SIMD32(pB)++; - sum4 = __SMLAD(inV, inM14, sum4); - colCnt--; - } - - colCnt = dim_vec & 0x1; - while (colCnt) - { - - q15_t inV = *pA++; - q15_t inM = *pB++; - q15_t inM2 = *pB++; - q15_t inM3 = *pB++; - q15_t inM4 = *pB++; - - sum += inV * inM; - sum2 += inV * inM2; - sum3 += inV * inM3; - sum4 += inV * 
inM4; - colCnt--; - } /* while over colCnt */ - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum3 >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum4 >> out_shift), 16)); - - /* adjust the pointers and counters */ - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) - { - q31_t inV1, inV2, inM1, inM2; - - inM1 = *__SIMD32(pB)++; - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM1, sum); - - inM2 = *__SIMD32(pB)++; - inV2 = *__SIMD32(pA)++; - sum = __SMLAD(inV2, inM2, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q15_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - - rowCnt--; - } - -#else - uint16_t rowCnt = num_of_rows >> 2; - const q15_t *pB = pM; - const q15_t *pA; - q15_t *pO = pOut; - const q15_t *pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 1; - - pA = pV; - while (colCnt) - { - q15_t inA1 = *pA++; - q15_t inA2 = *pA++; - - q15_t inB1 = *pB++; - q15_t inB2 = *pB++; - sum += inA1 * inB1 + inA2 * inB2; - - inB1 = *pB++; - inB2 = *pB++; - sum2 += inA1 * inB1 + inA2 * inB2; - - inB1 = *pB++; - inB2 = *pB++; - sum3 += inA1 * inB1 + inA2 * inB2; - - inB1 = *pB++; - inB2 = *pB++; - sum4 += inA1 * inB1 + inA2 * inB2; - - colCnt--; - } - colCnt = dim_vec & 0x1; - while (colCnt) - { - q15_t inA = *pA++; - q15_t inB = *pB++; - sum += inA * inB; - inB = 
*pB++; - sum2 += inA * inB; - inB = *pB++; - sum3 += inA * inB; - inB = *pB++; - sum4 += inA * inB; - colCnt--; - } - *pO++ = (q15_t) __SSAT((sum >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum2 >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum3 >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum4 >> out_shift), 16); - - rowCnt--; - } - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - int j; - - pA = pV; - for (j = 0; j < dim_vec; j++) - { - q15_t inA = *pA++; - q15_t inB = *pB++; - ip_out += inA * inB; - } - *pO++ = (q15_t) __SSAT((ip_out >> out_shift), 16); - - rowCnt--; - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_q7.c b/source/i805_ref/fully-connect/csi_fully_connected_q7.c deleted file mode 100644 index 60689c47..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_q7.c +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_q7.c - * Description: Q7 basic fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Q7 basic fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @param[in,out] vec_buffer pointer to buffer space for input - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * vec_buffer size: dim_vec - * - * This basic function is designed to work with regular weight - * matrix without interleaving. 
- * - */ - -void -csi_fully_connected_q7(const q7_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut) -{ - -#if 0//defined (CSI_MATH_DSP) - - const q7_t *pB = pM; - const q7_t *pB2; - q7_t *pO = pOut; - const q7_t *pBias = bias; - q15_t *pA; - q15_t vec_buffer[dim_vec*num_of_rows]; - uint16_t rowCnt = num_of_rows >> 1; - - /* expand the vector into the buffer */ - csi_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec); - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 2; - - pA = vec_buffer; - pB2 = pB + dim_vec; - - while (colCnt) - { - q31_t inV, inM11, inM12, inM21, inM22; - pB = (q7_t *) read_and_pad_reordered((void *)pB, &inM11, &inM12); - pB2 = (q7_t *) read_and_pad_reordered((void *)pB2, &inM21, &inM22); - - inV = *__SIMD32(pA)++; - - sum = __SMLAD(inV, inM11, sum); - sum2 = __SMLAD(inV, inM21, sum2); - - inV = *__SIMD32(pA)++; - - sum = __SMLAD(inV, inM12, sum); - sum2 = __SMLAD(inV, inM22, sum2); - - colCnt--; - } - colCnt = dim_vec & 0x3; - while (colCnt) - { - q7_t inV = *pA++; - q15_t inM = *pB++; - q15_t inM2 = *pB2++; - - sum += inV * inM; - sum2 += inV * inM2; - colCnt--; - } /* while over colCnt */ - *pO++ = (q7_t) (__SSAT((sum >> out_shift), 8)); - *pO++ = (q7_t) (__SSAT((sum2 >> out_shift), 8)); - - /* adjust the pointers and counters */ - pB += dim_vec; - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x1; - - while (rowCnt) - { - uint16_t colCnt = dim_vec >> 2; - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - pA = vec_buffer; - - while (colCnt) - { - q31_t inV1, inV2, inM11, inM12; - - pB = (q7_t *) read_and_pad_reordered((void *)pB, &inM11, &inM12); - - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM11, sum); - - inV2 = 
*__SIMD32(pA)++; - sum = __SMLAD(inV2, inM12, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q7_t inV = *pA++; - q15_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q7_t) (__SSAT((sum >> out_shift), 8)); - - rowCnt--; - } - -#else - int i, j; - - for (i = 0; i < num_of_rows; i++) - { - int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - for (j = 0; j < dim_vec; j++) - { - ip_out += pV[j] * pM[i * dim_vec + j]; - } - pOut[i] = (q7_t) __SSAT((ip_out >> out_shift), 8); - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_q7_opt.c b/source/i805_ref/fully-connect/csi_fully_connected_q7_opt.c deleted file mode 100644 index d712fe88..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_q7_opt.c +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_q7_opt.c - * Description: Q7 basic fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @param[in,out] vec_buffer pointer to buffer space for input - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * vec_buffer size: dim_vec - * - * This opt function is designed to work with interleaved weight - * matrix. The vector input is assumed in q7_t format, we call - * csi_q7_to_q15_no_shift_shuffle function to expand into - * q15_t format with certain weight re-ordering, refer to the function - * comments for more details. - * Here we use only one pointer to read 4 rows in the weight - * matrix. 
So if the original q7_t matrix looks like this: - * - * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | - * - * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | - * - * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | - * - * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | - * - * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | - * - * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | - * - * - * We operates on multiple-of-4 rows, so the first four rows becomes - * - * | a11 | a21 | a13 | a23 | a31 | a41 | a33 | a43 | - * - * | a12 | a22 | a14 | a24 | a32 | a42 | a34 | a44 | - * - * | a15 | a25 | a35 | a45 | a16 | a26 | a36 | a46 | - * - * So within the kernel, we first read the re-ordered vector in as: - * - * | b1 | b3 | and | b2 | b4 | - * - * the four q31_t weights will look like - * - * | a11 | a13 |, | a21 | a23 |, | a31 | a33 |, | a41 | a43 | - * - * | a12 | a14 |, | a22 | a24 |, | a32 | a34 |, | a42 | a44 | - * - * The column left over will be in-order. - * which is: - * - * | a17 | a27 | a37 | a47 | - * - * For the left-over rows, we do 1x1 computation, so the data remains - * as its original order. 
- * - * So the stored weight matrix looks like this: - * - * | a11 | a21 | a13 | a23 | a31 | a41 | - * - * | a33 | a43 | a12 | a22 | a14 | a24 | - * - * | a32 | a42 | a34 | a44 | a15 | a25 | - * - * | a35 | a45 | a16 | a26 | a36 | a46 | - * - * | a17 | a27 | a37 | a47 | a51 | a52 | - * - * | a53 | a54 | a55 | a56 | a57 | a61 | - * - * | a62 | a63 | a64 | a65 | a66 | a67 | - * - * - */ - -void -csi_fully_connected_q7_opt(const q7_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut) -{ - -#if 0//defined (CSI_MATH_DSP) - - const q7_t *pB = pM; - q7_t *pO = pOut; - const q7_t *pBias = bias; - q15_t *pA; - uint16_t rowCnt = num_of_rows >> 2; - - csi_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec); - - while (rowCnt) - { - - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = vec_buffer; - - while (colCnt) - { - q31_t inM11, inM12, inM13, inM14; - q31_t inV; - - inV = *__SIMD32(pA)++; - inM11 = *__SIMD32(pB)++; - inM12 = __SXTB16(__ROR(inM11, 8)); - inM11 = __SXTB16(inM11); - sum = __SMLAD(inM11, inV, sum); - sum2 = __SMLAD(inM12, inV, sum2); - inM13 = *__SIMD32(pB)++; - inM14 = __SXTB16(__ROR(inM13, 8)); - inM13 = __SXTB16(inM13); - sum3 = __SMLAD(inM13, inV, sum3); - sum4 = __SMLAD(inM14, inV, sum4); - - inV = *__SIMD32(pA)++; - inM11 = *__SIMD32(pB)++; - inM12 = __SXTB16(__ROR(inM11, 8)); - inM11 = __SXTB16(inM11); - sum = __SMLAD(inM11, inV, sum); - sum2 = __SMLAD(inM12, inV, sum2); - inM13 = *__SIMD32(pB)++; - inM14 = __SXTB16(__ROR(inM13, 8)); - inM13 = __SXTB16(inM13); - sum3 = __SMLAD(inM13, inV, sum3); - sum4 = __SMLAD(inM14, inV, sum4); - colCnt--; - } - - 
colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - q7_t inM2 = *pB++; - q7_t inM3 = *pB++; - q7_t inM4 = *pB++; - - sum += inV * inM; - sum2 += inV * inM2; - sum3 += inV * inM3; - sum4 += inV * inM4; - colCnt--; - } /* while over colCnt */ - *pO++ = (q7_t) (__SSAT((sum >> out_shift), 8)); - *pO++ = (q7_t) (__SSAT((sum2 >> out_shift), 8)); - *pO++ = (q7_t) (__SSAT((sum3 >> out_shift), 8)); - *pO++ = (q7_t) (__SSAT((sum4 >> out_shift), 8)); - - /* adjust the pointers and counters */ - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 2; - - pA = vec_buffer; - - while (colCnt) - { - q31_t inV1, inV2, inM11, inM12; - - pB = (q7_t *) read_and_pad_reordered((void *)pB, &inM11, &inM12); - - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM11, sum); - - inV2 = *__SIMD32(pA)++; - sum = __SMLAD(inV2, inM12, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q7_t) (__SSAT((sum >> out_shift), 8)); - - rowCnt--; - } - -#else - uint16_t rowCnt = num_of_rows >> 2; - const q7_t *pB = pM; - const q7_t *pA; - q7_t *pO = pOut; - const q7_t *pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) - { - q7_t inA1 = *pA++; - q7_t inA3 = *pA++; - q7_t inA2 = *pA++; - q7_t inA4 = *pA++; - - q7_t inB1 = *pB++; - q7_t inB3 = *pB++; - q7_t inB2 = *pB++; - q7_t inB4 = *pB++; - - sum += inA1 * inB1 + inA2 * inB2; - sum2 += inA1 * 
inB3 + inA2 * inB4; - - inB1 = *pB++; - inB3 = *pB++; - inB2 = *pB++; - inB4 = *pB++; - - sum3 += inA1 * inB1 + inA2 * inB2; - sum4 += inA1 * inB3 + inA2 * inB4; - - inB1 = *pB++; - inB3 = *pB++; - inB2 = *pB++; - inB4 = *pB++; - - sum += inA3 * inB1 + inA4 * inB2; - sum2 += inA3 * inB3 + inA4 * inB4; - - inB1 = *pB++; - inB3 = *pB++; - inB2 = *pB++; - inB4 = *pB++; - - sum3 += inA3 * inB1 + inA4 * inB2; - sum4 += inA3 * inB3 + inA4 * inB4; - - colCnt--; - } - colCnt = dim_vec & 0x3; - while (colCnt) - { - q7_t inA = *pA++; - q7_t inB = *pB++; - sum += inA * inB; - inB = *pB++; - sum2 += inA * inB; - inB = *pB++; - sum3 += inA * inB; - inB = *pB++; - sum4 += inA * inB; - - colCnt--; - } - *pO++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pO++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pO++ = (q7_t) __SSAT((sum3 >> out_shift), 8); - *pO++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - rowCnt--; - } - - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - int j; - - pA = pV; - for (j = 0; j < dim_vec; j++) - { - q7_t inA = *pA++; - q7_t inB = *pB++; - ip_out += inA * inB; - } - *pO++ = (q7_t) __SSAT((ip_out >> out_shift), 8); - - rowCnt--; - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15.c b/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15.c new file mode 100644 index 00000000..a0ed4d5c --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15.c @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_mat_q7_vec_q15.c + * Description: Mixed Q15-Q7 fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Mixed Q15-Q7 fully-connected layer function + * @param[in] pV pointer to input vector (dim_vec q15_t entries) + * @param[in] pM pointer to matrix weights (q7_t, row-major: num_of_rows rows of dim_vec entries) + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias (one q7_t entry per row) + * @param[out] pOut pointer to output vector (num_of_rows q15_t entries) + * @return none (the function is void) + * + * @details + * + * Buffer size: none (no scratch buffer is used) + * + * Q7_Q15 version of the fully connected layer + * + * Weights are in q7_t and Activations are in q15_t + * + * pOut[i] = __SSAT(((bias[i] << bias_shift) + NN_ROUND(out_shift) + sum_j(pV[j] * pM[i * dim_vec + j])) >> out_shift, 16) + */ + +void shl_fully_connected_mat_q7_vec_q15(const q15_t* pV, const q7_t* pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t* bias, q15_t* pOut) +{ + int i, j; + + for (i = 0; i < num_of_rows; i++) { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16); + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15_opt.c
b/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15_opt.c new file mode 100644 index 00000000..9f686636 --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15_opt.c @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_mat_q7_vec_q15_opt.c + * Description: Mixed Q15-Q7 opt fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Mixed Q15-Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return The function returns CSI_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * Q7_Q15 version of the fully connected layer + * + * Weights are in q7_t and Activations are in q15_t + * + * Limitation: x4 version requires weight reordering to work + * + * Here we use only one pointer to read 4 rows in the 
weight + * matrix. So if the original q7_t matrix looks like this: + * + * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | + * + * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | + * + * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | + * + * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | + * + * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | + * + * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | + * + * We operates on multiple-of-4 rows, so the first four rows becomes + * + * | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 | + * + * | a13 | a23 | a14 | a24 | a33 | a43 | a34 | a44 | + * + * | a15 | a25 | a16 | a26 | a35 | a45 | a36 | a46 | + * + * The column left over will be in-order. + * which is: + * | a17 | a27 | a37 | a47 | + * + * For the left-over rows, we do 1x1 computation, so the data remains + * as its original order. + * + * So the stored weight matrix looks like this: + * + * | a11 | a21 | a12 | a22 | a31 | a41 | + * + * | a32 | a42 | a13 | a23 | a14 | a24 | + * + * | a33 | a43 | a34 | a44 | a15 | a25 | + * + * | a16 | a26 | a35 | a45 | a36 | a46 | + * + * | a17 | a27 | a37 | a47 | a51 | a52 | + * + * | a53 | a54 | a55 | a56 | a57 | a61 | + * + * | a62 | a63 | a64 | a65 | a66 | a67 | + * + */ + +void shl_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q15_t *pOut) +{ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + + while (colCnt) { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q7_t inB1 = *pB++; + 
q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + colCnt--; + } + + colCnt = dim_vec & 0x1; + while (colCnt) { + q15_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q15_t)__SSAT((sum >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) { + q15_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t)__SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_q15.c b/source/i805_ref/fully-connect/shl_fully_connected_q15.c new file mode 100644 index 00000000..c1893df0 --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_q15.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_q15.c + * Description: Q15 basic fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q15 basic fully-connected layer function + * @param[in] pV pointer to input vector (dim_vec q15_t entries) + * @param[in] pM pointer to matrix weights (q15_t, row-major: num_of_rows rows of dim_vec entries) + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias (one q15_t entry per row) + * @param[out] pOut pointer to output vector (num_of_rows q15_t entries) + * @return none (the function is void) + * pOut[i] = __SSAT(((bias[i] << bias_shift) + NN_ROUND(out_shift) + sum_j(pV[j] * pM[i * dim_vec + j])) >> out_shift, 16) + */ + +void shl_fully_connected_q15(const q15_t* pV, const q15_t* pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t* bias, q15_t* pOut) +{ + int i, j; + + for (i = 0; i < num_of_rows; i++) { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16); + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_q15_opt.c b/source/i805_ref/fully-connect/shl_fully_connected_q15_opt.c new file mode 100644 index 00000000..6db17c6b --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_q15_opt.c @@ -0,0 +1,154 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_q15_opt.c + * Description: Q15 opt fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none (the function is void) + * + * + * @details + * + * Here we use only one pointer to read 4 rows in the weight + * matrix. So if the original matrix looks like this: + * + * | a11 | a12 | a13 | + * + * | a21 | a22 | a23 | + * + * | a31 | a32 | a33 | + * + * | a41 | a42 | a43 | + * + * | a51 | a52 | a53 | + * + * | a61 | a62 | a63 | + * + * We operate on multiple-of-4 rows, so the first four rows become + * + * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | + * + * | a13 | a23 | a33 | a43 | + * + * Remaining rows are kept in their original order.
+ * + * So the stored weight matrix looks like this: + * + * + * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | + * + * | a13 | a23 | a33 | a43 | a51 | a52 | a53 | a61 | + * + * | a62 | a63 | + */ + +void shl_fully_connected_q15_opt(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t *bias, q15_t *pOut) +{ + uint16_t rowCnt = num_of_rows >> 2; + const q15_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q15_t *pBias = bias; + + while (rowCnt) { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + while (colCnt) { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q15_t inB1 = *pB++; + q15_t inB2 = *pB++; + sum += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum2 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum3 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum4 += inA1 * inB1 + inA2 * inB2; + + colCnt--; + } + colCnt = dim_vec & 0x1; + while (colCnt) { + q15_t inA = *pA++; + q15_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + colCnt--; + } + *pO++ = (q15_t)__SSAT((sum >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + rowCnt = num_of_rows & 0x3; + + while (rowCnt) { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) { + q15_t inA = *pA++; + q15_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = 
(q15_t)__SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_q7.c b/source/i805_ref/fully-connect/shl_fully_connected_q7.c new file mode 100644 index 00000000..0fe8d120 --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_q7.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_q7.c + * Description: Q7 basic fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 basic fully-connected layer function + * @param[in] pV pointer to input vector (dim_vec q7_t entries) + * @param[in] pM pointer to matrix weights (q7_t, row-major: num_of_rows rows of dim_vec entries) + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias (one q7_t entry per row) + * @param[out] pOut pointer to output vector (num_of_rows q7_t entries) + * @note no vec_buffer argument is taken by this implementation + * @return none (the function is void) + * + * @details + * + * Buffer size: + * + * none (no scratch buffer is used) + * + * This basic function is designed to work with regular weight + * matrix without interleaving. + * pOut[i] = __SSAT(((bias[i] << bias_shift) + NN_ROUND(out_shift) + sum_j(pV[j] * pM[i * dim_vec + j])) >> out_shift, 8) + */ + +void shl_fully_connected_q7(const q7_t* pV, const q7_t* pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t* bias, q7_t* pOut) +{ + int i, j; + + for (i = 0; i < num_of_rows; i++) { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__SSAT((ip_out >> out_shift), 8); + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_q7_opt.c b/source/i805_ref/fully-connect/shl_fully_connected_q7_opt.c new file mode 100644 index 00000000..992379f9 --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_q7_opt.c @@ -0,0 +1,213 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_q7_opt.c + * Description: Q7 opt fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @note vec_buffer is not an argument of this reference implementation + * @return none (the function is void) + * + * @details + * + * Buffer size: + * + * none (this reference implementation uses no vec_buffer) + * + * This opt function is designed to work with interleaved weight + * matrix. The vector input is assumed in q7_t format, we call + * csi_q7_to_q15_no_shift_shuffle function to expand into + * q15_t format with certain weight re-ordering, refer to the function + * comments for more details. + * Here we use only one pointer to read 4 rows in the weight + * matrix.
So if the original q7_t matrix looks like this: + * + * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | + * + * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | + * + * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | + * + * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | + * + * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | + * + * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | + * + * + * We operate on multiple-of-4 rows, so the first four rows become + * + * | a11 | a21 | a13 | a23 | a31 | a41 | a33 | a43 | + * + * | a12 | a22 | a14 | a24 | a32 | a42 | a34 | a44 | + * + * | a15 | a25 | a35 | a45 | a16 | a26 | a36 | a46 | + * + * So within the kernel, we first read the re-ordered vector in as: + * + * | b1 | b3 | and | b2 | b4 | + * + * the four q31_t weights will look like + * + * | a11 | a13 |, | a21 | a23 |, | a31 | a33 |, | a41 | a43 | + * + * | a12 | a14 |, | a22 | a24 |, | a32 | a34 |, | a42 | a44 | + * + * The left-over column is stored in order, + * which is: + * + * | a17 | a27 | a37 | a47 | + * + * For the left-over rows, we do 1x1 computation, so the data remains + * in its original order.
+ * + * So the stored weight matrix looks like this: + * + * | a11 | a21 | a13 | a23 | a31 | a41 | + * + * | a33 | a43 | a12 | a22 | a14 | a24 | + * + * | a32 | a42 | a34 | a44 | a15 | a25 | + * + * | a35 | a45 | a16 | a26 | a36 | a46 | + * + * | a17 | a27 | a37 | a47 | a51 | a52 | + * + * | a53 | a54 | a55 | a56 | a57 | a61 | + * + * | a62 | a63 | a64 | a65 | a66 | a67 | + * + * + */ + +void shl_fully_connected_q7_opt(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q7_t *pOut) +{ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q7_t *pA; + q7_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) { + q7_t inA1 = *pA++; + q7_t inA3 = *pA++; + q7_t inA2 = *pA++; + q7_t inA4 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum += inA3 * inB1 + inA4 * inB2; + sum2 += inA3 * inB3 + inA4 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA3 * inB1 + inA4 * inB2; + sum4 += inA3 * inB3 + inA4 * inB4; + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) { + q7_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * 
inB; + + colCnt--; + } + *pO++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) { + q7_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q7_t)__SSAT((ip_out >> out_shift), 8); + + rowCnt--; + } + + return; +} diff --git a/source/i805_ref/fullyconnected.c b/source/i805_ref/fullyconnected.c index 97d3b70b..f9fe593e 100644 --- a/source/i805_ref/fullyconnected.c +++ b/source/i805_ref/fullyconnected.c @@ -16,39 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_fullyconnected_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_ref_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *weight_data = (q7_t *)weights->data; q7_t *bias_data = (q7_t *)bias->data; q7_t *output_data = (q7_t *)output->data; - csi_fully_connected_q7(input_data, weight_data, input->dim[1], weights->dim[0], + shl_fully_connected_q7(input_data, weight_data, input->dim[1], weights->dim[0], bias->qinfo->shift, output->qinfo->shift, bias_data, output_data); return CSINN_TRUE; } -int csi_ref_i805_fullyconnected_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_ref_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *weight_data = (q15_t *)weights->data; q15_t *bias_data = (q15_t *)bias->data; q15_t *output_data = (q15_t *)output->data; - csi_fully_connected_q15(input_data, weight_data, input->dim[1], weights->dim[0], + shl_fully_connected_q15(input_data, weight_data, input->dim[1], weights->dim[0], bias->qinfo->shift, output->qinfo->shift, bias_data, output_data); return CSINN_TRUE; } diff --git a/include/include_xt800/csi_nnfunctions.h b/source/i805_ref/i805_ref_function.h similarity index 92% rename from include/include_xt800/csi_nnfunctions.h rename to source/i805_ref/i805_ref_function.h index 1f530e74..ae124075 100644 --- a/include/include_xt800/csi_nnfunctions.h +++ b/source/i805_ref/i805_ref_function.h @@ -17,20 +17,19 @@ */ /* ---------------------------------------------------------------------- - * Title: csi_nnfunctions.h + * Title: i805_ref_function.h * Description: Public header file for CSI NN Library * * -------------------------------------------------------------------- */ -#ifndef INCLUDE_INCLUDE_XT800_CSI_NNFUNCTIONS_H_ -#define INCLUDE_INCLUDE_XT800_CSI_NNFUNCTIONS_H_ +#ifndef SOURCE_I805_REF_I805_REF_FUNCTION_H_ +#define SOURCE_I805_REF_I805_REF_FUNCTION_H_ #ifdef __cplusplus extern "C" { #endif -#include "csi_instance.h" -#include "csi_nnsupportfunctions.h" +#include "nn-support/i805_ref_support.h" /** * @brief Struct for specifying activation function types @@ -61,7 +60,7 @@ typedef enum { * */ -void csi_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, +void shl_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, @@ -87,14 
+86,14 @@ void csi_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, cons * */ -void csi_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, +void shl_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); -void csi_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, +void shl_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, @@ -130,7 +129,7 @@ void csi_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, * ch_im_out is multiple of 2 */ -void csi_convolve_HWC_q7_fast_nonsquare( +void shl_convolve_HWC_q7_fast_nonsquare( const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, @@ -170,7 +169,7 @@ void csi_convolve_HWC_q7_fast_nonsquare( * ch_im_in is multiple of 4 * ch_im_out is multiple of 2 */ -void csi_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, +void shl_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, @@ -200,7 +199,7 @@ void csi_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, * image with RGB format. 
*/ -void csi_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, +void shl_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, @@ -230,7 +229,7 @@ void csi_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const * ch_im_out is multiple of 2 */ -void csi_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, +void shl_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, @@ -266,7 +265,7 @@ void csi_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_i * ch_im_in is multiple of 2 * ch_im_out is multiple of 2 */ -void csi_depthwise_separable_conv_HWC_q7_nonsquare( +void shl_depthwise_separable_conv_HWC_q7_nonsquare( const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, @@ -287,7 +286,7 @@ void csi_depthwise_separable_conv_HWC_q7_nonsquare( * @return none. 
*/ -void csi_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, +void shl_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q7_t *pOut); @@ -305,7 +304,7 @@ void csi_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_v * */ -void csi_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, +void shl_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q15_t *bias, q15_t *pOut); @@ -323,7 +322,7 @@ void csi_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t di * */ -void csi_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, +void shl_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q15_t *pOut); @@ -334,7 +333,7 @@ void csi_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, const u * @return none. */ -void csi_relu_q7(q7_t *data, uint16_t size); +void shl_relu_q7(q7_t *data, uint16_t size); /** * @brief Q15 RELU function @@ -343,7 +342,7 @@ void csi_relu_q7(q7_t *data, uint16_t size); * @return none. */ -void csi_relu_q15(q15_t *data, uint16_t size); +void shl_relu_q15(q15_t *data, uint16_t size); /** * @brief Q7 neural network activation function using direct table look-up @@ -354,8 +353,8 @@ void csi_relu_q15(q15_t *data, uint16_t size); * @return none. 
*/ -void csi_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, - csi_nn_activation_type type); +void shl_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, + csi_nn_activation_type type); /** * @brief Q15 neural network activation function using direct table look-up @@ -366,8 +365,8 @@ void csi_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, * @return none. */ -void csi_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, - csi_nn_activation_type type); +void shl_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, + csi_nn_activation_type type); /** * @brief Q7 max pooling function @@ -384,7 +383,7 @@ void csi_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_widt * */ -void csi_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, +void shl_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, q7_t *Im_out); @@ -403,11 +402,11 @@ void csi_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t * */ -void csi_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, +void shl_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, q7_t *Im_out); -void csi_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image +void shl_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image const uint16_t dim_im_in_x, // input image dimension const uint16_t dim_im_in_y, // input image dimension const uint16_t ch_im_in, // number of input image channels @@ -432,7 +431,7 @@ void csi_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image * */ -void csi_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); +void 
shl_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); /** * @brief Q15 softmax function @@ -443,10 +442,10 @@ void csi_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); * */ -void csi_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); +void shl_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); #ifdef __cplusplus } #endif -#endif // INCLUDE_INCLUDE_XT800_CSI_NNFUNCTIONS_H_ +#endif // SOURCE_I805_REF_I805_REF_FUNCTION_H_ diff --git a/source/i805_ref/maxpool.c b/source/i805_ref/maxpool.c index e3a840b2..63fa5239 100644 --- a/source/i805_ref/maxpool.c +++ b/source/i805_ref/maxpool.c @@ -16,39 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -static int csi_ref_i805_maxpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_i805_ref_maxpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - q7_t *input_data = (q7_t *)input->data; + q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. 
out_hw = output->dim[2] q7_t buffer_tmp[out_hw * out_hw * in_c]; // buffer_size = out_h * out_w * channel - csi_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, - params->stride_height, out_hw, buffer_tmp, output_data); + shl_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, + params->stride_height, out_hw, buffer_tmp, output_data); return CSINN_TRUE; } -int csi_ref_i805_maxpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_ref_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -61,10 +60,12 @@ int csi_ref_i805_maxpool2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("maxpool q7 is not optimized to achieve under this condition on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_maxpool2d_quant; + shl_debug_warning( + "maxpool q7 is not optimized to achieve under this condition on ref_i805, call " + "reference func replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; } else { - params->base.bc = csi_ref_i805_maxpool2d_q7; + cb->exec = shl_i805_ref_maxpool2d_q7; } return CSINN_TRUE; } diff --git a/source/i805_ref/nn-support/csi_nntables.c b/source/i805_ref/nn-support/csi_nntables.c deleted file mode 100644 index b5a5ad64..00000000 --- a/source/i805_ref/nn-support/csi_nntables.c +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nntables.c - * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ - -#include "csi_nnsupportfunctions.h" - -/** - * @brief tables for various activation functions - * - * This file include the declaration of common tables. 
- * Most of them are used for activation functions - * - * Assumption: - * Unified table: input is 3.x format, i.e, range of [-8, 8) - * sigmoid(8) = 0.9996646498695336 - * tanh(8) = 0.9999997749296758 - * The accuracy here should be good enough - * - * 2-stage HL table: - * - * The entire input range is divided into two parts: - * - * Low range table: 0x000x xxxx or 0x111x xxxx - * table entry will be the binary number excluding the first - * two digits, i.e., 0x0x xxxx or 0x1x xxxx - * - * - * - * High range table 0x0010 0000 -- 0x0111 1111 - * 0x1000 0000 -- 0x1101 1111 - * - * For positive numbers, table entry will be - * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 - * i.e., 0x0000 0000 - 0x0101 11111 - * - * same thing for the negative numbers, table entry will be - * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 - * i.e., 0x0110 0000 - 0x1011 1111 - */ - -const q7_t sigmoidTable_q7[256] = { - 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, - 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, - 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, - 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, - 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, - 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, - 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, - 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, - 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
0x01, 0x01, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, - 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, - 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, - 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, - 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, - 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, - 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, -}; - -const q15_t sigmoidTable_q15[256] = { - 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, - 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb, - 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, - 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 
0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, - 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, - 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, - 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, - 0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00, -}; - -const q15_t sigmoidLTable_q15[128] = { - 0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9, - 0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc, 0x4cd3, 0x4dc8, 0x4ebb, - 0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f, - 0x56ef, 0x57cd, 0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9, - 0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216, 0x62cc, - 0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c, - 0x68a6, 0x693d, 0x69d2, 0x6a63, 0x6af1, 0x6b7c, 0x6c05, 0x6c8a, - 0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051, - 0x0f42, 0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273, - 0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d, 0x162e, 0x16c3, - 0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2, - 0x1c81, 0x1d34, 0x1dea, 0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5, - 0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833, - 0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64, - 0x3053, 0x3145, 0x3238, 0x332d, 0x3424, 0x351b, 0x3615, 0x370f, - 0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00, -}; - -const q15_t sigmoidHTable_q15[192] = { - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 
0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, -}; - -const q7_t tanhTable_q7[256] = { - 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, - 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, - 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, - 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, - 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, - 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 
- 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, - 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, - 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, - 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, - 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, - 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, - 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, -}; - -const q15_t tanhTable_q15[256] = { - 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, - 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6, - 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, - 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 
0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, - 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, - 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, - 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, - 0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 0xf803, -}; - -const q15_t tanhLTable_q15[128] = { - 0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90, - 0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf, 0x3151, 0x34ae, 0x37f6, - 0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd, - 0x514d, 0x53a3, 0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4, - 0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37, 0x6b6e, - 0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e, - 0x73dc, 0x7490, 0x753a, 0x75da, 0x7672, 0x7701, 0x7788, 0x7807, - 0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b, - 0x849b, 0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710, - 0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26, 0x8ac6, 0x8b70, - 0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254, - 0x936b, 0x9492, 0x95c9, 
0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0, - 0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d, - 0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0, - 0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221, 0xd5a8, 0xd941, 0xdcec, - 0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00, -}; - -const q15_t tanhHTable_q15[192] = { - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, -}; diff --git a/source/i805_ref/nn-support/csi_q7_to_q15_no_shift.c 
b/source/i805_ref/nn-support/csi_q7_to_q15_no_shift.c deleted file mode 100644 index bae01450..00000000 --- a/source/i805_ref/nn-support/csi_q7_to_q15_no_shift.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_q7_to_q15_no_shift.c - * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ - -#include "csi_nnsupportfunctions.h" - -/** - * @ingroup groupSupport - */ - -/** - * @addtogroup nndata_convert - * @{ - */ - -/** - * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - *
- * 	pDst[n] = (q15_t) pSrc[n];   0 <= n < blockSize.
- * 
- * - */ - -void csi_q7_to_q15_no_shift(const q7_t * pSrc, q15_t * pDst, - uint32_t blockSize) -{ - const q7_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ - -#ifndef CSI_MATH_NO_SIMD - q31_t in; - q31_t in1, in2; - q31_t out1, out2; - - /*loop Unrolling */ - blkCnt = blockSize >> 2u; - - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ - while (blkCnt > 0u) - { - /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ - in = *__SIMD32(pIn)++; - - /* rotatate in by 8 and extend two q7_t values to q15_t values */ - in1 = __SXTB16(__ROR(in, 8)); - - /* extend remainig two q7_t values to q15_t values */ - in2 = __SXTB16(in); - -#ifndef CSI_MATH_BIG_ENDIAN - - out2 = __PKHTB(in1, in2, 16); - out1 = __PKHBT(in2, in1, 16); - -#else - - out1 = __PKHTB(in1, in2, 16); - out2 = __PKHBT(in2, in1, 16); - -#endif - - *__SIMD32(pDst)++ = out1; - *__SIMD32(pDst)++ = out2; - - /* Decrement the loop counter */ - blkCnt--; - } - - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4u; - -#else - - /* Loop over blockSize number of values */ - blkCnt = blockSize; - -#endif /* #ifndef CSI_MATH_CM0_FAMILY */ - - while (blkCnt > 0u) - { - /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ - *pDst++ = (q15_t) * pIn++; - - /* Decrement the loop counter */ - blkCnt--; - } - -} - -/** - * @} end of q7_to_x group - */ diff --git a/source/i805_ref/nn-support/csi_q7_to_q15_reordered_no_shift.c b/source/i805_ref/nn-support/csi_q7_to_q15_reordered_no_shift.c deleted file mode 100644 index c79ddb46..00000000 --- a/source/i805_ref/nn-support/csi_q7_to_q15_reordered_no_shift.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. 
All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_q7_to_q15_reordered_no_shift.c - * Description: Converts the elements of the Q7 vector to reordered Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ - -#include "csi_nnsupportfunctions.h" - -/** - * @ingroup groupSupport - */ - -/** - * @addtogroup nndata_convert - * @{ - */ - -/** - * @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * @details - * - * This function does the q7 to q15 expansion with re-ordering - * - *
- *                          |   A1   |   A2   |   A3   |   A4   |
- *
- *                           0      7 8     15 16    23 24    31
- * 
- * - * is converted into: - * - *
- *  |       A1       |       A3       |   and  |       A2       |       A4       |
- *
- *   0             15 16            31          0             15 16            31
- * 
- * - * - * This looks strange but is natural considering how sign-extension is done at - * assembly level. - * - * The expansion of other other oprand will follow the same rule so that the end - * results are the same. - * - * The tail (i.e., last (N % 4) elements) will still be in original order. - * - */ - -void csi_q7_to_q15_reordered_no_shift(const q7_t * pSrc, q15_t * pDst, - uint32_t blockSize) -{ - const q7_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ - -#ifndef CSI_MATH_NO_SIMD - q31_t in; - q31_t in1, in2; - - /*loop Unrolling */ - blkCnt = blockSize >> 2u; - - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ - while (blkCnt > 0u) - { - /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ - in = *__SIMD32(pIn)++; - - /* rotatate in by 8 and extend two q7_t values to q15_t values */ - in1 = __SXTB16(__ROR(in, 8)); - - /* extend remainig two q7_t values to q15_t values */ - in2 = __SXTB16(in); - -#ifndef CSI_MATH_BIG_ENDIAN - *__SIMD32(pDst)++ = in2; - *__SIMD32(pDst)++ = in1; -#else - *__SIMD32(pDst)++ = in1; - *__SIMD32(pDst)++ = in2; -#endif - - /* Decrement the loop counter */ - blkCnt--; - } - - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. 
*/ - blkCnt = blockSize % 0x4u; - -#else - - /* Loop over blockSize number of values */ - blkCnt = blockSize; - -#endif /* #ifndef CSI_MATH_CM0_FAMILY */ - - while (blkCnt > 0u) - { - /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ - *pDst++ = (q15_t) * pIn++; - - /* Decrement the loop counter */ - blkCnt--; - } - -} - -/** - * @} end of q7_to_x group - */ diff --git a/source/i805_ref/nn-support/i805_ref_support.h b/source/i805_ref/nn-support/i805_ref_support.h new file mode 100644 index 00000000..e472138d --- /dev/null +++ b/source/i805_ref/nn-support/i805_ref_support.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: csi_nnsupportfunctions.h + * Description: Public header file of support functions for CSI NN Library + * + * -------------------------------------------------------------------- */ + +#ifndef SOURCE_I805_REF_NN_SUPPORT_I805_REF_SUPPORT_H_ +#define SOURCE_I805_REF_NN_SUPPORT_I805_REF_SUPPORT_H_ + +#include +#include +#include +#include + +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. 
+ */ +typedef int16_t q15_t; + +/** + * @brief 32-bit fractional data type in 1.31 format. + */ +typedef int32_t q31_t; + +/** + * @brief tables for various activation functions + * + */ + +extern const q15_t sigmoidTable_q15[256]; +extern const q7_t sigmoidTable_q7[256]; + +extern const q7_t tanhTable_q7[256]; +extern const q15_t tanhTable_q15[256]; + +static inline int32_t __SSAT_8(int32_t x) /* saturate x to [-128, 127] */ +{ + int32_t res = x; + if (x > 0x7f) { + res = 0x7f; + } else if (x < -128) { + res = -128; + } + + return res; +} + +static inline int32_t __SSAT(int32_t val, uint32_t sat) /* saturate val to a signed sat-bit range; val is returned unchanged when sat is outside [1, 32] */ +{ + if ((sat >= 1U) && (sat <= 32U)) { + const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U); + const int32_t min = -1 - max; + + if (val > max) { + return max; + + } else if (val < min) { + return min; + } + } + + return val; +} + +static inline uint32_t __USAT(int32_t val, uint32_t sat) /* saturate val to [0, 2^sat - 1]; val is only cast to uint32_t when sat > 31 */ +{ + if (sat <= 31U) { + const uint32_t max = ((1U << sat) - 1U); + + if (val > (int32_t)max) { + return max; + + } else if (val < 0) { + return 0U; + } + } + + return (uint32_t)val; +} + +/** + * @brief rounding offset: half of 2^out_shift, or 0 when CSKY_NN_TRUNCATE is defined + */ +#ifndef CSKY_NN_TRUNCATE +#define NN_ROUND(out_shift) (0x1 << ((out_shift) - 1)) +#else +#define NN_ROUND(out_shift) 0 +#endif + +#endif // SOURCE_I805_REF_NN_SUPPORT_I805_REF_SUPPORT_H_ diff --git a/source/i805_ref/nn-support/shl_nntables.c b/source/i805_ref/nn-support/shl_nntables.c new file mode 100644 index 00000000..b72c12a1 --- /dev/null +++ b/source/i805_ref/nn-support/shl_nntables.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_nntables.c + * Description: Lookup tables for the sigmoid and tanh activation functions + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_support.h" + +/** + * @brief tables for various activation functions + * + * This file includes the definitions of common tables. + * Most of them are used for activation functions + * + * Assumption: + * Unified table: input is 3.x format, i.e., range of [-8, 8) + * sigmoid(8) = 0.9996646498695336 + * tanh(8) = 0.9999997749296758 + * The accuracy here should be good enough + * + * 2-stage HL table: + * + * The entire input range is divided into two parts: + * + * Low range table: 0x000x xxxx or 0x111x xxxx + * table entry will be the binary number excluding the first + * two digits, i.e., 0x0x xxxx or 0x1x xxxx + * + * + * + * High range table 0x0010 0000 -- 0x0111 1111 + * 0x1000 0000 -- 0x1101 1111 + * + * For positive numbers, table entry will be + * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 + * i.e., 0x0000 0000 - 0x0101 1111 + * + * same thing for the negative numbers, table entry will be + * 0x1000 0000 -- 0x1101 1111 minus 0x0010 0000 + * i.e., 0x0110 0000 - 0x1011 1111 + */ + +const q7_t sigmoidTable_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, + 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, + 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76,
0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, + 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, + 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, + 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + +const q15_t sigmoidTable_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, 0x4fad, 0x518a, 0x5360, 0x552c, + 0x56ef, 0x58a8, 0x5a57, 0x5bfb, 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, + 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, 0x70be, 0x7190, 0x7258, 0x7316, + 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, + 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, + 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, + 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 
0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, + 0x7f55, 0x7f5f, 0x7f69, 0x7f72, 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, + 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, + 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, + 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, 0x000b, 0x000c, 0x000c, 0x000d, + 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, 0x0038, 0x003b, + 0x003f, 0x0043, 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, + 0x0119, 0x012b, 0x013e, 0x0152, 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, + 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, 0x03c1, 0x03fd, 0x043c, 0x0480, + 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, + 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, 0x0f42, 0x101e, 0x1105, 0x11f7, + 0x12f3, 0x13fb, 0x150f, 0x162e, 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, + 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, 0x3053, 0x3238, 0x3424, 0x3615, + 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + +const q7_t tanhTable_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, + 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, + 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, + 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + +const q15_t tanhTable_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, 0x3b27, 0x4142, 0x46fd, 0x4c56, + 0x514d, 0x55e2, 0x5a1a, 0x5df6, 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, + 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, 0x7b65, 0x7bee, 0x7c66, 0x7cd1, + 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, + 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, + 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, + 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, + 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, + 0x8005, 0x8006, 0x8006, 0x8007, 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, + 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, 0x803c, 0x8044, 0x804d, 0x8057, + 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, + 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, 0x849b, 0x8535, 0x85e2, 0x86a5, + 0x8781, 0x8878, 0x898e, 0x8ac6, 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, + 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, 0xc4d9, 0xcb52, 0xd221, 0xd941, + 0xe0a7, 0xe847, 0xf015, 0xf803, +}; diff --git a/source/i805_ref/pooling/csi_avepool_q7_HWC_nonsquare.c b/source/i805_ref/pooling/csi_avepool_q7_HWC_nonsquare.c deleted file mode 100644 index 8187e1d7..00000000 --- a/source/i805_ref/pooling/csi_avepool_q7_HWC_nonsquare.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "csi_nnfunctions.h" - -void csi_avepool_q7_HWC_nonsquare( - q7_t *Im_in, // input image - const uint16_t dim_im_in_x, // input image dimension - const uint16_t dim_im_in_y, // input image dimension - const uint16_t ch_im_in, // number of input image channels - const uint16_t dim_kernel_x, // window kernel size - const uint16_t dim_kernel_y, // window kernel size - const uint16_t padding_x, // padding sizes - const uint16_t padding_y, // padding sizes - const uint16_t stride_x, // stride - const uint16_t stride_y, // stride - const uint16_t dim_im_out_x, // output image dimension - const uint16_t dim_im_out_y, // output image dimension - q7_t *bufferA, // a buffer for local storage - q7_t *Im_out, // output feature - const uint16_t out_lshift) // output left shift (scaling) -{ -#if defined (CSI_MATH_DSP) - - q15_t *buffer = (q15_t *) bufferA; - int16_t i_x, i_y, i; - int16_t count = 0; - - /* first does the pooling along x axis */ - for (i_y = 0; i_y < dim_im_in_y; i_y++) - { - - for (i_x = 0; i_x < dim_im_out_x; i_x++) - { - /* for each output pixel */ - q7_t *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in; - q7_t *win_start; - q7_t *win_stop; - if (i_x * stride_x - padding_x < 0) - { - win_start = target; - } else - { - win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - - padding_x) * ch_im_in; - } - - if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x) - { - win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in; - } else - { - win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x - + 
dim_kernel_x) * ch_im_in; - } - - /* first step is to copy over initial data */ - csi_q7_to_q15_no_shift(win_start, buffer, ch_im_in); - count = 1; - - /* start the max operation from the second part */ - win_start += ch_im_in; - for (; win_start < win_stop; win_start += ch_im_in) - { - accumulate_q7_to_q15(buffer, win_start, ch_im_in); - count++; - } - buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); - } - } - - /* then does the pooling along y axis */ - for (i_y = 0; i_y < dim_im_out_y; i_y++) - { - /* for each output row */ - q7_t *target = Im_out + i_y * dim_im_out_x * ch_im_in; - q7_t *row_start; - q7_t *row_end; - /* setting the starting row */ - if (i_y * stride_y - padding_y < 0) - { - row_start = Im_in; - } else - { - row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in; - } - /* setting the stopping row */ - if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y) - { - row_end = Im_in + dim_im_in_x * dim_im_in_y * ch_im_in; - } else - { - row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) - * dim_im_in_x * ch_im_in; - } - - /* copy over the first row */ - csi_q7_to_q15_no_shift(row_start, buffer, dim_im_out_x * ch_im_in); - count = 1; - - /* move over to next row */ - row_start += ch_im_in * dim_im_in_x; - - for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in) - { - accumulate_q7_to_q15(buffer, row_start, dim_im_out_x * ch_im_in); - count++; - } - - /* out left shift */ - for(i = 0; i < dim_im_out_x * ch_im_in; i++) - { - buffer[i] = buffer[i] << out_lshift; - } - buffer_scale_back_q15_to_q7(buffer, target, - dim_im_out_x * ch_im_in, count); - } -#else - - int16_t i_ch_in, i_x, i_y; - int16_t k_x, k_y; - - for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) { - for (i_y = 0; i_y < dim_im_out_y; i_y++) { - for (i_x = 0; i_x < dim_im_out_x; i_x++) { - int sum = 0; - int count = 0; - for (k_y = i_y * stride_y - padding_y; - k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) { - for (k_x = i_x * 
stride_x - padding_x; - k_x < i_x * stride_x - padding_x + dim_kernel_x; - k_x++) { - if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && - k_x < dim_im_in_x) { - sum += Im_in[i_ch_in + - ch_im_in * (k_x + k_y * dim_im_in_x)]; - count++; - } - } - } - sum = __SSAT_8((sum << out_lshift) / count); - Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum; - } - } - } - -#endif -} diff --git a/source/i805_ref/pooling/csi_pool_q7_HWC.c b/source/i805_ref/pooling/csi_pool_q7_HWC.c deleted file mode 100644 index c5ee5760..00000000 --- a/source/i805_ref/pooling/csi_pool_q7_HWC.c +++ /dev/null @@ -1,472 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_pool_q7_HWC.c - * Description: Pooling function implementations - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -#if defined (CSI_MATH_DSP) - -void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, - uint16_t length, uint16_t scale) -{ - int i; - - for (i = 0; i < length; i++) - { - target[i] = (q7_t) __SSAT_8(buffer[i] / scale); - } -} - -void accumulate_q7_to_q15(q15_t * base, q7_t * target, - const uint16_t length) -{ - q15_t *pCnt = base; - q7_t *pV = target; - q31_t v1, v2, vo1, vo2; - uint16_t cnt = length >> 2; - q31_t in; - - while (cnt > 0u) - { - q31_t value = *__SIMD32(pV)++; - v1 = __SXTB16(__ROR(value, 8)); - v2 = __SXTB16(value); -#ifndef CSI_MATH_BIG_ENDIAN - - vo2 = __PKHTB(v1, v2, 16); - vo1 = __PKHBT(v2, v1, 16); - -#else - - vo1 = __PKHTB(v1, v2, 16); - vo2 = __PKHBT(v2, v1, 16); - -#endif - - in = *__SIMD32(pCnt); - *__SIMD32(pCnt)++ = __QADD16(vo1, in); - - in = *__SIMD32(pCnt); - *__SIMD32(pCnt)++ = __QADD16(vo2, in); - - cnt--; - } - cnt = length & 0x3; - while (cnt > 0u) - { - *pCnt++ += *pV++; - cnt--; - } -} - -static void compare_and_replace_if_larger_q7(q7_t * base, // base data - q7_t * target, // compare target - const uint16_t length // data length - ) -{ - q7_t *pIn = base; - q7_t *pCom = target; - union csi_nnword in; - union csi_nnword com; - uint16_t cnt = length >> 2; - - while (cnt > 0u) - { - in.word = *__SIMD32(pIn); - com.word = *__SIMD32(pCom)++; - - // if version - if (com.bytes[0] > in.bytes[0]) - in.bytes[0] = com.bytes[0]; - if (com.bytes[1] > in.bytes[1]) - in.bytes[1] = com.bytes[1]; - if (com.bytes[2] > in.bytes[2]) - in.bytes[2] = com.bytes[2]; - if (com.bytes[3] > in.bytes[3]) - in.bytes[3] = com.bytes[3]; - - *__SIMD32(pIn)++ = in.word; - - cnt--; - } - - cnt = length & 3u; - - while (cnt > 0u) - { - // if version - if (*pCom > *pIn) - *pIn = 
*pCom; - - *pIn++; - *pCom++; - - cnt--; - } - -} - -#endif // CSI_MATH_DSP - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Pooling - * @{ - */ - - /** - * @brief Q7 max pooling function - * @param[in, out] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @param[in,out] Im_out pointer to output tensor - * @return none. - * - * @details - * - * Buffer size: - * - * bufferA size: 0 - * - * The pooling function is implemented as split x-pooling then - * y-pooling. - * - * This pooling function is input-destructive. Input data is undefined - * after calling this function. - * - */ - -void -csi_maxpool2d_q7_HWC(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_x, i_y; - - /* first does the pooling along x axis */ - for (i_y = 0; i_y < dim_im_in; i_y++) - { - - for (i_x = 0; i_x < dim_im_out; i_x++) - { - /* for each output pixel */ - q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; - q7_t *win_start; - q7_t *win_stop; - if (i_x * stride - padding < 0) - { - win_start = target; - } else - { - win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) - * ch_im_in; - } - - if (i_x * stride - padding + dim_kernel >= dim_im_in) - { - win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; - } else - { - win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding - + dim_kernel) * ch_im_in; - } - - /* first step is to copy over initial data */ - /* csi_copy_q7(win_start, target, ch_im_in); */ - memmove(target, win_start, 
ch_im_in); - - /* start the max operation from the second part */ - win_start += ch_im_in; - for (; win_start < win_stop; win_start += ch_im_in) - { - compare_and_replace_if_larger_q7(target, win_start, ch_im_in); - } - } - } - - /* then does the pooling along y axis */ - for (i_y = 0; i_y < dim_im_out; i_y++) - { - - /* for each output row */ - q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; - q7_t *row_start; - q7_t *row_end; - /* setting the starting row */ - if (i_y * stride - padding < 0) - { - row_start = Im_in; - } else - { - row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; - } - /* setting the stopping row */ - if (i_y * stride - padding + dim_kernel >= dim_im_in) - { - row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; - } else - { - row_end = Im_in + (i_y * stride - padding + dim_kernel) - * dim_im_in * ch_im_in; - } - - /* copy over the first row */ - /* csi_copy_q7(row_start, target, dim_im_out * ch_im_in); */ - memmove(target, row_start, dim_im_out * ch_im_in); - - /* move over to next row */ - row_start += ch_im_in * dim_im_in; - - for (; row_start < row_end; row_start += dim_im_in * ch_im_in) - { - compare_and_replace_if_larger_q7(target, row_start, - dim_im_out * ch_im_in); - } - } - -#else - - int16_t i_ch_in, i_x, i_y; - int16_t k_x, k_y; - - for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) - { - for (i_y = 0; i_y < dim_im_out; i_y++) - { - for (i_x = 0; i_x < dim_im_out; i_x++) - { - int max = -129; - for (k_y = i_y * stride - padding; - k_y < i_y * stride - padding + dim_kernel; k_y++) - { - for (k_x = i_x * stride - padding; - k_x < i_x * stride - padding + dim_kernel; k_x++) - { - if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in - && k_x < dim_im_in) - { - if (Im_in[i_ch_in + ch_im_in - * (k_x + k_y * dim_im_in)] > max) - { - max = Im_in[i_ch_in + ch_im_in - * (k_x + k_y * dim_im_in)]; - } - } - } - } - Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; - } - } - } - -#endif /* CSI_MATH_DSP */ - -} - - /** - * 
@brief Q7 average pooling function - * @param[in,out] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @param[in,out] Im_out pointer to output tensor - * @return none. - * - * @details - * - * Buffer size: - * - * bufferA size: 2*dim_im_out*ch_im_in - * - * The pooling function is implemented as split x-pooling then - * y-pooling. - * - * This pooling function is input-destructive. Input data is undefined - * after calling this function. - * - */ - -void -csi_avepool_q7_HWC(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out) -{ - -#if defined (CSI_MATH_DSP) - - q15_t *buffer = (q15_t *) bufferA; - int16_t i_x, i_y; - int16_t count = 0; - - /* first does the pooling along x axis */ - for (i_y = 0; i_y < dim_im_in; i_y++) - { - - for (i_x = 0; i_x < dim_im_out; i_x++) - { - /* for each output pixel */ - q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; - q7_t *win_start; - q7_t *win_stop; - if (i_x * stride - padding < 0) - { - win_start = target; - } else - { - win_start = Im_in + (i_y * dim_im_in + i_x * stride - - padding) * ch_im_in; - } - - if (i_x * stride - padding + dim_kernel >= dim_im_in) - { - win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; - } else - { - win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding - + dim_kernel) * ch_im_in; - } - - /* first step is to copy over initial data */ - csi_q7_to_q15_no_shift(win_start, buffer, ch_im_in); - count = 1; - - /* start the max operation from the second part */ - win_start += ch_im_in; - for (; win_start 
< win_stop; win_start += ch_im_in) - { - accumulate_q7_to_q15(buffer, win_start, ch_im_in); - count++; - } - buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); - } - } - - /* then does the pooling along y axis */ - for (i_y = 0; i_y < dim_im_out; i_y++) - { - /* for each output row */ - q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; - q7_t *row_start; - q7_t *row_end; - /* setting the starting row */ - if (i_y * stride - padding < 0) - { - row_start = Im_in; - } else - { - row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; - } - /* setting the stopping row */ - if (i_y * stride - padding + dim_kernel >= dim_im_in) - { - row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; - } else - { - row_end = Im_in + (i_y * stride - padding + dim_kernel) - * dim_im_in * ch_im_in; - } - - /* copy over the first row */ - csi_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in); - count = 1; - - /* move over to next row */ - row_start += ch_im_in * dim_im_in; - - for (; row_start < row_end; row_start += dim_im_in * ch_im_in) - { - accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in); - count++; - } - buffer_scale_back_q15_to_q7(buffer, target, - dim_im_out * ch_im_in, count); - } - -#else - - int16_t i_ch_in, i_x, i_y; - int16_t k_x, k_y; - - for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) - { - for (i_y = 0; i_y < dim_im_out; i_y++) - { - for (i_x = 0; i_x < dim_im_out; i_x++) - { - int sum = 0; - int count = 0; - for (k_y = i_y * stride - padding; k_y < i_y * stride - padding - + dim_kernel; k_y++) - { - for (k_x = i_x * stride - padding; k_x < i_x * stride - - padding + dim_kernel; k_x++) - { - if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in - && k_x < dim_im_in) - { - sum += Im_in[i_ch_in + ch_im_in - * (k_x + k_y * dim_im_in)]; - count++; - } - } - } - Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = - sum / count; - } - } - } - -#endif /* CSI_MATH_DSP */ - -} - -/** - * @} end of Pooling group - */ diff --git 
a/source/i805_ref/pooling/shl_avepool_q7_HWC_nonsquare.c b/source/i805_ref/pooling/shl_avepool_q7_HWC_nonsquare.c new file mode 100644 index 00000000..1e8f62d3 --- /dev/null +++ b/source/i805_ref/pooling/shl_avepool_q7_HWC_nonsquare.c @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "i805_ref_function.h" + +void shl_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension + const uint16_t dim_im_in_y, // input image dimension + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension + const uint16_t dim_im_out_y, // output image dimension + q7_t *bufferA, // a buffer for local storage + q7_t *Im_out, // output feature + const uint16_t out_lshift) // output left shift (scaling) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) { + for (i_y = 0; i_y < dim_im_out_y; i_y++) { + for (i_x = 0; i_x < dim_im_out_x; i_x++) { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - 
padding_y; + k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) { + for (k_x = i_x * stride_x - padding_x; + k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + sum = __SSAT_8((sum << out_lshift) / count); + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum; + } + } + } +} diff --git a/source/i805_ref/pooling/shl_pool_q7_HWC.c b/source/i805_ref/pooling/shl_pool_q7_HWC.c new file mode 100644 index 00000000..aa11a150 --- /dev/null +++ b/source/i805_ref/pooling/shl_pool_q7_HWC.c @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Title: csi_pool_q7_HWC.c + * Description: Pooling function implementations + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 max pooling function + * @param[in, out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + * @details + * + * Buffer size: + * + * bufferA size: 0 + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. 
+ * + */ + +void shl_maxpool2d_q7_HWC(q7_t* Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, + const uint16_t dim_im_out, q7_t* bufferA, q7_t* Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) { + for (i_y = 0; i_y < dim_im_out; i_y++) { + for (i_x = 0; i_x < dim_im_out; i_x++) { + int max = -129; + for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; + k_y++) { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; + k_x++) { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) { + if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max) { + max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + } + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; + } + } + } +} + +/** + * @brief Q7 average pooling function + * @param[in,out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*dim_im_out*ch_im_in + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. 
+ * + */ + +void shl_avepool_q7_HWC(q7_t* Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, + const uint16_t dim_im_out, q7_t* bufferA, q7_t* Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) { + for (i_y = 0; i_y < dim_im_out; i_y++) { + for (i_x = 0; i_x < dim_im_out; i_x++) { + int sum = 0; + int count = 0; + for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; + k_y++) { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; + k_x++) { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + count++; + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; + } + } + } +} diff --git a/source/i805_ref/relu.c b/source/i805_ref/relu.c index 2b874cf6..145b19a6 100644 --- a/source/i805_ref/relu.c +++ b/source/i805_ref/relu.c @@ -16,29 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_relu_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_ref_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); - csi_relu_q7(input_data, size); + int size = csinn_tensor_size(input); + shl_relu_q7(input_data, size); output->data = input_data; return CSINN_TRUE; } -int csi_ref_i805_relu_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_ref_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); - csi_relu_q15(input_data, size); + int size = csinn_tensor_size(input); + shl_relu_q15(input_data, size); output->data = input_data; return CSINN_TRUE; } diff --git a/source/i805_ref/setup.c b/source/i805_ref/setup.c index 761281aa..6be70c5e 100644 --- a/source/i805_ref/setup.c +++ b/source/i805_ref/setup.c @@ -16,93 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "shl_ref_i805.h" -static void *setup_init_map() +static void *setup_cb_map() { - static void* init_map[CSINN_OP_AND_UTILS_SIZE][2]; - /* q7 dtype */ - init_map[CSINN_OP_AVGPOOL2D][0] = csi_ref_i805_avgpool2d_init_q7; - init_map[CSINN_OP_CONV2D][0] = csi_ref_i805_conv2d_init_q7; - init_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_ref_i805_depthwise_conv2d_init_q7; - init_map[CSINN_OP_MAXPOOL2D][0] = csi_ref_i805_maxpool2d_init_q7; - - /* q15 dtype */ - init_map[CSINN_OP_CONV2D][1] = csi_ref_i805_conv2d_init_q15; - - return init_map; -} - -static int get_init_map_index(int op, int dtype) -{ - switch (dtype) { - case CSINN_DTYPE_INT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; - } -} - -void *csi_init_map_ref_i805(int op, int dtype) -{ - void **init_map_table = setup_init_map(); - return init_map_table[get_init_map_index(op, dtype)]; -} - - -static void *setup_bc_map() -{ - static void* bc_map[CSINN_OP_AND_UTILS_SIZE][2]; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE][2]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE * 2); /* q7 dtype */ - bc_map[CSINN_OP_AVGPOOL2D][0] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_CONV2D][0] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_ref_depthwise_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][0] = csi_ref_i805_fullyconnected_q7; - bc_map[CSINN_OP_MAXPOOL2D][0] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_RELU][0] = csi_ref_i805_relu_q7; - bc_map[CSINN_OP_SIGMOID][0] = csi_ref_i805_sigmoid_q7; - bc_map[CSINN_OP_SOFTMAX][0] = csi_ref_i805_softmax_q7; - bc_map[CSINN_OP_TANH][0] = csi_ref_i805_tanh_q7; + cb_map[CSINN_OP_AVGPOOL2D][0].init = shl_i805_ref_avgpool2d_init_q7; + cb_map[CSINN_OP_CONV2D][0].init = shl_i805_ref_conv2d_init_q7; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][0].init = 
shl_i805_ref_depthwise_conv2d_init_q7; + cb_map[CSINN_OP_MAXPOOL2D][0].init = shl_i805_ref_maxpool2d_init_q7; + cb_map[CSINN_OP_FULLYCONNECTED][0].exec = shl_i805_ref_fullyconnected_q7; + cb_map[CSINN_OP_RELU][0].exec = shl_i805_ref_relu_q7; + cb_map[CSINN_OP_SIGMOID][0].exec = shl_i805_ref_sigmoid_q7; + cb_map[CSINN_OP_SOFTMAX][0].exec = shl_i805_ref_softmax_q7; + cb_map[CSINN_OP_TANH][0].exec = shl_i805_ref_tanh_q7; /* q15 dtype */ - bc_map[CSINN_OP_CONV2D][1] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][1] = csi_ref_i805_fullyconnected_q15; - bc_map[CSINN_OP_RELU][1] = csi_ref_i805_relu_q15; - bc_map[CSINN_OP_SIGMOID][1] = csi_ref_i805_sigmoid_q15; - bc_map[CSINN_OP_SOFTMAX][1] = csi_ref_i805_softmax_q15; - bc_map[CSINN_OP_TANH][1] = csi_ref_i805_tanh_q15; + cb_map[CSINN_OP_CONV2D][1].init = shl_i805_ref_conv2d_init_q15; + cb_map[CSINN_OP_FULLYCONNECTED][1].exec = shl_i805_ref_fullyconnected_q15; + cb_map[CSINN_OP_RELU][1].exec = shl_i805_ref_relu_q15; + cb_map[CSINN_OP_SIGMOID][1].exec = shl_i805_ref_sigmoid_q15; + cb_map[CSINN_OP_SOFTMAX][1].exec = shl_i805_ref_softmax_q15; + cb_map[CSINN_OP_TANH][1].exec = shl_i805_ref_tanh_q15; - return bc_map; + return cb_map; } -static int get_bc_map_index(int op, int dtype) +static int get_cb_map_index(int op, int dtype) { switch (dtype) { - case CSINN_DTYPE_INT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; + case CSINN_DTYPE_INT8: + return op * 2; + break; + case CSINN_DTYPE_INT16: + return op * 2 + 1; + break; + default: + return CSINN_UNSUPPORT_DTYPE; } } -void *csi_bc_map_ref_i805(int op, int dtype) +static struct csinn_callback *__cb_map_table_ref_i805; +struct csinn_callback *shl_cb_map_ref_i805(int op, int dtype) { - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + return 
&__cb_map_table_ref_i805[get_cb_map_index(op, dtype)]; } + +void shl_target_init_ref_i805() +{ + __cb_map_table_ref_i805 = setup_cb_map(); + shl_register_runtime_callback(CSINN_REF_I805, NULL); + shl_register_op_callback(CSINN_REF_I805, shl_cb_map_ref_i805); +} \ No newline at end of file diff --git a/source/i805_ref/sigmoid.c b/source/i805_ref/sigmoid.c index d434c8f6..d06584b3 100644 --- a/source/i805_ref/sigmoid.c +++ b/source/i805_ref/sigmoid.c @@ -16,37 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_sigmoid_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_i805_ref_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); - csi_nn_activations_direct_q7(input_data, size, int_width, 0); + int size = csinn_tensor_size(input); + shl_activations_direct_q7(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; } -int csi_ref_i805_sigmoid_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_i805_ref_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 
3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); - csi_nn_activations_direct_q15(input_data, size, int_width, 0); + int size = csinn_tensor_size(input); + shl_activations_direct_q15(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; } diff --git a/source/i805_ref/softmax.c b/source/i805_ref/softmax.c index 951690bd..4c02322e 100644 --- a/source/i805_ref/softmax.c +++ b/source/i805_ref/softmax.c @@ -16,29 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_softmax_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_i805_ref_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; - int size = csi_tensor_size(input); - csi_softmax_q7(input_data, size, output_data); + int size = csinn_tensor_size(input); + shl_softmax_q7(input_data, size, output_data); return CSINN_TRUE; } -int csi_ref_i805_softmax_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_i805_ref_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *output_data = (q15_t *)output->data; - int size = csi_tensor_size(input); - csi_softmax_q15(input_data, size, output_data); + int size = csinn_tensor_size(input); + shl_softmax_q15(input_data, size, output_data); return CSINN_TRUE; } diff --git a/source/i805_ref/softmax/csi_softmax_q15.c b/source/i805_ref/softmax/shl_softmax_q15.c similarity index 59% rename from source/i805_ref/softmax/csi_softmax_q15.c rename to source/i805_ref/softmax/shl_softmax_q15.c index c5379623..af55184d 100644 --- 
a/source/i805_ref/softmax/csi_softmax_q15.c +++ b/source/i805_ref/softmax/shl_softmax_q15.c @@ -17,12 +17,12 @@ */ /* ---------------------------------------------------------------------- - * Title: csi_softmax_q15.c + * Title: shl_softmax_q15.c * Description: Q15 softmax function * * -------------------------------------------------------------------- */ -#include "csi_nnfunctions.h" +#include "i805_ref_function.h" /** * @ingroup groupNN @@ -33,38 +33,35 @@ * @{ */ - /** - * @brief Q15 softmax function - * @param[in] vec_in pointer to input vector - * @param[in] dim_vec input vector dimention - * @param[out] p_out pointer to output vector - * @return none. - * - * @details - * - * Here, instead of typical e based softmax, we use - * 2-based softmax, i.e.,: - * - * y_i = 2^(x_i) / sum(2^x_j) - * - * The relative output will be different here. - * But mathematically, the gradient will be the same - * with a log(2) scaling factor. - * - */ +/** + * @brief Q15 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * @return none. + * + * @details + * + * Here, instead of typical e based softmax, we use + * 2-based softmax, i.e.,: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different here. + * But mathematically, the gradient will be the same + * with a log(2) scaling factor. 
+ * + */ -void csi_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, - q15_t * p_out) +void shl_softmax_q15(const q15_t* vec_in, const uint16_t dim_vec, q15_t* p_out) { - q31_t sum; - int16_t i; - uint8_t shift; - q31_t base; + q31_t sum; + int16_t i; + uint8_t shift; + q31_t base; base = -1 * 0x100000; - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { base = vec_in[i]; } } @@ -77,10 +74,8 @@ void csi_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, sum = 0; - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { shift = (uint8_t)__USAT(vec_in[i] - base, 5); sum += 0x1 << shift; } @@ -94,15 +89,12 @@ void csi_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, * so 32768 (0x1<<15) -> 100% confidence when sum = 0x1 << 16, output_base = 0x1 << 16 * and vec_in[i]-base = 16 */ - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { /* Here minimum value of 17+base-vec[i] will be 1 */ - shift = (uint8_t)__USAT(17+base-vec_in[i], 5); - p_out[i] = (q15_t) __SSAT((output_base >> shift), 16); - } else - { + shift = (uint8_t)__USAT(17 + base - vec_in[i], 5); + p_out[i] = (q15_t)__SSAT((output_base >> shift), 16); + } else { p_out[i] = 0; } } diff --git a/source/i805_ref/softmax/csi_softmax_q7.c b/source/i805_ref/softmax/shl_softmax_q7.c similarity index 57% rename from source/i805_ref/softmax/csi_softmax_q7.c rename to source/i805_ref/softmax/shl_softmax_q7.c index d9b41723..8c5bd8ee 100644 --- a/source/i805_ref/softmax/csi_softmax_q7.c +++ b/source/i805_ref/softmax/shl_softmax_q7.c @@ -17,12 +17,12 @@ */ /* ---------------------------------------------------------------------- - * Title: csi_softmax_q7.c + * Title: shl_softmax_q7.c * Description: Q7 softmax function * * -------------------------------------------------------------------- */ 
-#include "csi_nnfunctions.h" +#include "i805_ref_function.h" /** * @ingroup groupNN @@ -33,56 +33,52 @@ * @{ */ - /** - * @brief Q7 softmax function - * @param[in] vec_in pointer to input vector - * @param[in] dim_vec input vector dimention - * @param[out] p_out pointer to output vector - * @return none. - * - * @details - * - * Here, instead of typical natural logarithm e based softmax, we use - * 2-based softmax here, i.e.,: - * - * y_i = 2^(x_i) / sum(2^x_j) - * - * The relative output will be different here. - * But mathematically, the gradient will be the same - * with a log(2) scaling factor. - * - */ +/** + * @brief Q7 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * @return none. + * + * @details + * + * Here, instead of typical natural logarithm e based softmax, we use + * 2-based softmax here, i.e.,: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different here. + * But mathematically, the gradient will be the same + * with a log(2) scaling factor. + * + */ -void csi_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out) +void shl_softmax_q7(const q7_t* vec_in, const uint16_t dim_vec, q7_t* p_out) { - q31_t sum; - int16_t i; - uint8_t shift; - q15_t base; + q31_t sum; + int16_t i; + uint8_t shift; + q15_t base; base = -257; /* We first search for the maximum */ - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { base = vec_in[i]; } } - /* - * So the base is set to max-8, meaning - * that we ignore really small values. + /* + * So the base is set to max-8, meaning + * that we ignore really small values. * anyway, they will be 0 after shrinking to q7_t. 
*/ base = base - 8; sum = 0; - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { shift = (uint8_t)__USAT(vec_in[i] - base, 5); sum += 0x1 << shift; } @@ -96,18 +92,15 @@ void csi_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out) * so 128 (0x1<<7) -> 100% confidence when sum = 0x1 << 8, output_base = 0x1 << 12 * and vec_in[i]-base = 8 */ - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { /* Here minimum value of 13+base-vec_in[i] will be 5 */ - shift = (uint8_t)__USAT(13+base-vec_in[i], 5); - p_out[i] = (q7_t) __SSAT((output_base >> shift), 8); + shift = (uint8_t)__USAT(13 + base - vec_in[i], 5); + p_out[i] = (q7_t)__SSAT((output_base >> shift), 8); } else { p_out[i] = 0; } } - } /** diff --git a/source/i805_ref/tanh.c b/source/i805_ref/tanh.c index 65f56b2b..41b48cf8 100644 --- a/source/i805_ref/tanh.c +++ b/source/i805_ref/tanh.c @@ -16,37 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_tanh_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_i805_ref_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 
3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); - csi_nn_activations_direct_q7(input_data, size, int_width, 1); + int size = csinn_tensor_size(input); + shl_activations_direct_q7(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; } -int csi_ref_i805_tanh_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_i805_ref_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); - csi_nn_activations_direct_q15(input_data, size, int_width, 1); + int size = csinn_tensor_size(input); + shl_activations_direct_q15(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; } \ No newline at end of file diff --git a/source/nn2/abs.c b/source/nn2/abs.c index 82648425..69fc34cd 100644 --- a/source/nn2/abs.c +++ b/source/nn2/abs.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_abs_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_abs_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ABS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ABS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_abs(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_abs(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/acos.c b/source/nn2/acos.c index de390369..9b9faf56 100644 --- a/source/nn2/acos.c +++ b/source/nn2/acos.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_acos_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_acos_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ACOS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ACOS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_acos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_acos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/acosh.c b/source/nn2/acosh.c index 52b5d9d1..2f7d985d 100644 --- a/source/nn2/acosh.c +++ b/source/nn2/acosh.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_acosh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_acosh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ACOSH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ACOSH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_acosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_acosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/add.c b/source/nn2/add.c index 3c5a94f1..14784974 100644 --- a/source/nn2/add.c +++ b/source/nn2/add.c @@ -16,37 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_add_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_add_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_ADD, input0->dtype); - if (init_func != NULL) { - return init_func(input0, input1, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ADD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ADD, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_add(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_add(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/all.c b/source/nn2/all.c index f02a20de..4f0fb87c 100644 --- a/source/nn2/all.c +++ b/source/nn2/all.c @@ -16,24 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_all_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_all_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return CSINN_FALSE; + shl_op_callback_map(&params->base, CSINN_OP_ALL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); + } + return CSINN_TRUE; } -int csi_all(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_all(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/and.c b/source/nn2/and.c index c4bc2399..bae739f8 100644 --- a/source/nn2/and.c +++ b/source/nn2/and.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_and_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_and_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_AND, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_AND, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_and(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/any.c b/source/nn2/any.c index c94247d9..acd77cb7 100644 --- a/source/nn2/any.c +++ b/source/nn2/any.c @@ -16,24 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_any_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_any_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return CSINN_FALSE; + shl_op_callback_map(&params->base, CSINN_OP_ANY, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); + } + return CSINN_TRUE; } -int csi_any(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_any(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/arange.c b/source/nn2/arange.c index 9bd56831..ce7973c3 100644 --- a/source/nn2/arange.c +++ b/source/nn2/arange.c @@ -16,26 +16,28 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_arange_init(struct csi_tensor *output, - struct arange_params *params) +int csinn_arange_init(struct csinn_tensor *output, struct csinn_arange_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ARANGE, output->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ARANGE, output->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(output, params); } return CSINN_TRUE; } -int csi_arange(struct csi_tensor *output, - struct arange_params *params) +int csinn_arange(struct csinn_tensor *output, struct csinn_arange_params *params) { - CSI_DEBUG_CALL(csi_arange_debug_info(output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(output, params); + SHL_DEBUG_CALL(shl_arange_debug_info(output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/argmax.c b/source/nn2/argmax.c index b2ae3eb5..04f6aeec 100644 --- a/source/nn2/argmax.c +++ b/source/nn2/argmax.c @@ -16,32 +16,35 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_argmax_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_argmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { + void *cbf = NULL; if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ARGMAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ARGMAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + if (cb->init) { + cb->init(input, output, params); } } + return CSINN_TRUE; } -int csi_argmax(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/argmin.c b/source/nn2/argmin.c index 14deb401..3739e460 100644 --- a/source/nn2/argmin.c +++ b/source/nn2/argmin.c @@ -16,32 +16,34 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_argmin_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_argmin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { + void *cbf = NULL; if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ARGMIN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ARGMIN, input->dtype); + struct csinn_callback *cb = params->base.cb; + if (cb->init) { + cb->init(input, output, params); } } return CSINN_TRUE; } -int csi_argmin(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_argmin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/asin.c b/source/nn2/asin.c index 2c96a2a0..a89a1053 100644 --- a/source/nn2/asin.c +++ b/source/nn2/asin.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_asin_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_asin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ASIN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ASIN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_asin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_asin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/asinh.c b/source/nn2/asinh.c index 6faa97fa..b924c28e 100644 --- a/source/nn2/asinh.c +++ b/source/nn2/asinh.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_asinh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_asinh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ASINH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ASINH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_asinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_asinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/atan.c b/source/nn2/atan.c index 6350def2..90b616e0 100644 --- a/source/nn2/atan.c +++ b/source/nn2/atan.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_atan_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_atan_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ATAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ATAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_atan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_atan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/atanh.c b/source/nn2/atanh.c index 453874d5..a7736ef2 100644 --- a/source/nn2/atanh.c +++ b/source/nn2/atanh.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_atanh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_atanh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ATANH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ATANH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_atanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_atanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/averagepool.c b/source/nn2/averagepool.c index 96a07c2c..d4a9399f 100644 --- a/source/nn2/averagepool.c +++ b/source/nn2/averagepool.c @@ -16,37 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_avgpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_AVGPOOL2D, input->dtype); - if(init_func != NULL) { - return init_func(input, output, params); - } + shl_op_callback_map(&params->base, CSINN_OP_AVGPOOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_AVGPOOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } - return CSINN_TRUE; } -int csi_avgpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/averagepool3d.c b/source/nn2/averagepool3d.c index f07590cd..fe17321e 100644 --- a/source/nn2/averagepool3d.c +++ b/source/nn2/averagepool3d.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_avgpool3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_avgpool3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_AVGPOOL3D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_AVGPOOL3D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_avgpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if(params->base.bc !=NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/batch_normalization.c b/source/nn2/batch_normalization.c index 385d962a..ec6b579f 100644 --- a/source/nn2/batch_normalization.c +++ b/source/nn2/batch_normalization.c @@ -16,37 +16,34 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_batch_normalization_init(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params) +int csinn_batch_normalization_init(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_BN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_BN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, mean, variance, gamma, beta, output, params); } - return CSINN_TRUE; } -int csi_batch_normalization(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params) +int csinn_batch_normalization(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { - CSI_DEBUG_CALL(csi_bn_debug_info(input, mean, variance, gamma, beta, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, mean, variance, gamma, beta, output, params); + SHL_DEBUG_CALL(shl_bn_debug_info(input, mean, variance, gamma, beta, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, mean, variance, gamma, beta, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/batch_to_space.c
b/source/nn2/batch_to_space.c index 24c2a388..72b0219b 100644 --- a/source/nn2/batch_to_space.c +++ b/source/nn2/batch_to_space.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_batch_to_space_init(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params) +int csinn_batch_to_space_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_BATCH_TO_SPACE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_BATCH_TO_SPACE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_batch_to_space(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params) +int csinn_batch_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { - CSI_DEBUG_CALL(csi_batch_to_space_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_batch_to_space_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/batch_to_space_nd.c b/source/nn2/batch_to_space_nd.c index 4b199497..d6ce5203 100644 --- a/source/nn2/batch_to_space_nd.c +++ b/source/nn2/batch_to_space_nd.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_batch_to_space_nd_init(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_nd_params *params) +int csinn_batch_to_space_nd_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_BATCH_TO_SPACE_ND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_BATCH_TO_SPACE_ND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_batch_to_space_nd(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_nd_params *params) +int csinn_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params) { - CSI_DEBUG_CALL(csi_batch_to_space_nd_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_batch_to_space_nd_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/broadcast_to.c b/source/nn2/broadcast_to.c index 4bfc6315..2af8ab6c 100644 --- a/source/nn2/broadcast_to.c +++ b/source/nn2/broadcast_to.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_broadcast_to_init(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int csinn_broadcast_to_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_BROADCOST, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_BROADCOST, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_broadcast_to(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int csinn_broadcast_to(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - CSI_DEBUG_CALL(csi_broadcast_to_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_broadcast_to_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cache_conv1d.c b/source/nn2/cache_conv1d.c index 90608de5..e3788894 100644 --- a/source/nn2/cache_conv1d.c +++ b/source/nn2/cache_conv1d.c @@ -16,28 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int csinn_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { - params->base.bc = - csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CACHE_CONV1D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_CACHE_CONV1D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cache_conv1d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *weight, - struct csi_tensor *bias, struct cache_conv1d_params *params) +int csinn_cache_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { - CSI_DEBUG_CALL(csi_cache_conv1d_debug_info(input, output, weight, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, weight, bias, params); + SHL_DEBUG_CALL(shl_cache_conv1d_debug_info(input, output, weight, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, weight, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cache_matmul.c b/source/nn2/cache_matmul.c index 7648b0d6..e90a62af 100644 --- a/source/nn2/cache_matmul.c +++ b/source/nn2/cache_matmul.c @@ -16,28 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) +int csinn_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { - params->base.bc = - csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CACHE_MATMUL, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_CACHE_MATMUL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cache_matmul(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *weight, - struct csi_tensor *bias, struct cache_matmul_params *params) +int csinn_cache_matmul(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { - CSI_DEBUG_CALL(csi_cache_matmul_debug_info(input, output, weight, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, weight, bias, params); + SHL_DEBUG_CALL(shl_cache_matmul_debug_info(input, output, weight, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, weight, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/ceil.c b/source/nn2/ceil.c index bd792a37..d4608e57 100644 --- a/source/nn2/ceil.c +++ b/source/nn2/ceil.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_ceil_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_ceil_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CEIL, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_CEIL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_ceil(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_ceil(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/clip.c b/source/nn2/clip.c index 5dbe4e56..ea310a17 100644 --- a/source/nn2/clip.c +++ b/source/nn2/clip.c @@ -16,35 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_clip_init(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int csinn_clip_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_CLIP, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CLIP, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_CLIP, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_clip(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int csinn_clip(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { - CSI_DEBUG_CALL(csi_clip_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_clip_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/col2im.c b/source/nn2/col2im.c index a979943d..b65e7dfe 100644 --- a/source/nn2/col2im.c +++ b/source/nn2/col2im.c @@ -16,31 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_col2im_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct col2im_params *params) +int csinn_col2im_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_COL2IM, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_COL2IM, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, params); } - return CSINN_TRUE; } -int csi_col2im(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct col2im_params *params) +int csinn_col2im(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params) { - CSI_DEBUG_CALL(csi_col2im_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, kernel, params); + SHL_DEBUG_CALL(shl_col2im_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/concat.c b/source/nn2/concat.c index 31bdaca4..8d4ae690 100644 --- a/source/nn2/concat.c +++ b/source/nn2/concat.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_concat_init(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int csinn_concat_init(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONCAT, output->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_CONCAT, output->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_concat(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int csinn_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { - CSI_DEBUG_CALL(csi_concat_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_concat_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/convolution.c b/source/nn2/convolution.c index 5ad95e99..0ddd5660 100644 --- a/source/nn2/convolution.c +++ b/source/nn2/convolution.c @@ -16,85 +16,58 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - if (params->base.layout == CSINN_LAYOUT_NCHW) { - if (params->group == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_CONV2D, input->dtype); - } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); - } else { - init_func = csi_init_map(params->base.api, CSINN_OP_GROUP_CONV2D, input->dtype); - } - } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - if (params->group == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_CONV2D, input->dtype); - } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); - } else { - init_func = csi_init_map(params->base.api, CSINN_OP_GROUP_CONV2D, input->dtype); - } - } else { - init_func = NULL; - } - if (init_func != NULL) { - return init_func(input, output, kernel, bias, params); - } - } - if (params->base.layout == CSINN_LAYOUT_NCHW) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D, input->dtype); - } else if (params->group == input->dim[1]) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_CONV2D, input->dtype); + } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { + shl_op_callback_map(&params->base,
CSINN_OP_DEPTHWISE_CONV2D, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_GROUP_CONV2D, input->dtype); } } else if (params->base.layout == CSINN_LAYOUT_NHWC) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D, input->dtype); - } else if (params->group == input->dim[3]) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_CONV2D, input->dtype); + } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { + shl_op_callback_map(&params->base, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_GROUP_CONV2D, input->dtype); } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } - } - else { + } else { return CSINN_UNSUPPORT_LAYOUT; } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_conv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - CSI_DEBUG_CALL(csi_conv2d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) { - if (params->conv_extra.kernel_tm != NULL && params->conv_extra.conv_mode == CSINN_WINOGRAD) { - params->base.bc(input, output, params->conv_extra.kernel_tm, bias, params); -
csi_mem_free(params->conv_extra.kernel_tm->data); - csi_free_tensor(params->conv_extra.kernel_tm); + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + struct csinn_callback *cb = params->base.cb; + if ((cb->exec == func) && (params->conv_extra.kernel_tm != NULL && + params->conv_extra.conv_mode == CSINN_WINOGRAD)) { + cb->exec(input, output, params->conv_extra.kernel_tm, bias, params); + shl_mem_free(params->conv_extra.kernel_tm->data); + csinn_free_tensor(params->conv_extra.kernel_tm); } else { - params->base.bc(input, output, kernel, bias, params); + func(input, output, kernel, bias, params); } } else { return CSINN_CALLBACK_UNSET; diff --git a/source/nn2/convolution1d.c b/source/nn2/convolution1d.c index d1de4cbb..2949e805 100644 --- a/source/nn2/convolution1d.c +++ b/source/nn2/convolution1d.c @@ -16,37 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv1d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv1d_params *params) +int csinn_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV1D, input->dtype); - if (params->base.bc == NULL) - { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_CONV1D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } return CSINN_TRUE; } -int csi_conv1d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv1d_params *params) +int
csinn_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { - CSI_DEBUG_CALL(csi_conv1d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) - { - params->base.bc(input, output, kernel, bias, params); - } - else - { + SHL_DEBUG_CALL(shl_conv1d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; diff --git a/source/nn2/convolution3d.c b/source/nn2/convolution3d.c index 09ffaa92..a8f98726 100644 --- a/source/nn2/convolution3d.c +++ b/source/nn2/convolution3d.c @@ -16,36 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csinn_conv3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { if (input->layout == CSINN_LAYOUT_NCDHW) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV3D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_CONV3D, input->dtype); } else { return CSINN_UNSUPPORT_LAYOUT; } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_conv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csinn_conv3d(struct csinn_tensor
*input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - CSI_DEBUG_CALL(csi_conv3d_debug_info(input, output, kernel, bias, params, __func__)); - if(params->base.bc != NULL) { - params->base.bc(input, output, kernel, bias, params); + SHL_DEBUG_CALL(shl_conv3d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/convolution_relu.c b/source/nn2/convolution_relu.c index 672e25cd..79bea988 100644 --- a/source/nn2/convolution_relu.c +++ b/source/nn2/convolution_relu.c @@ -17,80 +17,50 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv2d_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - if (params->base.layout == CSINN_LAYOUT_NCHW) { - if (params->group == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_CONV2D_RELU, input->dtype); - } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); - } else { - init_func = csi_init_map(params->base.api, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); - } - } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - if (params->group == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_CONV2D_RELU, input->dtype); - } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { - init_func =
csi_init_map(params->base.api, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); - } else { - init_func = csi_init_map(params->base.api, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); - } - } else { - init_func = NULL; - } - if (init_func != NULL) { - return init_func(input, output, kernel, bias, params); - } - } - - if (params->base.layout == CSINN_LAYOUT_NCHW) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D_RELU, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_CONV2D_RELU, input->dtype); } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); } } else if (params->base.layout == CSINN_LAYOUT_NHWC) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D_RELU, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_CONV2D_RELU, input->dtype); } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); } } else { return CSINN_UNSUPPORT_LAYOUT; } + struct csinn_callback *cb = params->base.cb; + int
(*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_conv2d_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - CSI_DEBUG_CALL(csi_conv2d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, kernel, bias, params); + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/convolution_relu6.c b/source/nn2/convolution_relu6.c index d6efec99..7f2d781f 100644 --- a/source/nn2/convolution_relu6.c +++ b/source/nn2/convolution_relu6.c @@ -16,53 +16,50 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv2d_relu6_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_relu6_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D_RELU6, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_CONV2D_RELU6, input->dtype); } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D_RELU6, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_DEPTHWISE_CONV2D_RELU6, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D_RELU6, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_GROUP_CONV2D_RELU6, input->dtype); } } else if (params->base.layout == CSINN_LAYOUT_NHWC) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D_RELU6, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_CONV2D_RELU6, input->dtype); } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D_RELU6, input->dtype); + shl_op_callback_map(&params->base, CSINN_OP_DEPTHWISE_CONV2D_RELU6, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D_RELU6, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; +
shl_op_callback_map(&params->base, CSINN_OP_GROUP_CONV2D_RELU6, input->dtype); } } else { return CSINN_UNSUPPORT_LAYOUT; } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_conv2d_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - CSI_DEBUG_CALL(csi_conv2d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, kernel, bias, params); + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cos.c b/source/nn2/cos.c index 3c788dd0..ba4af7b0 100644 --- a/source/nn2/cos.c +++ b/source/nn2/cos.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cos_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_cos_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_COS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_COS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_cos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cosh.c b/source/nn2/cosh.c index a788393c..aa40f6ec 100644 --- a/source/nn2/cosh.c +++ b/source/nn2/cosh.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cosh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_cosh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_COSH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_COSH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_cosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/crop.c b/source/nn2/crop.c index 246f19c1..f21fd7e6 100644 --- a/source/nn2/crop.c +++ b/source/nn2/crop.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_crop_init(struct csi_tensor *input, - struct csi_tensor *output, - struct crop_params *params) +int csinn_crop_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CROP, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_CROP, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_crop(struct csi_tensor *input, - struct csi_tensor *output, - struct crop_params *params) +int csinn_crop(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params) { - CSI_DEBUG_CALL(csi_crop_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_crop_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cumprod.c b/source/nn2/cumprod.c index ed971d75..3278060b 100644 --- a/source/nn2/cumprod.c +++ b/source/nn2/cumprod.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cumprod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params) +int csinn_cumprod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CUMPROD, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_CUMPROD, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cumprod(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params) +int csinn_cumprod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { - CSI_DEBUG_CALL(csi_cumprod_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_cumprod_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cumsum.c b/source/nn2/cumsum.c index 820522cf..ff869298 100644 --- a/source/nn2/cumsum.c +++ b/source/nn2/cumsum.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cumsum_init(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params) +int csinn_cumsum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CUMSUM, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_CUMSUM, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cumsum(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params) +int csinn_cumsum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { - CSI_DEBUG_CALL(csi_cumsum_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_cumsum_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/data_convert.c b/source/nn2/data_convert.c index e4043122..a9f8ffba 100644 --- a/source/nn2/data_convert.c +++ b/source/nn2/data_convert.c @@ -19,24 +19,27 @@ /* CSI-NN2 version 1.11.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_data_convert_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int csinn_data_convert_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = - csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DATA_CONVERT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; +
shl_op_callback_map(&params->base, CSINN_OP_DATA_CONVERT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_data_convert(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int csinn_data_convert(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/deconvolution.c b/source/nn2/deconvolution.c index 74c4223d..002413b3 100644 --- a/source/nn2/deconvolution.c +++ b/source/nn2/deconvolution.c @@ -16,42 +16,39 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_deconv2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_deconv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DECONV2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } - } else if ( (params->group == output->dim[1] && params->base.layout == CSINN_LAYOUT_NCHW) || - (params->group == output->dim[3] && params->base.layout == CSINN_LAYOUT_NHWC) ) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_DECONV2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_DECONV2D, input->dtype); + } else if ((params->group == output->dim[1] && params->base.layout == CSINN_LAYOUT_NCHW) || + (params->group == output->dim[3] && params->base.layout == CSINN_LAYOUT_NHWC)) { + shl_op_callback_map(&params->base, CSINN_OP_DEPTHWISE_DECONV2D, input->dtype); } else { return CSINN_FALSE; } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_deconv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - CSI_DEBUG_CALL(csi_conv2d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc
!= NULL) { - params->base.bc(input, output, kernel, bias, params); + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/deconvolution3d.c b/source/nn2/deconvolution3d.c index def29799..ee42a551 100644 --- a/source/nn2/deconvolution3d.c +++ b/source/nn2/deconvolution3d.c @@ -16,34 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_deconv3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csinn_deconv3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { if (input->layout == CSINN_LAYOUT_NCDHW) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DECONV3D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_DECONV3D, input->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } return CSINN_TRUE; } -int csi_deconv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csinn_deconv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - CSI_DEBUG_CALL(csi_conv3d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, kernel, bias, params); +
SHL_DEBUG_CALL(shl_conv3d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/depth_to_space.c b/source/nn2/depth_to_space.c index 63c831fb..ce6cac07 100644 --- a/source/nn2/depth_to_space.c +++ b/source/nn2/depth_to_space.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_depth_to_space_init(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params) +int csinn_depth_to_space_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTH_TO_SPACE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_DEPTH_TO_SPACE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_depth_to_space(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params) +int csinn_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - CSI_DEBUG_CALL(csi_depth_to_space_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_depth_to_space_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/depthwise_conv2d.c b/source/nn2/depthwise_conv2d.c new file mode 100644 index 00000000..0a1cf6bf ---
/dev/null +++ b/source/nn2/depthwise_conv2d.c @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "csi_nn.h" +#include "shl_utils.h" + +int csinn_depthwise_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + shl_op_callback_map(&params->base, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } + return CSINN_TRUE; +} + +int csinn_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} diff --git a/source/nn2/depthwise_conv2d_relu.c b/source/nn2/depthwise_conv2d_relu.c new file mode 100644 index 00000000..d711af3d --- /dev/null +++ b/source/nn2/depthwise_conv2d_relu.c @@ -0,0 +1,51 @@ + +/* + * Copyright (C) 2016-2022 T-Head Semiconductor
Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "csi_nn.h" +#include "shl_utils.h" + +int csinn_depthwise_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + shl_op_callback_map(&params->base, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } + return CSINN_TRUE; +} + +int csinn_depthwise_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} diff --git a/source/nn2/div.c b/source/nn2/div.c index 3ca7ab73..28090edf 100644 --- a/source/nn2/div.c +++ b/source/nn2/div.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_div_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_div_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DIV, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_DIV, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_div(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_div(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/elu.c b/source/nn2/elu.c index 51698d45..f8b47de7 100644 --- a/source/nn2/elu.c +++ b/source/nn2/elu.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_elu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_elu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_elu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_elu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/equal.c b/source/nn2/equal.c index ea039284..ca542e80 100644 --- a/source/nn2/equal.c +++ b/source/nn2/equal.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_EQUANL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_EQUANL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/erf.c b/source/nn2/erf.c index 47e9b638..091efaf2 100644 --- a/source/nn2/erf.c +++ b/source/nn2/erf.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_erf_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_erf_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ERF, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ERF, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_erf(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_erf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/exp.c b/source/nn2/exp.c index 07ba2aac..00962887 100644 --- a/source/nn2/exp.c +++ b/source/nn2/exp.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_exp_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_exp_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_EXP, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_EXP, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_exp(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_exp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/expand_dims.c b/source/nn2/expand_dims.c index a6b17d92..c4cc0283 100644 --- a/source/nn2/expand_dims.c +++ b/source/nn2/expand_dims.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_expand_dims_init(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params) +int csinn_expand_dims_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_EXPAND_DIMS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_EXPAND_DIMS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_expand_dims(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params) +int csinn_expand_dims(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { - CSI_DEBUG_CALL(csi_expand_dims_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_expand_dims_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/expm1.c b/source/nn2/expm1.c index 8080ea98..ef9692f5 100644 --- a/source/nn2/expm1.c +++ b/source/nn2/expm1.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_expm1_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_expm1_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_EXPM1, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_EXPM1, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_expm1(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_expm1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/flatten.c b/source/nn2/flatten.c index a668049b..22d729ec 100644 --- a/source/nn2/flatten.c +++ b/source/nn2/flatten.c @@ -16,37 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_flatten_init(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params) +int csinn_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { - - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_FLATTEN, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FLATTEN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_FLATTEN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_flatten(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params) +int csinn_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { - CSI_DEBUG_CALL(csi_flatten_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_flatten_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/floor.c b/source/nn2/floor.c index 44fbaf88..543cf50d 100644 --- a/source/nn2/floor.c +++ b/source/nn2/floor.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_floor_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_floor_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FLOOR, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_FLOOR, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_floor(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_floor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/floor_divide.c b/source/nn2/floor_divide.c index 4b7d01f0..26f02c4c 100644 --- a/source/nn2/floor_divide.c +++ b/source/nn2/floor_divide.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_floor_divide_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_floor_divide_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FLOOR_DIVIDE, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_FLOOR_DIVIDE, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_floor_divide(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_floor_divide(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/floor_mod.c b/source/nn2/floor_mod.c index 4bab78a6..b8a12c3f 100644 --- a/source/nn2/floor_mod.c +++ b/source/nn2/floor_mod.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_floor_mod_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_floor_mod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FLOOR_MOD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_FLOOR_MOD, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_floor_mod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_floor_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/format.c b/source/nn2/format.c new file mode 100644 index 00000000..5c2ec409 --- /dev/null +++ b/source/nn2/format.c @@ -0,0 +1,231 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "csi_nn.h" +#include "shl_utils.h" + +char *shl_bm_header_str() +{ + static char ret_str[4096] = + "Heterogeneous Honey Badger binary model\n\nbinary model version 1.0\n\nHHB_VERSION "; + csinn_version(ret_str + 79); + return ret_str; +} + +void shl_dump_bm_header(FILE *f) +{ + char *header = shl_bm_header_str(); + fwrite(header, 1, 4096, f); +} + +void shl_dump_bm_section_info(FILE *f, struct shl_binary_model_section_info *info) +{ + if (info->section_info_size == 0) { + info->section_info_size = 4096; + } + fwrite(info, 1, info->section_info_size, f); +} + +static inline int32_t read_offset(void *ptr) +{ + /* when 64bit, get 32bit too */ + int32_t ret = *(int32_t *)&ptr; + return ret; +} + +static inline char *offset_to_ptr(int offset) +{ + char *ret; + *(int *)(&ret) = offset; + return ret; +} + +static char *tensor_dump(struct csinn_tensor *tensor, int *size) +{ + int tensor_size = sizeof(struct csinn_tensor); + size_t name_size = strlen(tensor->name); + tensor_size += name_size; + int qinfo_size = tensor->quant_channel * sizeof(struct csinn_quant_info); + tensor_size += qinfo_size; + + struct csinn_tensor *ret = shl_mem_alloc(tensor_size); + /* ignore data */ + ret->data = 0; + /* ignore sess */ + ret->sess = 0; + char *append_ptr = (char *)ret + sizeof(struct csinn_tensor); + memcpy(append_ptr, tensor->name, name_size); + /* offset from base */ + ret->name = (char *)(append_ptr - (char *)ret); + append_ptr += name_size; + memcpy(append_ptr, tensor->qinfo, qinfo_size); + ret->qinfo = (struct 
csinn_quant_info *)(append_ptr - (char *)ret); + + ret->dtype = tensor->dtype; + ret->mtype = tensor->mtype; + ret->dim_count = tensor->dim_count; + memcpy(ret->dim, tensor->dim, MAX_DIM * 4); + ret->is_const = tensor->is_const; + ret->layout = tensor->layout; + ret->quant_channel = tensor->quant_channel; + + *size = tensor_size; + return (char *)ret; +} + +static void tensor_load(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + dest->data = src->data; + dest->dtype = src->dtype; + dest->mtype = src->mtype; + memcpy(dest->dim, src->dim, MAX_DIM * 4); + dest->dim_count = src->dim_count; + dest->name = read_offset(src->name) + (char *)src; + dest->layout = src->layout; + if (src->quant_channel != dest->quant_channel && src->quant_channel != 0) { + csinn_realloc_quant_info(dest, src->quant_channel); + } + dest->is_const = src->is_const; + char *src_qinfo = (char *)src + read_offset(src->qinfo); + memcpy(dest->qinfo, src_qinfo, sizeof(struct csinn_quant_info) * src->quant_channel); +} + +static char *session_dump(struct csinn_session *sess, int *size) +{ + int sess_size = sizeof(struct csinn_session); + + char *input_buf[sess->input_num]; + int input_size[sess->input_num]; + char *output_buf[sess->output_num]; + int output_size[sess->output_num]; + + for (int i = 0; i < sess->input_num; i++) { + input_buf[i] = tensor_dump(sess->input[i], &input_size[i]); + sess_size += input_size[i]; + } + + for (int i = 0; i < sess->output_num; i++) { + output_buf[i] = tensor_dump(sess->output[i], &output_size[i]); + sess_size += output_size[i]; + } + + sess_size += sizeof(struct csinn_tensor *) * (sess->input_num + sess->output_num); + + struct csinn_session *ret = shl_mem_alloc(sess_size); + ret->input = shl_mem_alloc(sizeof(struct csinn_tensor *) * sess->input_num); + ret->output = shl_mem_alloc(sizeof(struct csinn_tensor *) * sess->output_num); + + char *append_ptr = (char *)ret + sizeof(struct csinn_session); + int input_offset = append_ptr - (char *)ret; + append_ptr += 
sizeof(char *) * sess->input_num; + for (int i = 0; i < sess->input_num; i++) { + memcpy(append_ptr, input_buf[i], input_size[i]); + ret->input[i] = (struct csinn_tensor *)(append_ptr - (char *)ret); + append_ptr += input_size[i]; + shl_mem_free(input_buf[i]); + } + memcpy(input_offset + (char *)ret, ret->input, sizeof(char *) * sess->input_num); + + int output_offset = append_ptr - (char *)ret; + append_ptr += sizeof(char *) * sess->output_num; + for (int i = 0; i < sess->output_num; i++) { + memcpy(append_ptr, output_buf[i], output_size[i]); + ret->output[i] = (struct csinn_tensor *)(append_ptr - (char *)ret); + append_ptr += output_size[i]; + shl_mem_free(output_buf[i]); + } + memcpy(output_offset + (char *)ret, ret->output, sizeof(char *) * sess->output_num); + + ret->base_dtype = sess->base_dtype; + ret->base_layout = sess->base_layout; + ret->base_api = sess->base_api; + ret->base_run_mode = sess->base_run_mode; + ret->base_quant_type = sess->base_quant_type; + ret->model.bm_addr = sess->model.bm_addr; + ret->model.bm_path = sess->model.bm_path; + ret->model.bm_size = sess->model.bm_size; + ret->model.priority = sess->model.priority; + ret->model.save_mode = sess->model.save_mode; + ret->debug_level = sess->debug_level; + ret->profiler_level = sess->profiler_level; + ret->input_num = sess->input_num; + ret->output_num = sess->output_num; + ret->input = (struct csinn_tensor **)offset_to_ptr(input_offset); + ret->output = (struct csinn_tensor **)offset_to_ptr(output_offset); + + /* TODO: dump target data */ + + *size = sess_size; + return (char *)ret; +} + +void shl_bm_session_load(struct csinn_session *dest, struct csinn_session *src) +{ + dest->base_quant_type = src->base_quant_type; + dest->model.priority = src->model.priority; + dest->base_api = src->base_api; + dest->base_dtype = src->base_dtype; + dest->debug_level = src->debug_level; + csinn_session_init(dest); + csinn_set_input_number(src->input_num, dest); + csinn_set_output_number(src->output_num, 
dest); + + src->input = (struct csinn_tensor **)((char *)src + read_offset(src->input)); + for (int i = 0; i < src->input_num; i++) { + dest->input[i] = csinn_alloc_tensor(dest); + struct csinn_tensor *src_input = + (struct csinn_tensor *)((char *)src + read_offset(src->input[i])); + tensor_load(dest->input[i], src_input); + csinn_set_tensor_entry(dest->input[i], dest); + csinn_set_input(i, dest->input[i], dest); + } + + src->output = (struct csinn_tensor **)((char *)src + read_offset(src->output)); + for (int i = 0; i < src->output_num; i++) { + dest->output[i] = csinn_alloc_tensor(dest); + struct csinn_tensor *src_output = + (struct csinn_tensor *)((char *)src + read_offset(src->output[i])); + tensor_load(dest->output[i], src_output); + csinn_set_tensor_entry(dest->output[i], dest); + csinn_set_output(i, dest->output[i], dest); + } +} + +void shl_dump_bm_graph_info_section(FILE *f, struct csinn_session *sess) +{ + int size = 0; + char *buf = session_dump(sess, &size); + fwrite(buf, 1, size, f); + shl_mem_free(buf); +} + +struct csinn_session *__attribute__((weak)) csinn_import_binary_model(char *bm_addr) +{ + struct shl_binary_model_section_info *sinfo = + (struct shl_binary_model_section_info *)(bm_addr + 4096); + struct csinn_session *bm_sess = + (struct csinn_session *)(bm_addr + sinfo->sections->info_offset * 4096); + struct csinn_session *sess = csinn_alloc_session(); + shl_bm_session_load(sess, bm_sess); + sess->model.bm_addr = bm_addr + sinfo->sections->graph_offset * 4096; + sess->model.bm_size = sinfo->sections->graph_size; + csinn_load_binary_model(sess); + return sess; +} diff --git a/source/nn2/fsmn.c b/source/nn2/fsmn.c index 46583837..d42ce96b 100644 --- a/source/nn2/fsmn.c +++ b/source/nn2/fsmn.c @@ -16,36 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_fsmn_init(struct csi_tensor *frame, - struct csi_tensor *l_filter, - struct csi_tensor *r_filter, - struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, - struct csi_tensor *output, - struct fsmn_params *params) +int csinn_fsmn_init(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FSMN, frame->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_FSMN, frame->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(frame, l_filter, r_filter, frame_sequence, frame_counter, output, params); } return CSINN_TRUE; } -int csi_fsmn(struct csi_tensor *frame, - struct csi_tensor *l_filter, - struct csi_tensor *r_filter, - struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, - struct csi_tensor *output, - struct fsmn_params *params) +int csinn_fsmn(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { - CSI_DEBUG_CALL(csi_fsmn_debug_info(frame, l_filter, r_filter, frame_sequence, frame_counter, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(frame, l_filter, r_filter, frame_sequence, frame_counter, output, params); + SHL_DEBUG_CALL(shl_fsmn_debug_info(frame, l_filter, r_filter, frame_sequence, frame_counter, + output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(frame, l_filter, 
r_filter, frame_sequence, frame_counter, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/fullyconnected.c b/source/nn2/fullyconnected.c index 64c5e370..d742b468 100644 --- a/source/nn2/fullyconnected.c +++ b/source/nn2/fullyconnected.c @@ -16,39 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_fullyconnected_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int csinn_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_FULLYCONNECTED, input->dtype); - if (init_func != NULL) { - return init_func(input, output, weights, bias, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FULLYCONNECTED, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_FULLYCONNECTED, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, weights, bias, params); } return CSINN_TRUE; } -int csi_fullyconnected(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int csinn_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - CSI_DEBUG_CALL(csi_fullyconnected_debug_info(input, output, weights, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, weights, bias, params); + 
SHL_DEBUG_CALL(shl_fullyconnected_debug_info(input, output, weights, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, weights, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/gather.c b/source/nn2/gather.c index 7e62edee..be8bd22a 100644 --- a/source/nn2/gather.c +++ b/source/nn2/gather.c @@ -16,33 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_gather_init(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params) +int csinn_gather_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GATHER, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GATHER, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, indices, output, params); } return CSINN_TRUE; } -int csi_gather(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params) +int csinn_gather(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { - CSI_DEBUG_CALL(csi_gather_debug_info(input, indices, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, indices, output, params); + SHL_DEBUG_CALL(shl_gather_debug_info(input, indices, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, indices, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/gather_nd.c 
b/source/nn2/gather_nd.c index b8641413..1158ec47 100644 --- a/source/nn2/gather_nd.c +++ b/source/nn2/gather_nd.c @@ -16,33 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_gather_nd_init(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params) +int csinn_gather_nd_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GATHER_ND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GATHER_ND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, indices, output, params); } return CSINN_TRUE; } -int csi_gather_nd(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params) +int csinn_gather_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { - CSI_DEBUG_CALL(csi_gather_nd_debug_info(input, indices, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, indices, output, params); + SHL_DEBUG_CALL(shl_gather_nd_debug_info(input, indices, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, indices, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/global_averagepool.c b/source/nn2/global_averagepool.c index ffbcbef9..276ad9b5 100644 --- a/source/nn2/global_averagepool.c +++ b/source/nn2/global_averagepool.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_global_avgpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_global_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GLOBAL_AVGPOOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GLOBAL_AVGPOOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_global_avgpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_global_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/global_maxpool.c b/source/nn2/global_maxpool.c index 53d62354..58a632ae 100644 --- a/source/nn2/global_maxpool.c +++ b/source/nn2/global_maxpool.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_global_maxpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_global_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GLOBAL_MAXPOOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GLOBAL_MAXPOOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_global_maxpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_global_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/greater.c b/source/nn2/greater.c index cada7e57..6643dd33 100644 --- a/source/nn2/greater.c +++ b/source/nn2/greater.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_greater_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_greater_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GREATHER_EQUAL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GREATHER_EQUAL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_greater(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_greater(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/greater_equal.c b/source/nn2/greater_equal.c index c1e1a794..388e9d35 100644 --- a/source/nn2/greater_equal.c +++ b/source/nn2/greater_equal.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_greater_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_greater_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GREATHER_EQUAL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GREATHER_EQUAL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_greater_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_greater_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/group_conv2d.c b/source/nn2/group_conv2d.c new file mode 100644 index 00000000..c01e13ba --- /dev/null +++ b/source/nn2/group_conv2d.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "csi_nn.h" +#include "shl_utils.h" + +int csinn_group_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + shl_op_callback_map(¶ms->base, CSINN_OP_GROUP_CONV2D, input->dtype); + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } + return CSINN_TRUE; +} + +int csinn_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} diff --git a/source/nn2/hard_sigmoid.c b/source/nn2/hard_sigmoid.c index 4fee974d..6d272b58 100644 --- a/source/nn2/hard_sigmoid.c +++ b/source/nn2/hard_sigmoid.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_hard_sigmoid_init(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csinn_hard_sigmoid_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_HARD_SIGMOID, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_HARD_SIGMOID, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_hard_sigmoid(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csinn_hard_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - CSI_DEBUG_CALL(csi_sigmoid_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_sigmoid_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/im2col.c b/source/nn2/im2col.c index d2e50aa1..b811eeb6 100644 --- a/source/nn2/im2col.c +++ b/source/nn2/im2col.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_im2col_init(struct csi_tensor *input, - struct csi_tensor *output, - struct im2col_params *params) +int csinn_im2col_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_IM2COL, input->dtype); - if(params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_IM2COL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_im2col(struct csi_tensor *input, - struct csi_tensor *output, - struct im2col_params *params) +int csinn_im2col(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { - CSI_DEBUG_CALL(csi_im2col_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_im2col_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/isnan.c b/source/nn2/isnan.c index 7d077ef3..141a548b 100644 --- a/source/nn2/isnan.c +++ b/source/nn2/isnan.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_isnan_bool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_isnan_bool_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ISNAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ISNAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_isnan_bool(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_isnan_bool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/l2_normalization.c b/source/nn2/l2_normalization.c index 9cb4d49b..95f2a882 100644 --- a/source/nn2/l2_normalization.c +++ b/source/nn2/l2_normalization.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_l2_normalization_init(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) +int csinn_l2_normalization_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_L2N, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_L2N, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_l2_normalization(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) +int csinn_l2_normalization(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { - CSI_DEBUG_CALL(csi_l2n_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_l2n_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/l2pool.c b/source/nn2/l2pool.c index f0db727f..e1a889f3 100644 --- a/source/nn2/l2pool.c +++ b/source/nn2/l2pool.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_l2pool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_l2pool_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_L2POOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_L2POOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_l2pool(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_l2pool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/layer_norm.c b/source/nn2/layer_norm.c index a7b2c37a..a6353e89 100644 --- a/source/nn2/layer_norm.c +++ b/source/nn2/layer_norm.c @@ -16,37 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_layer_norm_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct layer_norm_params *params) +int csinn_layer_norm_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LAYER_NORM, input->dtype); - if (params->base.bc == NULL) - { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LAYER_NORM, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, gamma, beta, params); } return CSINN_TRUE; } -int csi_layer_norm(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct layer_norm_params *params) +int csinn_layer_norm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { - CSI_DEBUG_CALL(csi_layer_norm_debug_info(input, output, gamma, beta, params, __func__)); - if (params->base.bc != NULL) - { - params->base.bc(input, output, gamma, beta, params); - } - else - { + SHL_DEBUG_CALL(shl_layer_norm_debug_info(input, output, gamma, beta, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, gamma, beta, params); + } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; diff --git a/source/nn2/leaky_relu.c b/source/nn2/leaky_relu.c index 689d4846..03f56647 100644 --- a/source/nn2/leaky_relu.c +++ b/source/nn2/leaky_relu.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_leaky_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_leaky_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LEAKY_RELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LEAKY_RELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_leaky_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/less.c b/source/nn2/less.c index b1ccef84..a8dda6c8 100644 --- a/source/nn2/less.c +++ b/source/nn2/less.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_less_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_less_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LESS, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LESS, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_less(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_less(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/less_equal.c b/source/nn2/less_equal.c index 9c2f8176..a8905f28 100644 --- a/source/nn2/less_equal.c +++ b/source/nn2/less_equal.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_less_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_less_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LESS_EQUAL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LESS_EQUAL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_less_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_less_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/log.c b/source/nn2/log.c index 1575ee69..e0738e65 100644 --- a/source/nn2/log.c +++ b/source/nn2/log.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_log_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_log_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOG, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LOG, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_log(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_log(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/log1p.c b/source/nn2/log1p.c index 1bdcad25..3eb904d8 100644 --- a/source/nn2/log1p.c +++ b/source/nn2/log1p.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_log1p_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_log1p_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOG1P, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_LOG1P, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_log1p(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_log1p(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/log_softmax.c b/source/nn2/log_softmax.c index 6d60f0be..525cb36b 100644 --- a/source/nn2/log_softmax.c +++ b/source/nn2/log_softmax.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_log_softmax_init(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csinn_log_softmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOG_SOFTMAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_LOG_SOFTMAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_log_softmax(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csinn_log_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - CSI_DEBUG_CALL(csi_softmax_debug_info(input, output, params, __func__)); - if(params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_softmax_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/logical_and.c b/source/nn2/logical_and.c index 507e6023..dae80a51 100644 --- a/source/nn2/logical_and.c +++ b/source/nn2/logical_and.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_logical_and_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_and_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOGICAL_AND, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_LOGICAL_AND, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_logical_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/logical_not.c b/source/nn2/logical_not.c index 907933a5..6c2ab616 100644 --- a/source/nn2/logical_not.c +++ b/source/nn2/logical_not.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_logical_not_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_logical_not_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOGICAL_NOT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_LOGICAL_NOT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_logical_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_logical_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/logical_or.c b/source/nn2/logical_or.c index 7c4cad30..5737a5c0 100644 --- a/source/nn2/logical_or.c +++ b/source/nn2/logical_or.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_logical_or_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_or_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOGICAL_OR, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_LOGICAL_OR, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_logical_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/logical_xor.c b/source/nn2/logical_xor.c index 5454e266..0fc3de75 100644 --- a/source/nn2/logical_xor.c +++ b/source/nn2/logical_xor.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_logical_xor_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_xor_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOGICAL_XOR, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_LOGICAL_XOR, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_logical_xor(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/lrn.c b/source/nn2/lrn.c index 9e8d24ba..1f9a9f71 100644 --- a/source/nn2/lrn.c +++ b/source/nn2/lrn.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_lrn_init(struct csi_tensor *input, - struct csi_tensor *output, - struct lrn_params *params) +int csinn_lrn_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LRN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_LRN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_lrn(struct csi_tensor *input, - struct csi_tensor *output, - struct lrn_params *params) +int csinn_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { - CSI_DEBUG_CALL(csi_lrn_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_lrn_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/matmul.c b/source/nn2/matmul.c index a862fad2..d16c8471 100644 --- a/source/nn2/matmul.c +++ b/source/nn2/matmul.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_matmul_init(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params) +int csinn_matmul_init(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MATMUL, mat0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_MATMUL, mat0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(mat0, mat1, output, params); } return CSINN_TRUE; } -int csi_matmul(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params) +int csinn_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, + struct csinn_matmul_params *params) { - CSI_DEBUG_CALL(csi_matmul_debug_info(mat0, mat1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(mat0, mat1, output, params); + SHL_DEBUG_CALL(shl_matmul_debug_info(mat0, mat1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(mat0, mat1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/max.c b/source/nn2/max.c index 54211abc..a4d11d91 100644 --- a/source/nn2/max.c +++ b/source/nn2/max.c @@ -16,35 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_max_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_max_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - if (params->n == 0 && params->m == 0) { - return CSINN_FALSE; - } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_MAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_max(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/maximum.c b/source/nn2/maximum.c index 4b03df6d..7d168b41 100644 --- a/source/nn2/maximum.c +++ b/source/nn2/maximum.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_maximum_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_maximum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAXIMUM, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_MAXIMUM, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_maximum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/maxpool.c b/source/nn2/maxpool.c index 1edb1371..beebc1fc 100644 --- a/source/nn2/maxpool.c +++ b/source/nn2/maxpool.c @@ -16,36 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_maxpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_MAXPOOL2D, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAXPOOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_MAXPOOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_maxpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/maxpool2d_locat.c b/source/nn2/maxpool2d_locat.c index 2cdaaf12..32402f65 100644 --- a/source/nn2/maxpool2d_locat.c +++ b/source/nn2/maxpool2d_locat.c @@ -16,31 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_maxpool2d_locat_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool2d_locat_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAXPOOL2D_LOCAT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_MAXPOOL2D_LOCAT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_maxpool2d_locat(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool2d_locat(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/maxpool3d.c b/source/nn2/maxpool3d.c index 0070f756..16cbe15b 100644 --- a/source/nn2/maxpool3d.c +++ b/source/nn2/maxpool3d.c @@ -16,32 +16,35 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_maxpool3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if(input->layout == CSINN_LAYOUT_NCDHW) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAXPOOL3D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + if (input->layout == CSINN_LAYOUT_NCDHW) { + shl_op_callback_map(&params->base, CSINN_OP_MAXPOOL3D, input->dtype); } else { return CSINN_UNSUPPORT_LAYOUT; } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); + } return CSINN_TRUE; } -int csi_maxpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if(params->base.bc !=NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/mean.c b/source/nn2/mean.c index 1022c686..0b990e3a 100644 --- a/source/nn2/mean.c +++ b/source/nn2/mean.c @@ -16,31 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_mean_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_mean_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MEAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_MEAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_mean(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/min.c b/source/nn2/min.c index 118028c7..18ccf59f 100644 --- a/source/nn2/min.c +++ b/source/nn2/min.c @@ -16,32 +16,35 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_min_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_min_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MIN_STRIDE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_MIN_STRIDE, input->dtype); + } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/minimum.c b/source/nn2/minimum.c index f648fe5e..135f9a9d 100644 --- a/source/nn2/minimum.c +++ b/source/nn2/minimum.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_minimum_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_minimum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MINIMUM, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_MINIMUM, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_minimum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/mod.c b/source/nn2/mod.c index 91dea742..ff7c1c4f 100644 --- a/source/nn2/mod.c +++ b/source/nn2/mod.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_mod_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_mod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MOD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_MOD, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_mod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/mul.c b/source/nn2/mul.c index 89c50cb2..464f7114 100644 --- a/source/nn2/mul.c +++ b/source/nn2/mul.c @@ -16,37 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_mul_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_mul_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_MUL, input0->dtype); - if (init_func != NULL) { - return init_func(input0, input1, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MUL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_MUL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_mul(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/ndarray_size.c b/source/nn2/ndarray_size.c index 2b6c6f48..7f2e09c3 100644 --- a/source/nn2/ndarray_size.c +++ b/source/nn2/ndarray_size.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_ndarray_size_init(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) +int csinn_ndarray_size_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NDARRAY_SIZE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_NDARRAY_SIZE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_ndarray_size(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) +int csinn_ndarray_size(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { - CSI_DEBUG_CALL(csi_ndarray_size_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_ndarray_size_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/negative.c b/source/nn2/negative.c index daacccc1..51a96b39 100644 --- a/source/nn2/negative.c +++ b/source/nn2/negative.c @@ -16,31 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_negative_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_negative_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NEGATIIVE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_NEGATIIVE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_negative(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_negative(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/node.c b/source/nn2/node.c index 5819ed5b..18783bee 100644 --- a/source/nn2/node.c +++ b/source/nn2/node.c @@ -16,14 +16,15 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_nn.h" -#include "csi_node.h" +#include "shl_memory.h" +#include "shl_node.h" +#include "shl_utils.h" -struct csi_node *csi_node_alloc(int node_type, char *name, int in_num, int out_num, void *data) +struct shl_node *shl_node_alloc(int node_type, char *name, int in_num, int out_num, void *data) { - struct csi_node *ret = csi_mem_alloc(sizeof(struct csi_node)); + struct shl_node *ret = shl_mem_alloc(sizeof(struct shl_node)); ret->type = node_type; ret->name = name; @@ -31,41 +32,41 @@ struct csi_node *csi_node_alloc(int node_type, char *name, int in_num, int out_n ret->in_num = in_num; ret->out_num = out_num; if (in_num != 0) { - ret->in = csi_mem_alloc(in_num * sizeof(struct csi_node *)); + ret->in = shl_mem_alloc(in_num * sizeof(struct shl_node *)); } if (out_num != 0) { - ret->out = csi_mem_alloc(out_num * sizeof(struct csi_node *)); + ret->out = shl_mem_alloc(out_num * sizeof(struct shl_node *)); } ret->subgraph_idx = -1; return ret; } -struct csi_node *csi_node_var_alloc(char *name, void *data) +struct shl_node *shl_node_var_alloc(char *name, void *data) { - return csi_node_alloc(CSINN_TENSOR, name, 1, 1, data); + return shl_node_alloc(CSINN_TENSOR, name, 1, 1, data); } -struct csi_node *csi_node_const_var_alloc(char *name, void *data) +struct shl_node *shl_node_const_var_alloc(char *name, void *data) { - return csi_node_alloc(CSINN_TENSOR, name, 0, 1, data); + return shl_node_alloc(CSINN_TENSOR, name, 0, 1, data); } -int csi_node_free(struct csi_node *node) +int shl_node_free(struct shl_node *node) { - csi_mem_free(node->in); - csi_mem_free(node->out); - csi_mem_free(node); + shl_mem_free(node->in); + shl_mem_free(node->out); + shl_mem_free(node); return CSINN_TRUE; } -int csi_node_add_in(struct csi_node *node, struct csi_node *in, int index) +int shl_node_add_in(struct shl_node *node, struct shl_node *in, int index) { node->in[index] = in; return CSINN_TRUE; } -int 
csi_node_add_out(struct csi_node *node, struct csi_node *out, int index) +int shl_node_add_out(struct shl_node *node, struct shl_node *out, int index) { node->out[index] = out; @@ -75,49 +76,37 @@ int csi_node_add_out(struct csi_node *node, struct csi_node *out, int index) return CSINN_TRUE; } -int csi_node_get_in_number(struct csi_node *node) -{ - return node->in_num; -} +int shl_node_get_in_number(struct shl_node *node) { return node->in_num; } -int csi_node_get_out_number(struct csi_node *node) -{ - return node->out_num; -} +int shl_node_get_out_number(struct shl_node *node) { return node->out_num; } -int csi_node_get_non_const_in_number(struct csi_node *node) +int shl_node_get_non_const_in_number(struct shl_node *node) { - int in_num = csi_node_get_in_number(node); + int in_num = shl_node_get_in_number(node); int const_in_num = 0; for (int i = 0; i < in_num; i++) { - struct csi_tensor *data = node->in[i]->data; + struct csinn_tensor *data = node->in[i]->data; if (data->is_const) { - const_in_num ++; + const_in_num++; } } return (in_num - const_in_num); } -struct csi_node *csi_node_get_in(struct csi_node *node, int index) -{ - return node->in[index]; -} +struct shl_node *shl_node_get_in(struct shl_node *node, int index) { return node->in[index]; } -struct csi_node *csi_node_get_out(struct csi_node *node, int index) -{ - return node->out[index]; -} +struct shl_node *shl_node_get_out(struct shl_node *node, int index) { return node->out[index]; } -int csi_node_restrict_map_insert(int value, struct csi_node *node) +int shl_node_restrict_map_insert(int value, struct shl_node *node) { node->restricted_map = - csi_mem_realloc(node->restricted_map, (node->restricted_map_num + 1) * sizeof(int)); + shl_mem_realloc(node->restricted_map, (node->restricted_map_num + 1) * sizeof(int)); node->restricted_map[node->restricted_map_num] = value; node->restricted_map_num++; return CSINN_TRUE; } -int csi_node_find(struct csi_node **list, int len, struct csi_node *node) +int 
shl_node_find(struct shl_node **list, int len, struct shl_node *node) { int res = -1; if (!list || len < 1) { diff --git a/source/nn2/non_max_suppression.c b/source/nn2/non_max_suppression.c index 19e2ffc4..67d9666d 100644 --- a/source/nn2/non_max_suppression.c +++ b/source/nn2/non_max_suppression.c @@ -16,31 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_non_max_suppression_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params) +int csinn_non_max_suppression_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NON_MAX_SUPPRESSION, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_NON_MAX_SUPPRESSION, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } - -int csi_non_max_suppression(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params) +int csinn_non_max_suppression(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params) { - CSI_DEBUG_CALL(csi_nms_debug_info(input0, input1, output, params, __func__)); - if(params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_nms_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/not.c
b/source/nn2/not.c index 57fca4b6..d5bba5e6 100644 --- a/source/nn2/not.c +++ b/source/nn2/not.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_not_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_not_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NOT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_NOT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/not_equal.c b/source/nn2/not_equal.c index 19898e64..48bce506 100644 --- a/source/nn2/not_equal.c +++ b/source/nn2/not_equal.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_not_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_not_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NOT_EQUAL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_NOT_EQUAL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_not_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_not_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/one_hot.c b/source/nn2/one_hot.c index 82b4ca23..01c3d2b7 100644 --- a/source/nn2/one_hot.c +++ b/source/nn2/one_hot.c @@ -16,24 +16,24 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_one_hot_init(struct csi_tensor *input, - struct csi_tensor *output, - struct one_hot_params *params) +int csinn_one_hot_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params) { return CSINN_FALSE; } -int csi_one_hot(struct csi_tensor *input, - struct csi_tensor *output, - struct one_hot_params *params) +int csinn_one_hot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params) { - CSI_DEBUG_CALL(csi_one_hot_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_one_hot_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/or.c b/source/nn2/or.c index 17f152d2..a5ef277f 100644 --- a/source/nn2/or.c +++ b/source/nn2/or.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_or_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_or_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_OR, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_OR, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_or(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/pad.c b/source/nn2/pad.c index 22608c3a..10385f58 100644 --- a/source/nn2/pad.c +++ b/source/nn2/pad.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_pad_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int csinn_pad_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PAD, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_PAD, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } - return CSINN_TRUE; + return CSINN_TRUE; } -int csi_pad(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int csinn_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { - CSI_DEBUG_CALL(csi_pad_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pad_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/power.c b/source/nn2/power.c index 5e3ea54c..e6552184 100644 --- a/source/nn2/power.c +++ b/source/nn2/power.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_power_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_power_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_POWER, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_POWER, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_power(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_power(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/prelu.c b/source/nn2/prelu.c index 23e03295..fa181256 100644 --- a/source/nn2/prelu.c +++ b/source/nn2/prelu.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_prelu_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct prelu_params *params) +int csinn_prelu_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PRELU, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_PRELU, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_prelu(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct prelu_params *params) +int csinn_prelu(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params) { - CSI_DEBUG_CALL(csi_prelu_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_prelu_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/prod.c b/source/nn2/prod.c index 122ab882..c98098de 100644 --- a/source/nn2/prod.c +++ b/source/nn2/prod.c @@ -16,32 +16,36 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_prod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_prod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { + enum csinn_rmode_enum run_mode = shl_get_run_mode(&params->base); + void *cbf = NULL; if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PROD, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_PROD, input->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_prod(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/proposal.c b/source/nn2/proposal.c index fe16b026..ee344980 100644 --- a/source/nn2/proposal.c +++ b/source/nn2/proposal.c @@ -16,32 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_proposal_init(struct csi_tensor *cls_prob, - struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, - struct csi_tensor *output, - struct proposal_params *params) +int csinn_proposal_init(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PROPOSAL, output->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_PROPOSAL, output->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(cls_prob, bbox_pred, im_info, output, params); } return CSINN_TRUE; } -int csi_proposal(struct csi_tensor *cls_prob, - struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, - struct csi_tensor *output, - struct proposal_params *params) +int csinn_proposal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { - CSI_DEBUG_CALL(csi_proposal_debug_info(cls_prob, bbox_pred, im_info, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(cls_prob, bbox_pred, im_info, output, params); + SHL_DEBUG_CALL(shl_proposal_debug_info(cls_prob, bbox_pred, im_info, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(cls_prob, bbox_pred, im_info, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/psroipooling.c b/source/nn2/psroipooling.c index abd81074..ffbbc036 100644 --- a/source/nn2/psroipooling.c +++ b/source/nn2/psroipooling.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_psroipooling_init(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params) +int csinn_psroipooling_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PSROIPOOLING, data->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_PSROIPOOLING, data->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(data, rois, output, params); } return CSINN_TRUE; } -int csi_psroipooling(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params) +int csinn_psroipooling(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params) { - CSI_DEBUG_CALL(csi_psroipooling_debug_info(data, rois, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(data, rois, output, params); + SHL_DEBUG_CALL(shl_psroipooling_debug_info(data, rois, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(data, rois, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_logsumexp.c b/source/nn2/reduce_logsumexp.c index 8208f911..5e22e489 100644 --- a/source/nn2/reduce_logsumexp.c +++ b/source/nn2/reduce_logsumexp.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_logsumexp_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_logsumexp_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_LOGSUMEXP, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_REDUCE_LOGSUMEXP, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_logsumexp(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_logsumexp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_max.c b/source/nn2/reduce_max.c index 4beda5a8..c91548f7 100644 --- a/source/nn2/reduce_max.c +++ b/source/nn2/reduce_max.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_max_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_max_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_MAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_REDUCE_MAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_max(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_mean.c b/source/nn2/reduce_mean.c index 71d30fb9..dd74c73c 100644 --- a/source/nn2/reduce_mean.c +++ b/source/nn2/reduce_mean.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_mean_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_mean_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_MEAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_REDUCE_MEAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_mean(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_min.c b/source/nn2/reduce_min.c index a7587be0..5daf03e1 100644 --- a/source/nn2/reduce_min.c +++ b/source/nn2/reduce_min.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_min_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_min_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_MIN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_REDUCE_MIN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_prod.c b/source/nn2/reduce_prod.c index 11f2e241..aa5c7ae3 100644 --- a/source/nn2/reduce_prod.c +++ b/source/nn2/reduce_prod.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_prod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_prod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_PROD, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_REDUCE_PROD, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_prod(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_sum.c b/source/nn2/reduce_sum.c index 0be101b9..3a949814 100644 --- a/source/nn2/reduce_sum.c +++ b/source/nn2/reduce_sum.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_sum_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_sum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_SUM, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_REDUCE_SUM, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_sum(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/relu.c b/source/nn2/relu.c index 4ecbe064..82faab08 100644 --- a/source/nn2/relu.c +++ b/source/nn2/relu.c @@ -16,38 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_RELU, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_RELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/relu1.c b/source/nn2/relu1.c index 60616b97..9afbb7dd 100644 --- a/source/nn2/relu1.c +++ b/source/nn2/relu1.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_relu1_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu1_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RELU1, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_RELU1, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_relu1(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/relu6.c b/source/nn2/relu6.c index ed04a7b2..9d7873d9 100644 --- a/source/nn2/relu6.c +++ b/source/nn2/relu6.c @@ -16,35 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_relu6_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu6_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_RELU6, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RELU6, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_RELU6, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/relun.c b/source/nn2/relun.c index df4191e4..3772fd5f 100644 --- a/source/nn2/relun.c +++ b/source/nn2/relun.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_relun_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relun_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RELUN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_RELUN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_relun(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relun(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reorg.c b/source/nn2/reorg.c index 34c10ed1..83207808 100644 --- a/source/nn2/reorg.c +++ b/source/nn2/reorg.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reorg_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reorg_params *params) +int csinn_reorg_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REORG, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_REORG, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reorg(struct csi_tensor *input, - struct csi_tensor *output, - struct reorg_params *params) +int csinn_reorg(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params) { - CSI_DEBUG_CALL(csi_reorg_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reorg_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reshape.c b/source/nn2/reshape.c index 35135f66..0f53ff6e 100644 --- a/source/nn2/reshape.c +++ b/source/nn2/reshape.c @@ -16,39 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reshape_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int csinn_reshape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_RESHAPE, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RESHAPE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_RESHAPE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reshape(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int csinn_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { - CSI_DEBUG_CALL(csi_reshape_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reshape_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/resize.c b/source/nn2/resize.c index d9ae5ab7..730eed3f 100644 --- a/source/nn2/resize.c +++ b/source/nn2/resize.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_resize_init(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params) +int csinn_resize_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RESIZE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_RESIZE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_resize(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params) +int csinn_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { - CSI_DEBUG_CALL(csi_resize_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_resize_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reverse.c b/source/nn2/reverse.c index 7663f2cf..4627a43b 100644 --- a/source/nn2/reverse.c +++ b/source/nn2/reverse.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reverse_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reverse_params *params) +int csinn_reverse_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REVERSE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_REVERSE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reverse(struct csi_tensor *input, - struct csi_tensor *output, - struct reverse_params *params) +int csinn_reverse(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { - CSI_DEBUG_CALL(csi_reverse_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reverse_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/roialign.c b/source/nn2/roialign.c index b5d0694d..fa32d691 100644 --- a/source/nn2/roialign.c +++ b/source/nn2/roialign.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_roi_align_init(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_align_params *params) +int csinn_roi_align_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ROIALIGN, data->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ROIALIGN, data->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(data, rois, output, params); } return CSINN_TRUE; } -int csi_roi_align(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_align_params *params) +int csinn_roi_align(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params) { - CSI_DEBUG_CALL(csi_roi_align_debug_info(data, rois, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(data, rois, output, params); + SHL_DEBUG_CALL(shl_roi_align_debug_info(data, rois, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(data, rois, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/roipool.c b/source/nn2/roipool.c index 6e36b70a..574e3e98 100644 --- a/source/nn2/roipool.c +++ b/source/nn2/roipool.c @@ -16,30 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_roipool_init(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params) +int csinn_roipool_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ROIPOOL, data->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ROIPOOL, data->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(data, rois, output, params); } return CSINN_TRUE; } -int csi_roipool(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params) +int csinn_roipool(struct csinn_tensor *data, struct csinn_tensor *rois, struct csinn_tensor *output, + struct csinn_roi_pool_params *params) { - CSI_DEBUG_CALL(csi_roi_pool_debug_info(data, rois, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(data, rois, output, params); + SHL_DEBUG_CALL(shl_roi_pool_debug_info(data, rois, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(data, rois, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/round.c b/source/nn2/round.c index c01c7f84..6abc373b 100644 --- a/source/nn2/round.c +++ b/source/nn2/round.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_round_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_round_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ROUND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_ROUND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_round(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_round(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/rsqrt.c b/source/nn2/rsqrt.c index 3aa35526..0b6ba283 100644 --- a/source/nn2/rsqrt.c +++ b/source/nn2/rsqrt.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_rsqrt_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_rsqrt_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RSQRT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_RSQRT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_rsqrt(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_rsqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/scatter.c b/source/nn2/scatter.c index 2eab72f3..ef975758 100644 --- a/source/nn2/scatter.c +++ b/source/nn2/scatter.c @@ -16,33 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" - -int csi_scatter_nd_init(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *updates, - struct csi_tensor *output, - struct scatter_nd_params *params) +int csinn_scatter_nd_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SCATTER_ND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SCATTER_ND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, indices, updates, output, params); } return CSINN_TRUE; } -int csi_scatter_nd(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *updates, - struct csi_tensor *output, - struct scatter_nd_params *params) +int csinn_scatter_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { - CSI_DEBUG_CALL(csi_scatter_nd_debug_info(input, indices, updates, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, indices, updates, output, params); + SHL_DEBUG_CALL(shl_scatter_nd_debug_info(input, indices, updates, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, indices, updates, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_max.c b/source/nn2/segment_max.c index 46091951..69cde56b 100644 --- a/source/nn2/segment_max.c +++ b/source/nn2/segment_max.c @@ -16,37 +16,35 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_max_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_max_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_MAX, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_UNSORTED_SEGMENT_MAX, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_MAX, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_SEGMENT_MAX, input0->dtype); + } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_max(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_max(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_mean.c b/source/nn2/segment_mean.c index 2f2262d5..e9863f3e 100644 --- a/source/nn2/segment_mean.c +++
b/source/nn2/segment_mean.c @@ -16,37 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_mean_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_mean_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_MEAN, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_UNSORTED_SEGMENT_MEAN, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_MEAN, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_SEGMENT_MEAN, input0->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_mean(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_mean(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_min.c b/source/nn2/segment_min.c
index 9acc72cc..a8a0e958 100644 --- a/source/nn2/segment_min.c +++ b/source/nn2/segment_min.c @@ -16,37 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_min_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_min_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { + enum csinn_rmode_enum run_mode = shl_get_run_mode(&params->base); + void *cbf = NULL; if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_MIN, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_UNSORTED_SEGMENT_MIN, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_MIN, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_SEGMENT_MIN, input0->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_min(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_min(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { +
func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_prod.c b/source/nn2/segment_prod.c index 6453e7e4..80cfb00b 100644 --- a/source/nn2/segment_prod.c +++ b/source/nn2/segment_prod.c @@ -16,37 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_prod_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_prod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { + enum csinn_rmode_enum run_mode = shl_get_run_mode(&params->base); + void *cbf = NULL; if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_PROD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_UNSORTED_SEGMENT_PROD, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_PROD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_SEGMENT_PROD, input0->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_prod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_prod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); +
SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_sum.c b/source/nn2/segment_sum.c index 84a08bf9..1df54ce9 100644 --- a/source/nn2/segment_sum.c +++ b/source/nn2/segment_sum.c @@ -16,37 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_sum_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_sum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { + enum csinn_rmode_enum run_mode = shl_get_run_mode(&params->base); + void *cbf = NULL; if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_SUM, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_UNSORTED_SEGMENT_SUM, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_SUM, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_SEGMENT_SUM, input0->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_sum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_sum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { -
CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/select.c b/source/nn2/select.c index 66f72318..9ce190f1 100644 --- a/source/nn2/select.c +++ b/source/nn2/select.c @@ -16,32 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_select_init(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params) +int csinn_select_init(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SELECT, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SELECT, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(condition, input0, input1, output, params); } return CSINN_TRUE; } -int csi_select(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params) +int csinn_select(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { - CSI_DEBUG_CALL(csi_select_debug_info(condition, input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(condition, input0, input1, output,
params); + SHL_DEBUG_CALL(shl_select_debug_info(condition, input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(condition, input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sequence_mask.c b/source/nn2/sequence_mask.c index f4d4e691..c1ef1f76 100644 --- a/source/nn2/sequence_mask.c +++ b/source/nn2/sequence_mask.c @@ -16,26 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sequence_mask_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct sequence_mask_params *params) +int csinn_sequence_mask_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params) { return CSINN_FALSE; } -int csi_sequence_mask(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct sequence_mask_params *params) +int csinn_sequence_mask(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params) { - CSI_DEBUG_CALL(csi_sequence_mask_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_sequence_mask_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/setup.c b/source/nn2/setup.c index 1c16d2d4..11a5013d 100644 --- a/source/nn2/setup.c +++ b/source/nn2/setup.c @@ -16,189 +16,165 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_utils.h" - -struct csi_session *csi_alloc_session() { return csi_mem_alloc(sizeof(struct csi_session)); } - -void csi_free_session(struct csi_session *sess) { csi_mem_free(sess); } - -void *csi_bc_map_ref(int op, int dtype); -void *csi_bc_map_gref(int op, int dtype); -void *csi_bc_map_c906(int op, int dtype); -void *csi_bc_map_i805(int op, int dtype); -void *csi_bc_map_e804(int op, int dtype); -void *csi_bc_map_ref_i805(int op, int dtype); -void *csi_bc_map_rvv(int op, int dtype); -void *csi_bc_func_table[CSINN_API_SIZE] = { -#ifdef CSI_BUILD_REF - csi_bc_map_ref, -#else - NULL, /* c code */ +#include "shl_utils.h" + +void shl_target_init_ref(); +void shl_target_init_gref(); +void shl_target_init_ovx(); +void shl_target_init_c906(); +void shl_target_init_pnna(); +void shl_target_init_i805(); +void shl_target_init_e804(); +void shl_target_init_ref_i805(); +void shl_target_init_c908(); +void shl_target_init_asp(); +void shl_target_init_rvv(); + +static int __shl_has_init; + +void shl_init() +{ +#ifdef SHL_BUILD_REF + shl_target_init_ref(); +#endif +#ifdef SHL_BUILD_GREF + shl_target_init_gref(); +#endif +#ifdef SHL_BUILD_C906 + shl_target_init_c906(); #endif -#ifdef CSI_BUILD_GREF - csi_bc_map_gref, -#else - NULL, /* gref */ +#ifdef SHL_BUILD_OPENVX + shl_target_init_ovx(); #endif - NULL, /* c860 */ -#ifdef CSI_BUILD_C906 - csi_bc_map_c906, -#else - NULL, /* c906 */ +#ifdef SHL_BUILD_PNNA + shl_target_init_pnna(); #endif - NULL, - NULL, - NULL, - NULL, - NULL, -#ifdef CSI_BUILD_I805 - csi_bc_map_i805, -#else - NULL, /* xt800v : i805/ck805 */ +#ifdef SHL_BUILD_I805 + shl_target_init_i805(); #endif -#ifdef CSI_BUILD_E804 - csi_bc_map_e804, -#else - NULL, /* xt800p : e804d/ck804 */ +#ifdef SHL_BUILD_E804 + shl_target_init_e804(); #endif -#ifdef CSI_BUILD_REF_I805 - csi_bc_map_ref_i805, -#else - NULL, +#ifdef SHL_BUILD_REF_I805 + shl_target_init_ref_i805(); #endif - NULL, - 
NULL, - NULL, -#ifdef CSI_BUILD_RVV - csi_bc_map_rvv, -#else - NULL, /* rvv */ +#ifdef SHL_BUILD_C908 + shl_target_init_c908(); #endif -}; +#ifdef SHL_BUILD_ASP + shl_target_init_asp(); +#endif +#ifdef SHL_BUILD_RVV + shl_target_init_rvv(); +#endif +} -void *csi_bc_map(int api, int rmode, int op, int dtype) +struct csinn_session *csinn_alloc_session() { - void *(*func)(); - if (rmode == CSINN_RM_CPU_GRAPH) { - func = csi_bc_func_table[CSINN_GREF]; + if (__shl_has_init == 0) { + shl_init(); + __shl_has_init = 1; + } + return shl_mem_alloc(sizeof(struct csinn_session)); +} + +void csinn_free_session(struct csinn_session *sess) { shl_mem_free(sess); } + +static void *shl_cb_func_table[CSINN_API_SIZE]; +void shl_register_op_callback(int api, void *cb) { shl_cb_func_table[api] = cb; } + +int shl_op_callback_map(struct csinn_params_base *base, int op, int dtype) +{ + void *(*op_map)(); + if (base->sess && base->sess->base_run_mode == CSINN_RM_CPU_GRAPH && + base->sess->base_api == CSINN_REF) { + /* Heterogeneous use GREF */ + op_map = shl_cb_func_table[CSINN_GREF]; } else { - func = csi_bc_func_table[api]; + op_map = shl_cb_func_table[base->api]; } - return func(op, dtype); + + if (op_map == NULL) { + return CSINN_FALSE; + } + + struct csinn_callback *cb = op_map(op, dtype); + if (cb == NULL) { + shl_debug_info("%s: Cannot find OP map\n", __func__); + } + memcpy(base->cb, cb, sizeof(struct csinn_callback)); + + return CSINN_TRUE; } -void *csi_init_map_c906(int op, int dtype); -void *csi_init_map_ref(int op, int dtype); -void *csi_init_map_i805(int op, int dtype); -void *csi_init_map_e804(int op, int dtype); -void *csi_init_map_ref_i805(int op, int dtype); -void *csi_init_map_c908(int op, int dtype); -void *csi_init_map_rvv(int op, int dtype); -void *csi_init_func_table[CSINN_API_SIZE] = { -#ifdef CSI_BUILD_REF - csi_init_map_ref, /* c code */ -#else - NULL, /* c code */ -#endif - NULL, /* gref */ - NULL, /* c860 */ -#ifdef CSI_BUILD_C906 - csi_init_map_c906, -#else - 
NULL, /* c906 */ -#endif - NULL, - NULL, - NULL, - NULL, - NULL, -#ifdef CSI_BUILD_I805 - csi_init_map_i805, -#else - NULL, -#endif -#ifdef CSI_BUILD_E804 - csi_init_map_e804, -#else - NULL, -#endif -#ifdef CSI_BUILD_REF_I805 - csi_init_map_ref_i805, -#else - NULL, -#endif - NULL, - NULL, - NULL, -#ifdef CSI_BUILD_RVV - csi_init_map_rvv, -#else - NULL, /* rvv */ -#endif -}; +static void *shl_runtime_callback_table[CSINN_API_SIZE]; -void *csi_init_map(int api, int op, int dtype) +void shl_register_runtime_callback(int api, void *cb) { shl_runtime_callback_table[api] = cb; } + +void *shl_get_runtime_callback(struct csinn_session *sess, int op) { - void *(*func)() = csi_init_func_table[api]; - if (func != NULL) { - return func(op, dtype); + void *(*runtime_map)(); + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH && sess->base_api == CSINN_REF) { + /* Heterogeneous use GREF */ + runtime_map = shl_runtime_callback_table[CSINN_GREF]; } else { + runtime_map = shl_runtime_callback_table[sess->base_api]; + } + if (runtime_map == NULL) { return NULL; + } else { + return runtime_map(op); } } -void csi_session_init(struct csi_session *sess) +void csinn_session_init(struct csinn_session *sess) { - csi_debug_set_level(sess->debug_level); + shl_debug_set_level(sess->debug_level); - void *(*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SESSION_INIT, sess->base_dtype); + void *(*func)() = shl_get_runtime_callback(sess, CSINN_SESSION_INIT); if (func != NULL) { func(sess); } } -void csi_session_deinit(struct csi_session *sess) +void csinn_session_deinit(struct csinn_session *sess) { void *(*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SESSION_DEINIT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SESSION_DEINIT); if (func != NULL) { func(sess); } } -void csi_set_output_number(int number, struct csi_session *sess) +void csinn_set_output_number(int number, struct csinn_session *sess) { sess->output_num = number; - 
sess->output = csi_mem_alloc(sess->output_num * sizeof(struct csi_tensor *)); + sess->output = shl_mem_alloc(sess->output_num * sizeof(struct csinn_tensor *)); void (*func)(); - func = - csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SET_OUTPUT_NUMBER, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SET_OUTPUT_NUMBER); if (func != NULL) { func(number, sess); } } -void csi_set_input_number(int number, struct csi_session *sess) +void csinn_set_input_number(int number, struct csinn_session *sess) { sess->input_num = number; - sess->input = csi_mem_alloc(sess->input_num * sizeof(struct csi_tensor *)); + sess->input = shl_mem_alloc(sess->input_num * sizeof(struct csinn_tensor *)); void (*func)(); - func = - csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SET_INPUT_NUMBER, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SET_INPUT_NUMBER); if (func != NULL) { func(number, sess); } } -int csi_get_output_number(struct csi_session *sess) +int csinn_get_output_number(struct csinn_session *sess) { int (*func)(); - func = - csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_GET_OUTPUT_NUMBER, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_GET_OUTPUT_NUMBER); if (func != NULL) { return func(sess); } else { @@ -206,11 +182,10 @@ int csi_get_output_number(struct csi_session *sess) } } -int csi_get_input_number(struct csi_session *sess) +int csinn_get_input_number(struct csinn_session *sess) { int (*func)(); - func = - csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_GET_INPUT_NUMBER, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_GET_INPUT_NUMBER); if (func != NULL) { return func(sess); } else { @@ -218,62 +193,62 @@ int csi_get_input_number(struct csi_session *sess) } } -int csi_set_output(int index, struct csi_tensor *output, struct csi_session *sess) +int csinn_set_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { sess->output[index] = output; int (*func)(); - 
func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SET_OUTPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SET_OUTPUT); if (func != NULL) { return func(index, output, sess); } return CSINN_TRUE; } -int csi_set_input(int index, struct csi_tensor *input, struct csi_session *sess) +int csinn_set_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { sess->input[index] = input; int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SET_INPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SET_INPUT); if (func != NULL) { return func(index, input, sess); } return CSINN_TRUE; } -int csi_get_output(int index, struct csi_tensor *output, struct csi_session *sess) +int csinn_get_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { - csi_tensor_copy(output, sess->output[index]); + csinn_tensor_copy(output, sess->output[index]); int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_GET_OUTPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_GET_OUTPUT); if (func != NULL) { return func(index, output, sess); } return CSINN_TRUE; } -int csi_get_input(int index, struct csi_tensor *input, struct csi_session *sess) +int csinn_get_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { - csi_tensor_copy(input, sess->input[index]); + csinn_tensor_copy(input, sess->input[index]); int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_GET_INPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_GET_INPUT); if (func != NULL) { return func(index, input, sess); } return CSINN_TRUE; } -int csi_update_input(int index, struct csi_tensor *input, struct csi_session *sess) +int csinn_update_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { sess->input[index]->data = input->data; int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, 
CSINN_UPDATE_INPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_UPDATE_INPUT); if (func != NULL) { int ret = CSINN_FALSE; if (sess->profiler_level == CSI_PROFILER_LEVEL_TIMER) { - uint64_t start = csi_get_timespec(); + uint64_t start = shl_get_timespec(); ret = func(index, input, sess); - uint64_t end = csi_get_timespec(); - csi_print_time_interval(start, end, __func__); + uint64_t end = shl_get_timespec(); + shl_print_time_interval(start, end, __func__); } else { ret = func(index, input, sess); } @@ -282,28 +257,28 @@ int csi_update_input(int index, struct csi_tensor *input, struct csi_session *se return CSINN_TRUE; } -int csi_update_output(int index, struct csi_tensor *output, struct csi_session *sess) +int csinn_update_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { sess->output[index]->data = output->data; int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_UPDATE_OUTPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_UPDATE_OUTPUT); if (func != NULL) { return func(index, output, sess); } return CSINN_TRUE; } -int csi_session_setup(struct csi_session *sess) +int csinn_session_setup(struct csinn_session *sess) { int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SESSION_SETUP, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SESSION_SETUP); if (func != NULL) { int ret = CSINN_FALSE; if (sess->profiler_level == CSI_PROFILER_LEVEL_TIMER) { - uint64_t start = csi_get_timespec(); + uint64_t start = shl_get_timespec(); ret = func(sess); - uint64_t end = csi_get_timespec(); - csi_print_time_interval(start, end, __func__); + uint64_t end = shl_get_timespec(); + shl_print_time_interval(start, end, __func__); } else { ret = func(sess); } @@ -312,17 +287,17 @@ int csi_session_setup(struct csi_session *sess) return CSINN_FALSE; } -int csi_session_run(struct csi_session *sess) +int csinn_session_run(struct csinn_session *sess) { int 
(*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SESSION_RUN, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SESSION_RUN); if (func != NULL) { int ret = CSINN_FALSE; if (sess->profiler_level == CSI_PROFILER_LEVEL_TIMER) { - uint64_t start = csi_get_timespec(); + uint64_t start = shl_get_timespec(); ret = func(sess); - uint64_t end = csi_get_timespec(); - csi_print_time_interval(start, end, __func__); + uint64_t end = shl_get_timespec(); + shl_print_time_interval(start, end, __func__); } else { ret = func(sess); } @@ -331,53 +306,29 @@ int csi_session_run(struct csi_session *sess) return CSINN_FALSE; } -int csi_set_tensor_entry(struct csi_tensor *t, struct csi_session *sess) +int csinn_set_tensor_entry(struct csinn_tensor *t, struct csinn_session *sess) { int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_TENSOR_ENTRY, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_TENSOR_ENTRY); if (func != NULL) { return func(t, sess); } return CSINN_FALSE; } -struct csi_bc_op_list *csi_bc_list_end(struct csi_bc_op_list *list) -{ - struct csi_bc_op_list *l = list; - while (l->next) { - l = l->next; - } - return l; -} - -void *csi_bc_list_match(struct csi_bc_op_list *list, enum csinn_dtype_enum dtype, - enum csinn_op_enum op_name) -{ - void *ret = NULL; - struct csi_bc_op_list *l = list; - while (l) { - if (l->dtype == dtype && l->op_name == op_name) { - ret = l->bc; - break; - } - l = l->next; - } - return ret; -} - -int csi_load_binary_model(char *path, struct csi_session *sess) +int csinn_load_binary_model(struct csinn_session *sess) { int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_LOAD_BG, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_LOAD_BG); if (func != NULL) { int ret = CSINN_FALSE; if (sess->profiler_level == CSI_PROFILER_LEVEL_TIMER) { - uint64_t start = csi_get_timespec(); - ret = func(path, sess); - uint64_t end = csi_get_timespec(); 
- csi_print_time_interval(start, end, __func__); + uint64_t start = shl_get_timespec(); + ret = func(sess); + uint64_t end = shl_get_timespec(); + shl_print_time_interval(start, end, __func__); } else { - ret = func(path, sess); + ret = func(sess); } return ret; } diff --git a/source/nn2/shape.c b/source/nn2/shape.c index b5f5ceaf..1de0d001 100644 --- a/source/nn2/shape.c +++ b/source/nn2/shape.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_shape_init(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params) +int csinn_shape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SHAPE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SHAPE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_shape(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params) +int csinn_shape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { - CSI_DEBUG_CALL(csi_shape_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_shape_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/shuffle_channel.c b/source/nn2/shuffle_channel.c index 1a624af1..39973f9e 100644 --- a/source/nn2/shuffle_channel.c +++ b/source/nn2/shuffle_channel.c @@ -1,4 +1,4 @@ - /* +/* * Copyright (C) 2016-2022 T-Head
Semiconductor Co., Ltd. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 @@ -16,31 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_shuffle_channel_init(struct csi_tensor *input, - struct csi_tensor *output, - struct shuffle_channel_params *params) +int csinn_shuffle_channel_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SHUFFLE_CHANNEL, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SHUFFLE_CHANNEL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_shuffle_channel(struct csi_tensor *input, - struct csi_tensor *output, - struct shuffle_channel_params *params) +int csinn_shuffle_channel(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { - CSI_DEBUG_CALL(csi_shuffle_channel_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_shuffle_channel_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/sigmoid.c b/source/nn2/sigmoid.c index 0f482b89..9fd911c3 100644 --- a/source/nn2/sigmoid.c +++ b/source/nn2/sigmoid.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sigmoid_init(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csinn_sigmoid_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SIGMOID, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SIGMOID, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sigmoid(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csinn_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - CSI_DEBUG_CALL(csi_sigmoid_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_sigmoid_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sign.c b/source/nn2/sign.c index c8749bf3..e3e2f3d8 100644 --- a/source/nn2/sign.c +++ b/source/nn2/sign.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sign_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sign_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SIGN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SIGN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sign(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sin.c b/source/nn2/sin.c index 29a19ae0..450d7a02 100644 --- a/source/nn2/sin.c +++ b/source/nn2/sin.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sin_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SIN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SIN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sinh.c b/source/nn2/sinh.c index 2a4dc620..6bea7206 100644 --- a/source/nn2/sinh.c +++ b/source/nn2/sinh.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sinh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sinh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SINH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SINH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/slice.c b/source/nn2/slice.c index fb75a496..0b96e4ac 100644 --- a/source/nn2/slice.c +++ b/source/nn2/slice.c @@ -16,32 +16,34 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_slice_init(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params) +int csinn_slice_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { if (params->begin != NULL) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SLICE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(&params->base, CSINN_OP_SLICE, input->dtype); } else { return CSINN_FALSE; } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); + } return CSINN_TRUE; } -int csi_slice(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params) +int csinn_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { - CSI_DEBUG_CALL(csi_slice_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_slice_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/softmax.c b/source/nn2/softmax.c index 684b589f..05b76671 100644 --- a/source/nn2/softmax.c +++ b/source/nn2/softmax.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_softmax_init(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csinn_softmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SOFTMAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SOFTMAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_softmax(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csinn_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - CSI_DEBUG_CALL(csi_softmax_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_softmax_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/softplus.c b/source/nn2/softplus.c index 0d979527..3f833660 100644 --- a/source/nn2/softplus.c +++ b/source/nn2/softplus.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_softplus_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_softplus_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SOFTPLUS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SOFTPLUS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_softplus(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_softplus(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/softrelu.c b/source/nn2/softrelu.c index b34a8b0b..b356dd1b 100644 --- a/source/nn2/softrelu.c +++ b/source/nn2/softrelu.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_softrelu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_softrelu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SOFTRELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SOFTRELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_softrelu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_softrelu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/softsign.c b/source/nn2/softsign.c index 537098c0..b0347449 100644 --- a/source/nn2/softsign.c +++ b/source/nn2/softsign.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_softsign_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_softsign_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SOFTSIGN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SOFTSIGN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_softsign(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_softsign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/space_to_batch.c b/source/nn2/space_to_batch.c index e9f791f7..7e75b051 100644 --- a/source/nn2/space_to_batch.c +++ b/source/nn2/space_to_batch.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_space_to_batch_init(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params) +int csinn_space_to_batch_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SPACE_TO_BATCH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(&params->base, CSINN_OP_SPACE_TO_BATCH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(&params->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_space_to_batch(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params) +int csinn_space_to_batch(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params) { - CSI_DEBUG_CALL(csi_space_to_batch_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_space_to_batch_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(&params->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/space_to_batch_nd.c b/source/nn2/space_to_batch_nd.c index ea23b9d8..f30d0114 100644 --- a/source/nn2/space_to_batch_nd.c +++ b/source/nn2/space_to_batch_nd.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_space_to_batch_nd_init(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_nd_params *params) +int csinn_space_to_batch_nd_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SPACE_TO_BATCH_ND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SPACE_TO_BATCH_ND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_space_to_batch_nd(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_nd_params *params) +int csinn_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params) { - CSI_DEBUG_CALL(csi_space_to_batch_nd_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_space_to_batch_nd_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/space_to_depth.c b/source/nn2/space_to_depth.c index a8725cc0..c6849e94 100644 --- a/source/nn2/space_to_depth.c +++ b/source/nn2/space_to_depth.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_space_to_depth_init(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params) +int csinn_space_to_depth_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SPACE_TO_DEPTH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SPACE_TO_DEPTH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_space_to_depth(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params) +int csinn_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { - CSI_DEBUG_CALL(csi_space_to_depth_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_space_to_depth_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/split.c b/source/nn2/split.c index 20f8eb00..e9dad6c3 100644 --- a/source/nn2/split.c +++ b/source/nn2/split.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_split_init(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int csinn_split_init(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SPLIT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SPLIT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_split(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int csinn_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { - CSI_DEBUG_CALL(csi_split_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_split_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sqrt.c b/source/nn2/sqrt.c index c7916298..b779ddd4 100644 --- a/source/nn2/sqrt.c +++ b/source/nn2/sqrt.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sqrt_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sqrt_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SQRT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SQRT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sqrt(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/square.c b/source/nn2/square.c index eecfb4e2..65665cc6 100644 --- a/source/nn2/square.c +++ b/source/nn2/square.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_square_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_square_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SQUARE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SQUARE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_square(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_square(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/squeeze.c b/source/nn2/squeeze.c index 1271f091..bf1b5cc7 100644 --- a/source/nn2/squeeze.c +++ b/source/nn2/squeeze.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_squeeze_init(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params *params) +int csinn_squeeze_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SQUEEZE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SQUEEZE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_squeeze(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params *params) +int csinn_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params) { - CSI_DEBUG_CALL(csi_squeeze_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_squeeze_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/stack.c b/source/nn2/stack.c index fd4e588e..3d7cb488 100644 --- a/source/nn2/stack.c +++ b/source/nn2/stack.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_stack_init(struct csi_tensor **input, - struct csi_tensor *output, - struct stack_params *params) +int csinn_stack_init(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_STACK, input[0]->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_STACK, input[0]->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_stack(struct csi_tensor **input, - struct csi_tensor *output, - struct stack_params *params) +int csinn_stack(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { - CSI_DEBUG_CALL(csi_stack_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_stack_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/strided_slice.c b/source/nn2/strided_slice.c index 53a20a22..38dc4286 100644 --- a/source/nn2/strided_slice.c +++ b/source/nn2/strided_slice.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_strided_slice_init(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) +int csinn_strided_slice_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_STRIDED_SLICE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_STRIDED_SLICE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_strided_slice(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) +int csinn_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { - CSI_DEBUG_CALL(csi_strided_slice_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_strided_slice_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sub.c b/source/nn2/sub.c index e7a81e55..f13fa057 100644 --- a/source/nn2/sub.c +++ b/source/nn2/sub.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sub_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_sub_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SUB, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SUB, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_sub(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sum.c b/source/nn2/sum.c index c7d27bc8..eab211f8 100644 --- a/source/nn2/sum.c +++ b/source/nn2/sum.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sum_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_sum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SUM, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_SUM, input->dtype); + } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sum(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/tan.c b/source/nn2/tan.c index 2a5fafb3..8c5d4ddb 100644 --- a/source/nn2/tan.c +++ b/source/nn2/tan.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_tan_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_tan_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_tan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_tan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/tanh.c b/source/nn2/tanh.c index d2267479..57871cf5 100644 --- a/source/nn2/tanh.c +++ b/source/nn2/tanh.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_tanh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_tanh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TANH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TANH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_tanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/threshold_relu.c b/source/nn2/threshold_relu.c index 534162ea..ca1f8e21 100644 --- a/source/nn2/threshold_relu.c +++ b/source/nn2/threshold_relu.c @@ -16,31 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_threshold_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_threshold_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_THRESHOLD_RELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_THRESHOLD_RELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_threshold_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_threshold_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/tile.c b/source/nn2/tile.c index fcf52fdc..a9e87de2 100644 --- a/source/nn2/tile.c +++ b/source/nn2/tile.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_tile_init(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params) +int csinn_tile_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TILE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TILE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_tile(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params) +int csinn_tile(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { - CSI_DEBUG_CALL(csi_tile_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_tile_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/topk.c b/source/nn2/topk.c index f932f8f3..48916472 100644 --- a/source/nn2/topk.c +++ b/source/nn2/topk.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_topk_init(struct csi_tensor *input, - struct csi_tensor *output1, - struct csi_tensor *output2, - struct topk_params *params) +int csinn_topk_init(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TOPK, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TOPK, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output1, output2, params); } return CSINN_TRUE; } -int csi_topk(struct csi_tensor *input, - struct csi_tensor *output1, - struct csi_tensor *output2, - struct topk_params *params) +int csinn_topk(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params) { - CSI_DEBUG_CALL(csi_topk_debug_info(input, output1, output2, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output1, output2, params); + SHL_DEBUG_CALL(shl_topk_debug_info(input, output1, output2, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output1, output2, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/transpose.c b/source/nn2/transpose.c index 0d1cddb7..6c859b2f 100644 --- a/source/nn2/transpose.c +++ b/source/nn2/transpose.c @@ -16,39 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_transpose_init(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params) +int csinn_transpose_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_TRANSPOSE, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TRANSPOSE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TRANSPOSE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_transpose(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params) +int csinn_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { - CSI_DEBUG_CALL(csi_transpose_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_transpose_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/trunc.c b/source/nn2/trunc.c index 0b3e8a6c..bae0b8a2 100644 --- a/source/nn2/trunc.c +++ b/source/nn2/trunc.c @@ -16,31 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_trunc_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_trunc_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TRUNC, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TRUNC, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_trunc(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_trunc(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/unpooling.c b/source/nn2/unpooling.c index 49effac4..1b068706 100644 --- a/source/nn2/unpooling.c +++ b/source/nn2/unpooling.c @@ -19,30 +19,29 @@ /* CSI-NN2 version 1.9.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_unpooling_init(struct csi_tensor *input, - struct csi_tensor *mask, - struct csi_tensor *output, - struct unpooling_params *params) +int csinn_unpooling_init(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNPOOLING, input->dtype); - if (params->base.bc == NULL) { - 
return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_UNPOOLING, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_unpooling(struct csi_tensor *input, - struct csi_tensor *mask, - struct csi_tensor *output, - struct unpooling_params *params) +int csinn_unpooling(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { - CSI_DEBUG_CALL(csi_unpooling_debug_info(input, mask, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, mask, output, params); + SHL_DEBUG_CALL(shl_unpooling_debug_info(input, mask, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, mask, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/unstack.c b/source/nn2/unstack.c index 67b8ce79..f7031c74 100644 --- a/source/nn2/unstack.c +++ b/source/nn2/unstack.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_unstack_init(struct csi_tensor *input, - struct csi_tensor **output, - struct unstack_params *params) +int csinn_unstack_init(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSTACK, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_UNSTACK, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_unstack(struct csi_tensor *input, - struct csi_tensor **output, - struct unstack_params *params) +int csinn_unstack(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params) { - CSI_DEBUG_CALL(csi_unstack_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_unstack_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/utils.c b/source/nn2/utils.c index 479c1d09..6ded07f3 100644 --- a/source/nn2/utils.c +++ b/source/nn2/utils.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include #include "csi_nn.h" -#include "csi_ref.h" +#include "shl_utils.h" /* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/quantization_util.cc */ -static int64_t integer_from_exp(double input, int *shift) +static int64_t integer_from_exp(double input, int32_t *shift) { uint64_t kSignMask = 0x8000000000000000LL; uint64_t kExponentMask = 0x7ff0000000000000LL; @@ -100,7 +100,8 @@ static int64_t integer_from_exp(double input, int *shift) return fraction; } -void csi_quantize_multiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift) +void shl_quantize_multiplier(double double_multiplier, int32_t *quantized_multiplier, + int32_t *shift) { if (double_multiplier == 0.) { *quantized_multiplier = 0; @@ -135,7 +136,7 @@ void csi_quantize_multiplier(double double_multiplier, int32_t *quantized_multip *quantized_multiplier = (int32_t)(q_fixed); } -void csi_statistical_mean_std(float *data, int sz) +void shl_statistical_mean_std(float *data, int sz) { int i = 0; float max_value = data[0]; @@ -163,7 +164,7 @@ void csi_statistical_mean_std(float *data, int sz) printf("The std_value of output: %lf\n", std); } -void csi_get_top5(float *buf, uint32_t size, float *prob, uint32_t *class) +void shl_get_top5(float *buf, uint32_t size, float *prob, uint32_t *class) { uint32_t i, j, k; @@ -190,7 +191,7 @@ void csi_get_top5(float *buf, uint32_t size, float *prob, uint32_t *class) } } -void csi_show_top5(struct csi_tensor *output, struct csi_session *sess) +void shl_show_top5(struct csinn_tensor *output, struct csinn_session *sess) { uint32_t i, size; uint32_t class[5]; @@ -205,11 +206,11 @@ void csi_show_top5(struct csi_tensor *output, struct csi_session *sess) size *= output->dim[i]; } - // #ifdef CSI_DEBUG - csi_statistical_mean_std(output->data, size); + // #ifdef SHL_DEBUG + shl_statistical_mean_std(output->data, size); // #endif - csi_get_top5(output->data, 
size, prob, class); + shl_get_top5(output->data, size, prob, class); printf(" ============ top5: ===========\n"); size = size > 5 ? 5 : size; @@ -218,21 +219,29 @@ void csi_show_top5(struct csi_tensor *output, struct csi_session *sess) } } -int csi_tensor_size(struct csi_tensor *tensor) +int csinn_tensor_size(struct csinn_tensor *tensor) { if (tensor->dim_count == 0) { return 0; } int size = 1; - for (int i = 0; i < tensor->dim_count; i++) { - size *= tensor->dim[i]; + if (tensor->layout == CSINN_LAYOUT_O32I32) { + size = tensor->dim[1] * ((tensor->dim[0] + 31) / 32) * 32; + } else if (tensor->layout == CSINN_LAYOUT_O32HWI32) { + size = tensor->dim[1] * tensor->dim[2] * tensor->dim[3] * ((tensor->dim[0] + 31) / 32) * 32; + } else if (tensor->layout == CSINN_LAYOUT_1HW32O32) { + size = tensor->dim[1] * tensor->dim[2] * ((tensor->dim[3] + 31) / 32) * 32; + } else { + for (int i = 0; i < tensor->dim_count; i++) { + size *= tensor->dim[i]; + } } return size; } -int csi_tensor_byte_size(struct csi_tensor *tensor) +int csinn_tensor_byte_size(struct csinn_tensor *tensor) { - int size = csi_tensor_size(tensor); + int size = csinn_tensor_size(tensor); switch (tensor->dtype) { case CSINN_DTYPE_INT4: /* FIXME: round to byte */ @@ -258,26 +267,27 @@ int csi_tensor_byte_size(struct csi_tensor *tensor) return size; } -struct csi_tensor *csi_alloc_tensor(struct csi_session *session) +struct csinn_tensor *csinn_alloc_tensor(struct csinn_session *session) { - struct csi_tensor *ret = csi_mem_alloc(sizeof(struct csi_tensor)); + struct csinn_tensor *ret = shl_mem_alloc(sizeof(struct csinn_tensor)); if (session != NULL) { ret->dtype = session->base_dtype; ret->layout = session->base_layout; ret->sess = session; } ret->quant_channel = 1; - ret->qinfo = csi_mem_alloc(sizeof(struct csi_quant_info)); + ret->qinfo = shl_mem_alloc(sizeof(struct csinn_quant_info)); return ret; } -void csi_realloc_quant_info(struct csi_tensor *tensor, int quant_info_num) +void csinn_realloc_quant_info(struct 
csinn_tensor *tensor, int quant_info_num) { tensor->quant_channel = quant_info_num; - tensor->qinfo = csi_mem_realloc(tensor->qinfo, quant_info_num * sizeof(struct csi_quant_info)); + tensor->qinfo = + shl_mem_realloc(tensor->qinfo, quant_info_num * sizeof(struct csinn_quant_info)); } -void csi_tensor_copy(struct csi_tensor *dest, struct csi_tensor *src) +void csinn_tensor_copy(struct csinn_tensor *dest, struct csinn_tensor *src) { dest->data = src->data; dest->dtype = src->dtype; @@ -286,61 +296,61 @@ void csi_tensor_copy(struct csi_tensor *dest, struct csi_tensor *src) dest->name = src->name; dest->layout = src->layout; if (src->quant_channel != dest->quant_channel && src->quant_channel != 0) { - csi_realloc_quant_info(dest, src->quant_channel); + csinn_realloc_quant_info(dest, src->quant_channel); } - memcpy(dest->qinfo, src->qinfo, sizeof(struct csi_quant_info) * src->quant_channel); + memcpy(dest->qinfo, src->qinfo, sizeof(struct csinn_quant_info) * src->quant_channel); dest->sess = src->sess; dest->is_const = src->is_const; } -void csi_free_tensor(struct csi_tensor *tensor) +void csinn_free_tensor(struct csinn_tensor *tensor) { if (tensor->qinfo != NULL) { - csi_mem_free(tensor->qinfo); + shl_mem_free(tensor->qinfo); } - csi_mem_free(tensor); + shl_mem_free(tensor); } -void *csi_alloc_params(int params_size, struct csi_session *session) +void *csinn_alloc_params(int params_size, struct csinn_session *session) { - struct csi_params_base *params = csi_mem_alloc(params_size); + struct csinn_params_base *params = shl_mem_alloc(params_size); if (session != NULL) { params->api = session->base_api; params->layout = session->base_layout; - params->run_mode = session->base_run_mode; params->sess = session; } + params->cb = shl_mem_alloc(sizeof(struct csinn_callback)); return params; } -void csi_free_params(void *params) { csi_mem_free(params); } +void csinn_free_params(void *params) { shl_mem_free(params); } -static float csi_int4_to_float_base(int8_t i, struct 
csi_tensor *t, int index) +static float int4_to_float_base(int8_t i, struct csinn_tensor *t, int index) { return ((float)i - t->qinfo[index].zero_point) * t->qinfo[index].scale; } -static float csi_uint8_to_float_base(uint8_t i, struct csi_tensor *t, int index) +static float uint8_to_float_base(uint8_t i, struct csinn_tensor *t, int index) { return ((float)i - t->qinfo[index].zero_point) * t->qinfo[index].scale; } -static float csi_int8_to_float_base(int8_t i, struct csi_tensor *t, int index) +static float int8_to_float_base(int8_t i, struct csinn_tensor *t, int index) { return ((float)i - t->qinfo[index].zero_point) * t->qinfo[index].scale; } -static float csi_int16_to_float_base(int16_t i, struct csi_tensor *t, int index) +static float int16_to_float_base(int16_t i, struct csinn_tensor *t, int index) { return ((float)i - t->qinfo[index].zero_point) * t->qinfo[index].scale; } -static float csi_int32_to_float_base(int32_t i, struct csi_tensor *t, int index) +static float int32_to_float_base(int32_t i, struct csinn_tensor *t, int index) { return (float)i * t->qinfo[index].scale; } -static int8_t csi_float_to_int4_base(float i, struct csi_tensor *t, int index) +static int8_t float_to_int4_base(float i, struct csinn_tensor *t, int index) { float ret = round(i / t->qinfo[index].scale) + t->qinfo[index].zero_point; if (ret > 7) { @@ -352,7 +362,7 @@ static int8_t csi_float_to_int4_base(float i, struct csi_tensor *t, int index) } } -static uint8_t csi_float_to_uint8_base(float i, struct csi_tensor *t, int index) +static uint8_t float_to_uint8_base(float i, struct csinn_tensor *t, int index) { float ret = round(i / t->qinfo[index].scale) + t->qinfo[index].zero_point; if (ret > 255) { @@ -364,7 +374,7 @@ static uint8_t csi_float_to_uint8_base(float i, struct csi_tensor *t, int index) } } -static int8_t csi_float_to_int8_base(float i, struct csi_tensor *t, int index) +static int8_t float_to_int8_base(float i, struct csinn_tensor *t, int index) { float ret = round(i / 
t->qinfo[index].scale) + t->qinfo[index].zero_point; if (ret > 127) { @@ -376,7 +386,7 @@ static int8_t csi_float_to_int8_base(float i, struct csi_tensor *t, int index) } } -static int16_t csi_float_to_int16_base(float i, struct csi_tensor *t, int index) +static int16_t float_to_int16_base(float i, struct csinn_tensor *t, int index) { float ret = round(i / t->qinfo[index].scale) + t->qinfo[index].zero_point; if (ret > 32767) { @@ -388,9 +398,60 @@ static int16_t csi_float_to_int16_base(float i, struct csi_tensor *t, int index) } } +static int16_t float32_to_float16_base(float value) +{ + int16_t ret; + if (value > -6.1e-5 && value < 6.1e-5) { + /* to small for f16, ignore to 0 */ + return 0; + } + if (value > 65504) { + shl_debug_error("too large f32 to f16\n"); + /* saturate to f16 max value: 65504 */ + value = 65504; + } + int32_t org_format = *(int32_t *)&value; + int16_t sign = (org_format & 0x80000000) >> 16; + int16_t frac = (org_format & 0x7fffff) >> 13; + int16_t exp = (((((org_format >> 23) & 0xff) - 128) + 16) & 0x1f) << 10; + ret = sign | frac | exp; + return ret; +} + +static float float16_to_float32_base(int16_t value) +{ + float ret; + if (value == 0 || value == 0x8000) { + return 0; + } + int32_t ret_format = 0; + int32_t sign = (value & 0x8000) << 16; + int32_t frac = (value & 0x3ff) << 13; + int32_t exp = (((((value >> 10) & 0x1f) - 16) + 128) & 0xff) << 23; + ret_format = sign | frac | exp; + ret = *(float *)&ret_format; + return ret; +} + +static int16_t float32_to_bfloat16_base(float value) +{ + int16_t ret; + int32_t org_format = *(int32_t *)&value; + ret = (org_format & 0xffff0000) >> 16; + return ret; +} + +static float bfloat16_to_float32_base(int16_t value) +{ + float ret; + int32_t ret_format = value << 16; + ret = *(float *)&ret_format; + return ret; +} + /* Only for CSINN_LAYOUT_OHWI, HWI's size align */ -static void csi_axis0_int4_to_float_alignHWI(struct csi_tensor *dest, struct csi_tensor *src, - int inner_size) +static void 
axis0_int4_to_float_alignHWI(struct csinn_tensor *dest, struct csinn_tensor *src, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -404,10 +465,10 @@ static void csi_axis0_int4_to_float_alignHWI(struct csi_tensor *dest, struct csi /* int4 little endian */ if (j % 2) { src_tmp = src_data[in_index] & 0xf0; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } else { src_tmp = (src_data[in_index] & 0xf) << 4; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } dest_data[index] = ret; } @@ -415,8 +476,8 @@ static void csi_axis0_int4_to_float_alignHWI(struct csi_tensor *dest, struct csi } /* Only for CSINN_LAYOUT_OHWI, HWI's size align */ -static void csi_axis0_float_to_int4_alignHWI(struct csi_tensor *dest, struct csi_tensor *src, - int inner_size) +static void axis0_float_to_int4_alignHWI(struct csinn_tensor *dest, struct csinn_tensor *src, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -424,7 +485,7 @@ static void csi_axis0_float_to_int4_alignHWI(struct csi_tensor *dest, struct csi for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - int input_val = csi_float_to_int4_base(src_data[index], dest, i); + int input_val = float_to_int4_base(src_data[index], dest, i); int out_index = i * ((inner_size + 1) / 2) + j / 2; /* int4 little endian */ if (j % 2) { @@ -438,8 +499,8 @@ static void csi_axis0_float_to_int4_alignHWI(struct csi_tensor *dest, struct csi } } -static void csi_nchw_int4_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_int4_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -453,18 +514,18 @@ static void csi_nchw_int4_to_float(struct csi_tensor *dest, struct csi_tensor *s /* int4 
little endian */ if (index % 2) { src_tmp = src_data[in_index] & 0xf0; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } else { src_tmp = (src_data[in_index] & 0xf) << 4; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } dest_data[index] = ret; } } } -static void csi_nhwc_int4_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_int4_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -478,18 +539,18 @@ static void csi_nhwc_int4_to_float(struct csi_tensor *dest, struct csi_tensor *s /* int4 little endian */ if (index % 2) { src_tmp = src_data[in_index] & 0xf0; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } else { src_tmp = (src_data[in_index] & 0xf) << 4; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } dest_data[index] = ret; } } } -static void csi_nchw_float_to_int4(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_float_to_int4(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -497,7 +558,7 @@ static void csi_nchw_float_to_int4(struct csi_tensor *dest, struct csi_tensor *s for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - int input_val = csi_float_to_int4_base(src_data[index], dest, i); + int input_val = float_to_int4_base(src_data[index], dest, i); int out_index = index / 2; /* int4 little endian */ if (index % 2) { @@ -509,8 +570,8 @@ static void csi_nchw_float_to_int4(struct csi_tensor *dest, struct csi_tensor *s } } -static void csi_nhwc_float_to_int4(struct csi_tensor 
*dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_float_to_int4(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -518,7 +579,7 @@ static void csi_nhwc_float_to_int4(struct csi_tensor *dest, struct csi_tensor *s for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - int input_val = csi_float_to_int4_base(src_data[index], dest, i); + int input_val = float_to_int4_base(src_data[index], dest, i); int out_index = index / 2; /* int4 little endian */ if (index % 2) { @@ -530,8 +591,8 @@ static void csi_nhwc_float_to_int4(struct csi_tensor *dest, struct csi_tensor *s } } -static void csi_nchw_uint8_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_uint8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { uint8_t *src_data = src->data; float *dest_data = dest->data; @@ -539,13 +600,13 @@ static void csi_nchw_uint8_to_float(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_uint8_to_float_base(src_data[index], src, i); + dest_data[index] = uint8_to_float_base(src_data[index], src, i); } } } -static void csi_nhwc_uint8_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_uint8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { uint8_t *src_data = src->data; float *dest_data = dest->data; @@ -553,13 +614,13 @@ static void csi_nhwc_uint8_to_float(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = 
csi_uint8_to_float_base(src_data[index], src, i); + dest_data[index] = uint8_to_float_base(src_data[index], src, i); } } } -static void csi_nchw_float_to_uint8(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_float_to_uint8(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; uint8_t *dest_data = dest->data; @@ -567,12 +628,12 @@ static void csi_nchw_float_to_uint8(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_float_to_uint8_base(src_data[index], dest, i); + dest_data[index] = float_to_uint8_base(src_data[index], dest, i); } } } -static void csi_nhwc_float_to_uint8(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_float_to_uint8(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; uint8_t *dest_data = dest->data; @@ -580,13 +641,13 @@ static void csi_nhwc_float_to_uint8(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = csi_float_to_uint8_base(src_data[index], dest, i); + dest_data[index] = float_to_uint8_base(src_data[index], dest, i); } } } -static void csi_nchw_int8_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_int8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -594,12 +655,12 @@ static void csi_nchw_int8_to_float(struct csi_tensor *dest, struct csi_tensor *s for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = 
csi_int8_to_float_base(src_data[index], src, i); + dest_data[index] = int8_to_float_base(src_data[index], src, i); } } } -static void csi_nhwc_int8_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_int8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -607,13 +668,13 @@ static void csi_nhwc_int8_to_float(struct csi_tensor *dest, struct csi_tensor *s for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = csi_int8_to_float_base(src_data[index], src, i); + dest_data[index] = int8_to_float_base(src_data[index], src, i); } } } -static void csi_nchw_float_to_int8(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_float_to_int8(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -621,13 +682,13 @@ static void csi_nchw_float_to_int8(struct csi_tensor *dest, struct csi_tensor *s for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_float_to_int8_base(src_data[index], dest, i); + dest_data[index] = float_to_int8_base(src_data[index], dest, i); } } } -static void csi_nhwc_float_to_int8(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_float_to_int8(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -635,13 +696,13 @@ static void csi_nhwc_float_to_int8(struct csi_tensor *dest, struct csi_tensor *s for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = 
csi_float_to_int8_base(src_data[index], dest, i); + dest_data[index] = float_to_int8_base(src_data[index], dest, i); } } } -static void csi_nchw_int16_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_int16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int16_t *src_data = src->data; float *dest_data = dest->data; @@ -649,13 +710,13 @@ static void csi_nchw_int16_to_float(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_int16_to_float_base(src_data[index], src, i); + dest_data[index] = int16_to_float_base(src_data[index], src, i); } } } -static void csi_nhwc_int16_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_int16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int16_t *src_data = src->data; float *dest_data = dest->data; @@ -663,13 +724,13 @@ static void csi_nhwc_int16_to_float(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = csi_int16_to_float_base(src_data[index], src, i); + dest_data[index] = int16_to_float_base(src_data[index], src, i); } } } -static void csi_nchw_float_to_int16(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_float_to_int16(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int16_t *dest_data = dest->data; @@ -677,13 +738,13 @@ static void csi_nchw_float_to_int16(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = 
csi_float_to_int16_base(src_data[index], dest, i); + dest_data[index] = float_to_int16_base(src_data[index], dest, i); } } } -static void csi_nhwc_float_to_int16(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_float_to_int16(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int16_t *dest_data = dest->data; @@ -691,13 +752,13 @@ static void csi_nhwc_float_to_int16(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = csi_float_to_int16_base(src_data[index], dest, i); + dest_data[index] = float_to_int16_base(src_data[index], dest, i); } } } -static void csi_nchw_int32_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_int32_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int32_t *src_data = src->data; float *dest_data = dest->data; @@ -705,13 +766,13 @@ static void csi_nchw_int32_to_float(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_int32_to_float_base(src_data[index], src, i); + dest_data[index] = int32_to_float_base(src_data[index], src, i); } } } -static void csi_nhwc_int32_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_int32_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int32_t *src_data = src->data; float *dest_data = dest->data; @@ -719,54 +780,54 @@ static void csi_nhwc_int32_to_float(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = 
csi_int32_to_float_base(src_data[index], src, i); + dest_data[index] = int32_to_float_base(src_data[index], src, i); } } } -static void csi_f16_to_float(struct csi_tensor *dest, struct csi_tensor *src) +static void csinn_f16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) { int16_t *src_data = src->data; float *dest_data = dest->data; - int32_t size = csi_tensor_size(src); + int32_t size = csinn_tensor_size(src); for (int j = 0; j < size; j++) { - dest_data[j] = csi_ref_float16_to_float32(src_data[j]); + dest_data[j] = float16_to_float32_base(src_data[j]); } } -static void csi_float_to_f16(struct csi_tensor *dest, struct csi_tensor *src) +static void csinn_float_to_f16(struct csinn_tensor *dest, struct csinn_tensor *src) { float *src_data = src->data; int16_t *dest_data = dest->data; - int32_t size = csi_tensor_size(src); + int32_t size = csinn_tensor_size(src); for (int i = 0; i < size; i++) { - dest_data[i] = csi_ref_float32_to_float16(src_data[i]); + dest_data[i] = float32_to_float16_base(src_data[i]); } } -static void csi_bf16_to_float(struct csi_tensor *dest, struct csi_tensor *src) +static void bf16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) { int16_t *src_data = src->data; float *dest_data = dest->data; - int32_t size = csi_tensor_size(src); + int32_t size = csinn_tensor_size(src); for (int j = 0; j < size; j++) { - dest_data[j] = csi_ref_bfloat16_to_float32(src_data[j]); + dest_data[j] = bfloat16_to_float32_base(src_data[j]); } } -static void csi_float_to_bf16(struct csi_tensor *dest, struct csi_tensor *src) +static void float_to_bf16(struct csinn_tensor *dest, struct csinn_tensor *src) { float *src_data = src->data; int16_t *dest_data = dest->data; - int32_t size = csi_tensor_size(src); + int32_t size = csinn_tensor_size(src); for (int i = 0; i < size; i++) { - dest_data[i] = csi_ref_float32_to_bfloat16(src_data[i]); + dest_data[i] = float32_to_bfloat16_base(src_data[i]); } } -int csi_tensor_data_convert_weight(struct 
csi_tensor *dest, struct csi_tensor *src) +static int tensor_data_convert_weight(struct csinn_tensor *dest, struct csinn_tensor *src) { - int size = csi_tensor_size(src); + int size = csinn_tensor_size(src); int inner_size = src->quant_channel == 0 ? size : size / src->quant_channel; if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT4) { switch (src->layout) { @@ -778,13 +839,13 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_int4_to_float(dest, src, 0, inner_size); + nchw_int4_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_OHWI: - csi_axis0_int4_to_float_alignHWI(dest, src, inner_size); + axis0_int4_to_float_alignHWI(dest, src, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_int4_to_float(dest, src, 0, inner_size); + nhwc_int4_to_float(dest, src, 0, inner_size); break; default: break; @@ -799,12 +860,12 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_float_to_int4(dest, src, 0, inner_size); + nchw_float_to_int4(dest, src, 0, inner_size); break; case CSINN_LAYOUT_OHWI: - csi_axis0_float_to_int4_alignHWI(dest, src, inner_size); + axis0_float_to_int4_alignHWI(dest, src, inner_size); case CSINN_LAYOUT_1HWO: - csi_nhwc_float_to_int4(dest, src, 0, inner_size); + nhwc_float_to_int4(dest, src, 0, inner_size); break; default: break; @@ -820,10 +881,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_uint8_to_float(dest, src, 0, inner_size); + nchw_uint8_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_uint8_to_float(dest, src, 0, inner_size); + nhwc_uint8_to_float(dest, src, 0, inner_size); break; default: break; @@ -839,10 +900,10 @@ int 
csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_float_to_uint8(dest, src, 0, inner_size); + nchw_float_to_uint8(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_float_to_uint8(dest, src, 0, inner_size); + nhwc_float_to_uint8(dest, src, 0, inner_size); break; default: break; @@ -858,10 +919,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_int8_to_float(dest, src, 0, inner_size); + nchw_int8_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_int8_to_float(dest, src, 0, inner_size); + nhwc_int8_to_float(dest, src, 0, inner_size); break; default: break; @@ -877,10 +938,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_float_to_int8(dest, src, 0, inner_size); + nchw_float_to_int8(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_float_to_int8(dest, src, 0, inner_size); + nhwc_float_to_int8(dest, src, 0, inner_size); break; default: break; @@ -896,10 +957,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_int16_to_float(dest, src, 0, inner_size); + nchw_int16_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_int16_to_float(dest, src, 0, inner_size); + nhwc_int16_to_float(dest, src, 0, inner_size); break; default: break; @@ -915,10 +976,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_float_to_int16(dest, src, 0, inner_size); + nchw_float_to_int16(dest, src, 0, inner_size); break; case 
CSINN_LAYOUT_1HWO: - csi_nhwc_float_to_int16(dest, src, 0, inner_size); + nhwc_float_to_int16(dest, src, 0, inner_size); break; default: break; @@ -934,33 +995,33 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_int32_to_float(dest, src, 0, inner_size); + nchw_int32_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_int32_to_float(dest, src, 0, inner_size); + nhwc_int32_to_float(dest, src, 0, inner_size); break; default: break; } } else if (dest->dtype == CSINN_DTYPE_FLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csi_float_to_f16(dest, src); + csinn_float_to_f16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_FLOAT16) { - csi_f16_to_float(dest, src); + csinn_f16_to_float(dest, src); } else if (dest->dtype == CSINN_DTYPE_BFLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csi_float_to_bf16(dest, src); + float_to_bf16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_BFLOAT16) { - csi_bf16_to_float(dest, src); + bf16_to_float(dest, src); } else if (dest->dtype == src->dtype) { - memcpy(dest->data, src->data, csi_tensor_byte_size(src)); + memcpy(dest->data, src->data, csinn_tensor_byte_size(src)); } else { return CSINN_FALSE; } return CSINN_TRUE; } -int csi_tensor_data_convert_activation(struct csi_tensor *dest, struct csi_tensor *src) +int tensor_data_convert_activation(struct csinn_tensor *dest, struct csinn_tensor *src) { - int size = csi_tensor_size(src); + int size = csinn_tensor_size(src); int32_t q_size = src->quant_channel != 0 ? 
src->quant_channel : dest->quant_channel; if (q_size == 0) { q_size = 1; @@ -969,92 +1030,92 @@ int csi_tensor_data_convert_activation(struct csi_tensor *dest, struct csi_tenso if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT4) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_int4_to_float(dest, src, n, inner_size); + nchw_int4_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_int4_to_float(dest, src, n, inner_size); + nhwc_int4_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_INT4 && src->dtype == CSINN_DTYPE_FLOAT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_float_to_int4(dest, src, n, inner_size); + nchw_float_to_int4(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_float_to_int4(dest, src, n, inner_size); + nhwc_float_to_int4(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_UINT8) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_uint8_to_float(dest, src, n, inner_size); + nchw_uint8_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_uint8_to_float(dest, src, n, inner_size); + nhwc_uint8_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_UINT8 && src->dtype == CSINN_DTYPE_FLOAT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_float_to_uint8(dest, src, n, inner_size); + nchw_float_to_uint8(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= 
CSINN_LAYOUT_NDHWC) { - csi_nhwc_float_to_uint8(dest, src, n, inner_size); + nhwc_float_to_uint8(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT8) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_int8_to_float(dest, src, n, inner_size); + nchw_int8_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_int8_to_float(dest, src, n, inner_size); + nhwc_int8_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_INT8 && src->dtype == CSINN_DTYPE_FLOAT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_float_to_int8(dest, src, n, inner_size); + nchw_float_to_int8(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_float_to_int8(dest, src, n, inner_size); + nhwc_float_to_int8(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT16) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_int16_to_float(dest, src, n, inner_size); + nchw_int16_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_int16_to_float(dest, src, n, inner_size); + nhwc_int16_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_INT16 && src->dtype == CSINN_DTYPE_FLOAT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_float_to_int16(dest, src, n, inner_size); + nchw_float_to_int16(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_float_to_int16(dest, 
src, n, inner_size); + nhwc_float_to_int16(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_int32_to_float(dest, src, n, inner_size); + nchw_int32_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_int32_to_float(dest, src, n, inner_size); + nhwc_int32_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csi_float_to_f16(dest, src); + csinn_float_to_f16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_FLOAT16) { - csi_f16_to_float(dest, src); + csinn_f16_to_float(dest, src); } else if (dest->dtype == CSINN_DTYPE_BFLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csi_float_to_bf16(dest, src); + float_to_bf16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_BFLOAT16) { - csi_bf16_to_float(dest, src); + bf16_to_float(dest, src); } else if (dest->dtype == src->dtype) { - memcpy(dest->data, src->data, csi_tensor_byte_size(src)); + memcpy(dest->data, src->data, csinn_tensor_byte_size(src)); } else { return CSINN_FALSE; } return CSINN_TRUE; } -int csi_tensor_data_convert(struct csi_tensor *dest, struct csi_tensor *src) +int csinn_tensor_data_convert(struct csinn_tensor *dest, struct csinn_tensor *src) { if (src->layout != dest->layout) return CSINN_FALSE; @@ -1069,7 +1130,7 @@ int csi_tensor_data_convert(struct csi_tensor *dest, struct csi_tensor *src) case CSINN_LAYOUT_NWC: case CSINN_LAYOUT_NCDHW: case CSINN_LAYOUT_NDHWC: - return csi_tensor_data_convert_activation(dest, src); + return tensor_data_convert_activation(dest, src); case CSINN_LAYOUT_O: case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: @@ -1080,28 +1141,220 @@ int csi_tensor_data_convert(struct 
csi_tensor *dest, struct csi_tensor *src) case CSINN_LAYOUT_ODHWI: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_1HWO: - return csi_tensor_data_convert_weight(dest, src); + return tensor_data_convert_weight(dest, src); default: return CSINN_FALSE; } } -#ifdef CSI_BUILD_RTOS -uint64_t csi_get_timespec() { return 0; } +static int layout_1HWO_to_1HW32O32(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + if (src->dtype != CSINN_DTYPE_INT8 && src->dtype != CSINN_DTYPE_UINT8) { + return CSINN_FALSE; + } + int a_len = 32; + int b_len = a_len * src->dim[1] * src->dim[2]; + + void *src_addr = src->data; + void *dest_addr = dest->data; + /* read in src order, write stride */ + for (int i = 0; i < src->dim[1] * src->dim[2]; i++) { + for (int j = 0; j < src->dim[3] / a_len; j++) { + dest_addr = dest->data + j * b_len + i * a_len; + memcpy(dest_addr, src_addr, a_len); + src_addr += a_len; + } + if (src->dim[3] % a_len) { + dest_addr = dest->data + (src->dim[3] / a_len) * b_len + i * a_len; + memcpy(dest_addr, src_addr, src->dim[3] % a_len); + src_addr += src->dim[3] % a_len; + } + } + return CSINN_TRUE; +} + +static int layout_OI_to_O32I32(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + if (src->dtype != CSINN_DTYPE_INT8 && src->dtype != CSINN_DTYPE_UINT8) { + return CSINN_FALSE; + } + int a_len = 32; + + int8_t *src_addr = src->data; + int8_t *dest_addr = dest->data; + int src_idx = 0; + int idx_base = 0; + int dest_idx = 0; + /* read src stride, write in order */ + for (int i = 0; i < src->dim[0] / a_len; i++) { + idx_base = i * a_len * src->dim[1]; + dest_idx = idx_base; + for (int j = 0; j < src->dim[1]; j++) { + for (int k = 0; k < a_len; k++) { + src_idx = idx_base + k * src->dim[1] + j; + dest_addr[dest_idx] = src_addr[src_idx]; + dest_idx++; + } + } + } + idx_base = (src->dim[0] / a_len) * a_len * src->dim[1]; + dest_idx = idx_base; + for (int j = 0; j < src->dim[1]; j++) { + for (int k = 0; k < src->dim[0] % a_len; k++) { + src_idx = idx_base + k * 
src->dim[1] + j; + dest_idx = idx_base + k + a_len * j; + dest_addr[dest_idx] = src_addr[src_idx]; + } + } +} + +static int layout_OHWI_to_O32HWI32(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + if (src->dtype != CSINN_DTYPE_INT8 && src->dtype != CSINN_DTYPE_UINT8) { + return CSINN_FALSE; + } + int a_len = 32; + int b_len = src->dim[1] * src->dim[2] * src->dim[3]; + + int8_t *src_addr = src->data; + int8_t *dest_addr = dest->data; + int src_idx = 0; + int idx_base = 0; + int dest_idx = 0; + /* read src stride, write in order */ + for (int i = 0; i < src->dim[0] / a_len; i++) { + idx_base = i * a_len * b_len; + dest_idx = idx_base; + for (int j = 0; j < b_len; j++) { + for (int k = 0; k < a_len; k++) { + src_idx = idx_base + k * b_len + j; + dest_addr[dest_idx] = src_addr[src_idx]; + dest_idx++; + } + } + } + idx_base = (src->dim[0] / a_len) * a_len * b_len; + dest_idx = idx_base; + for (int j = 0; j < b_len; j++) { + for (int k = 0; k < src->dim[0] % a_len; k++) { + src_idx = idx_base + k * b_len + j; + dest_idx = idx_base + k + a_len * j; + dest_addr[dest_idx] = src_addr[src_idx]; + } + } +} + +int csinn_tensor_layout_convert(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + int ret = CSINN_FALSE; + if (src->layout == CSINN_LAYOUT_1HWO && dest->layout == CSINN_LAYOUT_1HW32O32) { + ret = layout_1HWO_to_1HW32O32(dest, src); + } else if (src->layout == CSINN_LAYOUT_OI && dest->layout == CSINN_LAYOUT_O32I32) { + ret = layout_OI_to_O32I32(dest, src); + } else if (src->layout == CSINN_LAYOUT_OHWI && dest->layout == CSINN_LAYOUT_O32HWI32) { + ret = layout_OHWI_to_O32HWI32(dest, src); + } + + return ret; +} + +enum csinn_rmode_enum shl_get_run_mode(struct csinn_params_base *base) +{ + if (base->sess == NULL) { + return CSINN_RM_LAYER; + } else { + return base->sess->base_run_mode; + } +} + +struct shl_cb_op_list *shl_cb_list_end(struct shl_cb_op_list *list) +{ + struct shl_cb_op_list *l = list; + while (l->next) { + l = l->next; + } + return l; +} -void 
csi_print_time_interval(uint64_t start, uint64_t end, const char *msg) { return; } +struct csinn_callback *shl_cb_list_match(struct shl_cb_op_list *list, enum csinn_dtype_enum dtype, + enum csinn_op_enum op_name) +{ + struct csinn_callback *ret = NULL; + struct shl_cb_op_list *l = list; + while (l) { + if (l->dtype == dtype && l->op_name == op_name) { + ret = l->cb; + break; + } + l = l->next; + } + return ret; +} + +void *shl_get_init_cb(struct csinn_params_base *base) +{ + struct csinn_callback *cb = base->cb; + if (base->sess && ((base->sess->base_run_mode == CSINN_RM_CPU_GRAPH) || + (base->sess->base_run_mode == CSINN_RM_NPU_GRAPH))) { + return NULL; + } + if (cb->init) { + return cb->init; + } + + return NULL; +} + +/* establish graph or compute directly, get higher priority one */ +void *shl_get_p0_cb(struct csinn_params_base *base) +{ + struct csinn_callback *cb = base->cb; + if ((cb->est == NULL) && (cb->exec == NULL)) { + shl_debug_error("OP have not register\n"); + } + if (base->sess->base_run_mode == CSINN_RM_LAYER) { + if (cb->exec) { + return cb->exec; + } + } else { + if (cb->est) { + return cb->est; + } + if (cb->exec) { + return cb->exec; + } + } + + return NULL; +} + +#ifdef SHL_BUILD_RTOS +uint64_t shl_get_timespec() { return 0; } + +void shl_print_time_interval(uint64_t start, uint64_t end, const char *msg) { return; } #else #define BILLION 1000000000 -uint64_t csi_get_timespec() +uint64_t shl_get_timespec() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (uint64_t)((uint64_t)ts.tv_nsec + (uint64_t)ts.tv_sec * BILLION); } -void csi_print_time_interval(uint64_t start, uint64_t end, const char *msg) +void shl_print_time_interval(uint64_t start, uint64_t end, const char *msg) { printf("Run %s time: %.5fms, FPS=%.2f\n", msg, ((double)(end - start)) / 1000000, 1000000000.0 / ((double)(end - start))); } #endif + +int csinn_version(char *vstr) +{ + int major = VERSION_MAJOR; + int minor = VERSION_MINOR; + int patch = VERSION_PATCH; + 
if (vstr) { + sprintf(vstr, "%d.%d.%d", major, minor, patch); + } + return (major << (VERSION_SHIFT * 2)) | (minor << VERSION_SHIFT) | patch; +} diff --git a/source/nn2/where.c b/source/nn2/where.c index 8d234a54..06622406 100644 --- a/source/nn2/where.c +++ b/source/nn2/where.c @@ -16,28 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_where_init(struct csi_tensor *condition, - struct csi_tensor *x, - struct csi_tensor *y, - struct csi_tensor *output, - struct where_params *params) +int csinn_where_init(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params) { return CSINN_FALSE; } -int csi_where(struct csi_tensor *condition, - struct csi_tensor *x, - struct csi_tensor *y, - struct csi_tensor *output, - struct where_params *params) +int csinn_where(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params) { - CSI_DEBUG_CALL(csi_where_debug_info(condition, x, y, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(condition, x, y, output, params); + SHL_DEBUG_CALL(shl_where_debug_info(condition, x, y, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(condition, x, y, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/xor.c b/source/nn2/xor.c index 4a2a4e5c..a1dc54f8 100644 --- a/source/nn2/xor.c +++ b/source/nn2/xor.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_xor_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_xor_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_XOR, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_XOR, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_xor(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/yuv_rgb_scale.c b/source/nn2/yuv_rgb_scale.c index fb5180ce..a9058102 100644 --- a/source/nn2/yuv_rgb_scale.c +++ b/source/nn2/yuv_rgb_scale.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_yuv_rgb_scale_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_yuv_rgb_scale_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_YUV_RGB_SCALE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_YUV_RGB_SCALE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_yuv_rgb_scale(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_yuv_rgb_scale(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/reference/abs.c b/source/reference/abs.c index 15924b4d..318c2f9c 100644 --- a/source/reference/abs.c +++ b/source/reference/abs.c @@ -16,15 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_abs_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = fabs(input_data[i]); @@ -32,8 +33,8 @@ int csi_ref_abs_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_abs_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_abs_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_abs_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_abs_f32); } diff --git a/source/reference/acos.c b/source/reference/acos.c index 1267fb6f..4f5b995d 100644 --- a/source/reference/acos.c +++ b/source/reference/acos.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_acos_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_acos_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = acos(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_acos_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_acos_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_acos_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_acos_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_acos_f32); } \ No newline at end of file diff --git a/source/reference/acosh.c b/source/reference/acosh.c index 2d77e373..a4ef3474 100644 --- a/source/reference/acosh.c +++ b/source/reference/acosh.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_acosh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_acosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = acosh(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_acosh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_acosh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_acosh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_acosh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_acosh_f32); } diff --git a/source/reference/add.c b/source/reference/add.c index 6c6d7ac3..b74f8e5e 100644 --- a/source/reference/add.c +++ b/source/reference/add.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" static void element_add_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = src0[output_idx] + src1[input_idx]; } -int csi_ref_add_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_add_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_add_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_add_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_add_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_add_f32); } diff --git a/source/reference/and.c b/source/reference/and.c index afa942d2..d6cd94b9 100644 --- a/source/reference/and.c +++ b/source/reference/and.c @@ -16,17 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_and_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_and_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint32_t *input0_data = input0->data; uint32_t *input1_data = input1->data; uint32_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] & input1_data[i]; @@ -34,13 +34,13 @@ int csi_ref_and_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_and_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_and_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] & input1_data[i]; @@ -48,13 +48,13 @@ int csi_ref_and_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_and_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_and_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = input0->data; int8_t *input1_data = input1->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] & input1_data[i]; diff 
--git a/source/reference/arange.c b/source/reference/arange.c index c26c5ca7..fe8de9c5 100644 --- a/source/reference/arange.c +++ b/source/reference/arange.c @@ -16,11 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_arange_f32(struct csi_tensor *output, struct arange_params *params) +int shl_ref_arange_f32(struct csinn_tensor *output, struct csinn_arange_params *params) { float *data = output->data; int j = 0; @@ -41,26 +41,26 @@ int csi_ref_arange_f32(struct csi_tensor *output, struct arange_params *params) return CSINN_TRUE; } -int csi_ref_arange_quant(struct csi_tensor *output, struct arange_params *params) +int shl_ref_arange_quant(struct csinn_tensor *output, struct csinn_arange_params *params) { - struct csi_quant_info qinfo; + struct csinn_quant_info qinfo; qinfo.zero_point = 0; qinfo.multiplier = params->start_multiplier; qinfo.shift = params->start_shift; - float start = csi_ref_dequantize_u8_to_f32(1.0, &qinfo); + float start = shl_ref_dequantize_u8_to_f32(1.0, &qinfo); qinfo.zero_point = 0; qinfo.multiplier = params->stop_multiplier; qinfo.shift = params->stop_shift; - float stop = csi_ref_dequantize_u8_to_f32(1.0, &qinfo); + float stop = shl_ref_dequantize_u8_to_f32(1.0, &qinfo); qinfo.zero_point = 0; qinfo.multiplier = params->step_multiplier; qinfo.shift = params->step_shift; - float step = csi_ref_dequantize_u8_to_f32(1.0, &qinfo); + float step = shl_ref_dequantize_u8_to_f32(1.0, &qinfo); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - csi_ref_arange_f32(foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + shl_ref_arange_f32(foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(foutput); return CSINN_TRUE; } diff --git 
a/source/reference/argmax.c b/source/reference/argmax.c index 978f701e..7ece8e60 100644 --- a/source/reference/argmax.c +++ b/source/reference/argmax.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" struct ArgPos { float value; @@ -33,8 +33,8 @@ static struct ArgPos fargmax_stride(struct ArgPos lhs, struct ArgPos rhs) return lhs; } -int csi_ref_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_argmax_stride_i32_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; int32_t *output_data = output->data; @@ -53,10 +53,10 @@ int csi_ref_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *o for (int32_t out = 0; out < out_size; out++) { struct ArgPos result = {-FLT_MAX, -1}; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; struct ArgPos pos = {val, inner}; @@ -68,12 +68,12 @@ int csi_ref_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *o return CSINN_TRUE; } -int csi_ref_argmax_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_argmax_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - ret = csi_ref_argmax_stride_i32_f32(finput, output, params); - 
csi_ref_tensor_transform_free_f32(finput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + ret = shl_ref_argmax_stride_i32_f32(finput, output, params); + shl_ref_tensor_transform_free_f32(finput); return ret; } diff --git a/source/reference/argmin.c b/source/reference/argmin.c index 57158877..37bcbb01 100644 --- a/source/reference/argmin.c +++ b/source/reference/argmin.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" struct ArgPos { float value; @@ -33,8 +33,8 @@ static struct ArgPos fargmin_stride(struct ArgPos lhs, struct ArgPos rhs) return lhs; } -int csi_ref_argmin_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_argmin_stride_i32_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; int32_t *output_data = output->data; @@ -53,10 +53,10 @@ int csi_ref_argmin_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *o for (int32_t out = 0; out < out_size; out++) { struct ArgPos result = {FLT_MAX, -1}; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; struct ArgPos pos = {val, inner}; @@ -68,12 +68,12 @@ int csi_ref_argmin_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *o return CSINN_TRUE; } -int csi_ref_argmin_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_argmin_stride_quant(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - ret = csi_ref_argmin_stride_i32_f32(finput, output, params); - csi_ref_tensor_transform_free_f32(finput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + ret = shl_ref_argmin_stride_i32_f32(finput, output, params); + shl_ref_tensor_transform_free_f32(finput); return ret; } \ No newline at end of file diff --git a/source/reference/asin.c b/source/reference/asin.c index bbb5dfd0..00e7e946 100644 --- a/source/reference/asin.c +++ b/source/reference/asin.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_asin_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_asin_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = asin(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_asin_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_asin_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_asin_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_asin_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_asin_f32); } diff --git a/source/reference/asinh.c b/source/reference/asinh.c index 58216b35..2ee55c1f 100644 --- a/source/reference/asinh.c +++ b/source/reference/asinh.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_asinh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_asinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = asinh(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_asinh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_asinh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_asinh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_asinh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_asinh_f32); } diff --git a/source/reference/atan.c b/source/reference/atan.c index 020aacef..aeb90ad9 100644 --- a/source/reference/atan.c +++ b/source/reference/atan.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_atan_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_atan_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = atan(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_atan_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_atan_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_atan_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_atan_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_atan_f32); } diff --git a/source/reference/atanh.c b/source/reference/atanh.c index 0935afc4..2283a7df 100644 --- a/source/reference/atanh.c +++ b/source/reference/atanh.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_atanh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_atanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = atanh(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_atanh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_atanh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_atanh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_atanh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_atanh_f32); } diff --git a/source/reference/averagepool.c b/source/reference/averagepool.c index cbcd8bc7..674e966d 100644 --- a/source/reference/averagepool.c +++ b/source/reference/averagepool.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_avgpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -40,19 +40,19 @@ int csi_ref_avgpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *outp const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float total = 0.f; float filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - total += input_data[csi_ref_get_index(input->dim, batch, in_y, in_x, + total += input_data[shl_ref_get_index(input->dim, batch, in_y, in_x, channel)]; filter_count++; } @@ -61,7 +61,7 @@ int csi_ref_avgpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *outp filter_count = params->filter_height * params->filter_width; } const float average = total 
/ filter_count; - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, channel)] = + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, channel)] = average; } } @@ -70,8 +70,8 @@ int csi_ref_avgpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -static int csi_ref_avgpool2d_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_avgpool2d_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -90,19 +90,19 @@ static int csi_ref_avgpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float total = 0.f; float filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - total += input_data[csi_ref_get_index(input->dim, batch, channel, in_y, + total += input_data[shl_ref_get_index(input->dim, batch, channel, in_y, 
in_x)]; filter_count++; } @@ -111,7 +111,7 @@ static int csi_ref_avgpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso filter_count = params->filter_height * params->filter_width; } const float average = total / filter_count; - output_data[csi_ref_get_index(output->dim, batch, channel, out_y, out_x)] = + output_data[shl_ref_get_index(output->dim, batch, channel, out_y, out_x)] = average; } } @@ -120,20 +120,20 @@ static int csi_ref_avgpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_avgpool2d_nchw_f32(input, output, params); + shl_ref_avgpool2d_nchw_f32(input, output, params); } else if (params->base.layout = CSINN_LAYOUT_NHWC) { - csi_ref_avgpool2d_nhwc_f32(input, output, params); + shl_ref_avgpool2d_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_avgpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_avgpool2d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_avgpool2d_f32); } diff --git a/source/reference/averagepool3d.c b/source/reference/averagepool3d.c index e7d879d9..dd29d219 100644 --- a/source/reference/averagepool3d.c +++ b/source/reference/averagepool3d.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -44,15 +44,15 @@ int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, const int in_h_origin = (out_h * params->stride_height) - params->pad_top; const int in_w_origin = (out_w * params->stride_width) - params->pad_left; - const int filter_d_begin = csi_ref_max_internal_s32(0, -in_d_origin); + const int filter_d_begin = shl_ref_max_internal_s32(0, -in_d_origin); const int filter_d_end = - csi_ref_min_internal_s32(params->filter_depth, in_depth - in_d_origin); - const int filter_h_begin = csi_ref_max_internal_s32(0, -in_h_origin); - const int filter_h_end = csi_ref_min_internal_s32(params->filter_height, + shl_ref_min_internal_s32(params->filter_depth, in_depth - in_d_origin); + const int filter_h_begin = shl_ref_max_internal_s32(0, -in_h_origin); + const int filter_h_end = shl_ref_min_internal_s32(params->filter_height, in_height - in_h_origin); - const int filter_w_begin = csi_ref_max_internal_s32(0, -in_w_origin); + const int filter_w_begin = shl_ref_max_internal_s32(0, -in_w_origin); const int filter_w_end = - csi_ref_min_internal_s32(params->filter_width, in_width - in_w_origin); + shl_ref_min_internal_s32(params->filter_width, in_width - in_w_origin); float total = 0.0f; int filter_cnt = 0; @@ -64,7 +64,7 @@ int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, int in_d = in_d_origin + filter_d; int in_h = in_h_origin + filter_h; int in_w = in_w_origin + filter_w; - total += input_data[csi_ref_get_index_5( + total += input_data[shl_ref_get_index_5( input->dim, in_ch, out_ch, in_d, in_h, 
in_w)]; filter_cnt++; } @@ -76,7 +76,7 @@ int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, } // float average = filter_cnt==0 ? total : total/filter_cnt; float average = total / filter_cnt; - output_data[csi_ref_get_index_5(output->dim, in_ch, out_ch, out_d, out_h, + output_data[shl_ref_get_index_5(output->dim, in_ch, out_ch, out_d, out_h, out_w)] = average; } } @@ -87,8 +87,8 @@ int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_avgpool3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_avgpool3d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_avgpool3d_f32); } diff --git a/source/reference/batch_normalization.c b/source/reference/batch_normalization.c index 7fcd63ed..4e37eecc 100644 --- a/source/reference/batch_normalization.c +++ b/source/reference/batch_normalization.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" /* https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/nn_impl.py#L1474-L1542 */ -int csi_ref_batch_normalization_f32(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params) +int shl_ref_batch_normalization_f32(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { float *input_data = input->data; float *mean_data = mean->data; @@ -62,25 +62,25 @@ int csi_ref_batch_normalization_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_batch_normalization_quant(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params) +int shl_ref_batch_normalization_quant(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *fmean = csi_ref_tensor_transform_f32(mean); - struct csi_tensor *fvariance = csi_ref_tensor_transform_f32(variance); - struct csi_tensor *fgamma = csi_ref_tensor_transform_f32(gamma); - struct csi_tensor *fbeta = csi_ref_tensor_transform_f32(beta); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_batch_normalization_f32(finput, fmean, fvariance, fgamma, fbeta, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(fmean); - 
csi_ref_tensor_transform_free_f32(fvariance); - csi_ref_tensor_transform_free_f32(fgamma); - csi_ref_tensor_transform_free_f32(fbeta); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *fmean = shl_ref_tensor_transform_f32(mean); + struct csinn_tensor *fvariance = shl_ref_tensor_transform_f32(variance); + struct csinn_tensor *fgamma = shl_ref_tensor_transform_f32(gamma); + struct csinn_tensor *fbeta = shl_ref_tensor_transform_f32(beta); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_batch_normalization_f32(finput, fmean, fvariance, fgamma, fbeta, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(fmean); + shl_ref_tensor_transform_free_f32(fvariance); + shl_ref_tensor_transform_free_f32(fgamma); + shl_ref_tensor_transform_free_f32(fbeta); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/batch_to_space.c b/source/reference/batch_to_space.c index aded7dcf..f644f886 100644 --- a/source/reference/batch_to_space.c +++ b/source/reference/batch_to_space.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" // the input->data is a 4-D Tensor with shape [batch, depth, height, width]. 
-int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params) +int shl_ref_batch_to_space_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -46,8 +46,8 @@ int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *outp for (int in_h = 0; in_h < in_height; ++in_h) { for (int in_w = 0; in_w < in_width; ++in_w) { for (int out_c = 0; out_c < out_channel; ++out_c) { - float *temp = (float *)csi_mem_alloc(block_size2 * sizeof(float)); - int in_start_addr = csi_ref_get_index(input->dim, out_b, out_c, in_h, in_w); + float *temp = (float *)shl_mem_alloc(block_size2 * sizeof(float)); + int in_start_addr = shl_ref_get_index(input->dim, out_b, out_c, in_h, in_w); for (int i = 0; i < block_size2; ++i) { temp[i] = input_data[in_start_addr + i * out_batch * out_channel * in_height * in_width]; @@ -60,12 +60,12 @@ int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *outp if (h_now >= 0 && h_now < out_height && w_now >= 0 && w_now < out_width) { int out_addr = - csi_ref_get_index(output->dim, out_b, out_c, h_now, w_now); + shl_ref_get_index(output->dim, out_b, out_c, h_now, w_now); output_data[out_addr] = temp[h * block_size + w]; } } } - csi_mem_free(temp); + shl_mem_free(temp); } } } @@ -73,8 +73,8 @@ int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_batch_to_space_quant(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params) +int shl_ref_batch_to_space_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_batch_to_space_f32); + return shl_ref_siso_callback_base(input, output, params, 
shl_ref_batch_to_space_f32); } diff --git a/source/reference/broadcast_to.c b/source/reference/broadcast_to.c index d0d29fa6..b47aa0d8 100644 --- a/source/reference/broadcast_to.c +++ b/source/reference/broadcast_to.c @@ -16,20 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_broadcast_to_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int shl_ref_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - return csi_ref_broadcast_to_shape_f32(input, output, params->shape, params->shape_count); + return shl_ref_broadcast_to_shape_f32(input, output, params->shape, params->shape_count); } -int csi_ref_broadcast_to_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int shl_ref_broadcast_to_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - return csi_ref_broadcast_to_shape_quant(input, output, params->shape, params->shape_count); + return shl_ref_broadcast_to_shape_quant(input, output, params->shape, params->shape_count); } diff --git a/source/reference/cache_conv1d.c b/source/reference/cache_conv1d.c index d1062676..22766850 100644 --- a/source/reference/cache_conv1d.c +++ b/source/reference/cache_conv1d.c @@ -16,31 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_internal.h" -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_ref_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { size_t data_size = output->dim[0] * output->dim[1] * output->dim[2] * sizeof(float); // 512*13*2 asr_buffer_init(&params->asr_buffer, 2 * data_size, data_size); + struct csinn_callback *cb = params->base.cb; if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_cache_conv1d_f32; + cb->exec = shl_ref_cache_conv1d_f32; } else { - params->base.bc = csi_ref_cache_conv1d_quant; + cb->exec = shl_ref_cache_conv1d_quant; } return CSINN_TRUE; } -int csi_ref_cache_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_ref_cache_conv1d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -78,23 +78,23 @@ int csi_ref_cache_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output } } -int csi_ref_cache_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_ref_cache_conv1d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_output = 
csi_ref_tensor_transform_f32(output); - struct csi_tensor *float_weight = csi_ref_tensor_transform_f32(weight); - struct csi_tensor *float_bias = csi_ref_tensor_transform_f32(bias); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); + struct csinn_tensor *float_weight = shl_ref_tensor_transform_f32(weight); + struct csinn_tensor *float_bias = shl_ref_tensor_transform_f32(bias); - int ret = csi_ref_cache_conv1d_f32(float_input, float_output, float_weight, float_bias, params); + int ret = shl_ref_cache_conv1d_f32(float_input, float_output, float_weight, float_bias, params); - csi_tensor_data_convert(output, float_output); + csinn_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_weight); - csi_ref_tensor_transform_free_f32(float_bias); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_weight); + shl_ref_tensor_transform_free_f32(float_bias); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/reference/cache_matmul.c b/source/reference/cache_matmul.c index 1189f9af..160d44fc 100644 --- a/source/reference/cache_matmul.c +++ b/source/reference/cache_matmul.c @@ -1,207 +1,207 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_internal.h" -#include "csi_ref.h" - -// asr data buffer -void asr_buffer_init(struct asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth) -{ - buffer->buffer = csi_mem_alloc(buffer_size); - buffer->buffer_lenth = buffer_size; - buffer->data_lenth = data_lenth; - buffer->writer_index = buffer_size - data_lenth; - buffer->flag = 0; //用来记录有没有经过位置0.有的话置为1. -} - -// insert front -void *asr_buffer_insert_front(struct asr_buffer_t *buffer, void *input, size_t len) -{ - int start_position = buffer->writer_index - len; - uint8_t *p = NULL; - if (buffer->flag == 0) { - if (start_position < 0) { - buffer->flag = 1; - } - } - if (start_position >= 0) { - p = &buffer->buffer[start_position]; - memcpy(p, input, len); - buffer->writer_index = start_position; - if (buffer->flag == 0) { - return (void *)&buffer->buffer[0]; - } else { - return (void *)p; - } - } else { - start_position = buffer->buffer_lenth - buffer->data_lenth; - p = &buffer->buffer[start_position]; - memcpy(p, input, len); - memcpy(p + len, &buffer->buffer[buffer->writer_index], buffer->data_lenth - len); - buffer->writer_index = start_position; - return (void *)p; - } -} - -void *asr_buffer_insert_back(struct asr_buffer_t *buffer, void *input, size_t len) -{ - int end_position = buffer->writer_index + len; - uint8_t *p = NULL; - if (end_position <= buffer->buffer_lenth) { - p = &buffer->buffer[buffer->writer_index]; - memcpy(p, input, len); - buffer->writer_index += len; - p -= (buffer->data_lenth - len); - } else { - p = 
&buffer->buffer[buffer->writer_index + len - buffer->data_lenth]; - memcpy(&buffer->buffer[0], p, buffer->data_lenth - len); - buffer->writer_index = buffer->data_lenth; - memcpy(&buffer->buffer[buffer->data_lenth - len], input, len); - p = &buffer->buffer[0]; - } - return (void *)p; -} - -// get buffer -void *asr_buffer_get_buffer(struct asr_buffer_t *buffer) -{ - return asr_buffer_insert_back(buffer, NULL, 0); -} - -// reset buffer -void asr_buffer_reset(struct asr_buffer_t *buffer) -{ - free(buffer->buffer); - buffer->writer_index = 0; - buffer->buffer = NULL; - buffer->buffer_lenth = 0; - buffer->data_lenth = 0; - buffer->flag = 0; -} - -int csi_ref_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) -{ - size_t data_size = - params->shape[0] * params->shape[1] * params->shape[2] * params->shape[3] * sizeof(float); - asr_buffer_init(&params->asr_buffer, 2 * data_size, data_size); - - int accum_depth = weight->dim[0]; - int output_depth = weight->dim[1]; - - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_cache_matmul_f32; - } else { - params->base.bc = csi_ref_cache_matmul_quant; - } - - return CSINN_TRUE; -} - -int csi_ref_cache_matmul_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) -{ - int accum_depth = weight->dim[0]; - int output_depth = weight->dim[1]; - int batches = input->dim[1]; - float *input_data = input->data; - float *output_data = output->data; - float *weight_data = weight->data; - float *bias_data = bias->data; - - for (int b = 0; b < batches; ++b) { - for (int out_c = 0; out_c < output_depth; ++out_c) { - float total = 0.f; - for (int d = 0; d < accum_depth; ++d) { - total += input_data[b * accum_depth + d] * weight_data[out_c * accum_depth + d]; - } - float bias_value = 0.0f; - - bias_value = bias_data[out_c]; - - 
int out_pos = out_c + b * output_depth; //如果无transpose - output_data[out_pos] = total + bias_value; - } - } - - float judge = - bias_data[0] + bias_data[1] + bias_data[2] + bias_data[3] + bias_data[4] + bias_data[5]; - size_t insert_lenth = output_depth * batches; - float *output_from_buffer; - if (fabs(judge) < 0.01) { - output_from_buffer = - asr_buffer_insert_front(&params->asr_buffer, output_data, insert_lenth * sizeof(float)); - } else { - output_from_buffer = - asr_buffer_insert_back(&params->asr_buffer, output_data, insert_lenth * sizeof(float)); - } - // deal with reshape & transpose - int32_t *shape = output->dim; - - // transpose can only be 0,2,3,1 or 0,2,1,3 - if (params->axes[2] == 3) // 0,2,3,1 - { - int batch = shape[3]; - int shape3 = shape[2]; - int flatten_shape = shape[1] * shape[2]; - for (int i = 0; i < batch; i++) { - for (int j = 0; j < flatten_shape; j++) { - int out_pos = j * batch + i; - output_data[out_pos] = output_from_buffer[i * flatten_shape + j]; - } - } - } else // 0,2,1,3 - { - int batch = shape[2]; - int shape3 = shape[3]; - int flatten_shape = shape[1] * shape[3]; - for (int i = 0; i < batch; i++) { - for (int j = 0; j < flatten_shape; j++) { - int out_pos = i * shape3 + j % shape3 + batch * shape3 * (j / shape3); - output_data[out_pos] = output_from_buffer[i * flatten_shape + j]; - } - } - } - - return CSINN_TRUE; -} - -int csi_ref_cache_matmul_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) -{ - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); - struct csi_tensor *float_weight = csi_ref_tensor_transform_f32(weight); - struct csi_tensor *float_bias = csi_ref_tensor_transform_f32(bias); - - int ret = csi_ref_cache_matmul_f32(float_input, float_output, float_weight, float_bias, params); - - csi_tensor_data_convert(output, float_output); - - 
csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_weight); - csi_ref_tensor_transform_free_f32(float_bias); - - return CSINN_TRUE; +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_ref.h" + +// asr data buffer +void asr_buffer_init(struct csinn_asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth) +{ + buffer->buffer = shl_mem_alloc(buffer_size); + buffer->buffer_lenth = buffer_size; + buffer->data_lenth = data_lenth; + buffer->writer_index = buffer_size - data_lenth; + buffer->flag = 0; //用来记录有没有经过位置0.有的话置为1. 
+} + +// insert front +void *asr_buffer_insert_front(struct csinn_asr_buffer_t *buffer, void *input, size_t len) +{ + int start_position = buffer->writer_index - len; + uint8_t *p = NULL; + if (buffer->flag == 0) { + if (start_position < 0) { + buffer->flag = 1; + } + } + if (start_position >= 0) { + p = &buffer->buffer[start_position]; + memcpy(p, input, len); + buffer->writer_index = start_position; + if (buffer->flag == 0) { + return (void *)&buffer->buffer[0]; + } else { + return (void *)p; + } + } else { + start_position = buffer->buffer_lenth - buffer->data_lenth; + p = &buffer->buffer[start_position]; + memcpy(p, input, len); + memcpy(p + len, &buffer->buffer[buffer->writer_index], buffer->data_lenth - len); + buffer->writer_index = start_position; + return (void *)p; + } +} + +void *asr_buffer_insert_back(struct csinn_asr_buffer_t *buffer, void *input, size_t len) +{ + int end_position = buffer->writer_index + len; + uint8_t *p = NULL; + if (end_position <= buffer->buffer_lenth) { + p = &buffer->buffer[buffer->writer_index]; + memcpy(p, input, len); + buffer->writer_index += len; + p -= (buffer->data_lenth - len); + } else { + p = &buffer->buffer[buffer->writer_index + len - buffer->data_lenth]; + memcpy(&buffer->buffer[0], p, buffer->data_lenth - len); + buffer->writer_index = buffer->data_lenth; + memcpy(&buffer->buffer[buffer->data_lenth - len], input, len); + p = &buffer->buffer[0]; + } + return (void *)p; +} + +// get buffer +void *asr_buffer_get_buffer(struct csinn_asr_buffer_t *buffer) +{ + return asr_buffer_insert_back(buffer, NULL, 0); +} + +// reset buffer +void asr_buffer_reset(struct csinn_asr_buffer_t *buffer) +{ + free(buffer->buffer); + buffer->writer_index = 0; + buffer->buffer = NULL; + buffer->buffer_lenth = 0; + buffer->data_lenth = 0; + buffer->flag = 0; +} + +int shl_ref_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params 
*params) +{ + size_t data_size = + params->shape[0] * params->shape[1] * params->shape[2] * params->shape[3] * sizeof(float); + asr_buffer_init(&params->asr_buffer, 2 * data_size, data_size); + + int accum_depth = weight->dim[0]; + int output_depth = weight->dim[1]; + + struct csinn_callback *cb = params->base.cb; + if (input->dtype == CSINN_DTYPE_FLOAT32) { + cb->exec = shl_ref_cache_matmul_f32; + } else { + cb->exec = shl_ref_cache_matmul_quant; + } + + return CSINN_TRUE; +} + +int shl_ref_cache_matmul_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) +{ + int accum_depth = weight->dim[0]; + int output_depth = weight->dim[1]; + int batches = input->dim[1]; + float *input_data = input->data; + float *output_data = output->data; + float *weight_data = weight->data; + float *bias_data = bias->data; + + for (int b = 0; b < batches; ++b) { + for (int out_c = 0; out_c < output_depth; ++out_c) { + float total = 0.f; + for (int d = 0; d < accum_depth; ++d) { + total += input_data[b * accum_depth + d] * weight_data[out_c * accum_depth + d]; + } + float bias_value = 0.0f; + + bias_value = bias_data[out_c]; + + int out_pos = out_c + b * output_depth; //如果无transpose + output_data[out_pos] = total + bias_value; + } + } + + float judge = + bias_data[0] + bias_data[1] + bias_data[2] + bias_data[3] + bias_data[4] + bias_data[5]; + size_t insert_lenth = output_depth * batches; + float *output_from_buffer; + if (fabs(judge) < 0.01) { + output_from_buffer = + asr_buffer_insert_front(&params->asr_buffer, output_data, insert_lenth * sizeof(float)); + } else { + output_from_buffer = + asr_buffer_insert_back(&params->asr_buffer, output_data, insert_lenth * sizeof(float)); + } + // deal with reshape & transpose + int32_t *shape = output->dim; + + // transpose can only be 0,2,3,1 or 0,2,1,3 + if (params->axes[2] == 3) // 0,2,3,1 + { + int batch = shape[3]; + int shape3 = shape[2]; + int 
flatten_shape = shape[1] * shape[2]; + for (int i = 0; i < batch; i++) { + for (int j = 0; j < flatten_shape; j++) { + int out_pos = j * batch + i; + output_data[out_pos] = output_from_buffer[i * flatten_shape + j]; + } + } + } else // 0,2,1,3 + { + int batch = shape[2]; + int shape3 = shape[3]; + int flatten_shape = shape[1] * shape[3]; + for (int i = 0; i < batch; i++) { + for (int j = 0; j < flatten_shape; j++) { + int out_pos = i * shape3 + j % shape3 + batch * shape3 * (j / shape3); + output_data[out_pos] = output_from_buffer[i * flatten_shape + j]; + } + } + } + + return CSINN_TRUE; +} + +int shl_ref_cache_matmul_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) +{ + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); + struct csinn_tensor *float_weight = shl_ref_tensor_transform_f32(weight); + struct csinn_tensor *float_bias = shl_ref_tensor_transform_f32(bias); + + int ret = shl_ref_cache_matmul_f32(float_input, float_output, float_weight, float_bias, params); + + csinn_tensor_data_convert(output, float_output); + + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_weight); + shl_ref_tensor_transform_free_f32(float_bias); + + return CSINN_TRUE; } \ No newline at end of file diff --git a/source/reference/ceil.c b/source/reference/ceil.c index d77f4950..87460828 100644 --- a/source/reference/ceil.c +++ b/source/reference/ceil.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_ceil_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_ceil_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +37,8 @@ int csi_ref_ceil_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_ceil_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_ceil_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_ceil_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_ceil_f32); } diff --git a/source/reference/clip.c b/source/reference/clip.c index 2db4c3aa..68208b23 100644 --- a/source/reference/clip.c +++ b/source/reference/clip.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_clip_f32(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params) +int shl_ref_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -42,8 +42,8 @@ int csi_ref_clip_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_clip_quant(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params) +int shl_ref_clip_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_clip_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_clip_f32); } diff --git a/source/reference/col2im.c b/source/reference/col2im.c index 7a394509..434b6a61 100644 --- a/source/reference/col2im.c +++ b/source/reference/col2im.c @@ -16,12 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_col2im_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct col2im_params *params) +int shl_ref_col2im_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params) { int32_t height = input->dim[1]; int32_t width = input->dim[2]; diff --git a/source/reference/concat.c b/source/reference/concat.c index 619b32a0..185875fa 100644 --- a/source/reference/concat.c +++ b/source/reference/concat.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_concat_f32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params) +int shl_ref_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -36,7 +36,7 @@ int csi_ref_concat_f32(struct csi_tensor **input, struct csi_tensor *output, float *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; float *input_item_data = input_item->data; const int copy_size = input_item->dim[params->axis] * base_inner_size; const float *input_ptr = input_item_data + k * copy_size; @@ -47,8 +47,8 @@ int csi_ref_concat_f32(struct csi_tensor **input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_concat_quant(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params) +int shl_ref_concat_quant(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { if (params->axis == -1) { params->axis = input[0]->dim_count - 1; @@ -57,19 +57,19 @@ int csi_ref_concat_quant(struct csi_tensor **input, struct csi_tensor *output, int input_count = params->inputs_count; int ret; - struct csi_tensor *finput[input_count]; - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *finput[input_count]; + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); for (int i = 0; i < input_count; i++) { - finput[i] = csi_ref_tensor_transform_f32(input[i]); + finput[i] = shl_ref_tensor_transform_f32(input[i]); } - ret = csi_ref_concat_f32(finput, foutput, params); + ret = shl_ref_concat_f32(finput, foutput, params); - csi_tensor_data_convert(output, foutput); + 
csinn_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(foutput); + shl_ref_tensor_transform_free_f32(foutput); for (int i = 0; i < input_count; i++) { - csi_ref_tensor_transform_free_f32(finput[i]); + shl_ref_tensor_transform_free_f32(finput[i]); } return ret; } diff --git a/source/reference/conv_avx.h b/source/reference/conv_avx.h index 622a16b8..e2dc4c7a 100644 --- a/source/reference/conv_avx.h +++ b/source/reference/conv_avx.h @@ -1,15 +1,15 @@ #include <immintrin.h> -static float* channel(struct csi_tensor* t, int64_t c) +static float* channel(struct csinn_tensor* t, int64_t c) { return (float*)t->data + c * t->dim[2] * t->dim[3]; } -static void conv_trans_kernel_avx(struct csi_tensor* o_kernel, struct csi_tensor* t_kernel) +static void conv_trans_kernel_avx(struct csinn_tensor* o_kernel, struct csinn_tensor* t_kernel) { float* kernel = o_kernel->data; float* ret; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); // kernel memory packed 8 x 8 int64_t outch = o_kernel->dim[0]; int64_t inch = o_kernel->dim[1]; @@ -19,7 +19,7 @@ static void conv_trans_kernel_avx(struct csi_tensor* o_kernel, struct csi_tensor t_kernel->dim[2] = o_kernel->dim[1]; t_kernel->dim[3] = o_kernel->dim[2] * o_kernel->dim[3] * 8; - ret = csi_mem_alloc(8 * kernel_size * inch * (outch / 8 + (outch % 8) / 4 + outch % 4) * + ret = shl_mem_alloc(8 * kernel_size * inch * (outch / 8 + (outch % 8) / 4 + outch % 4) * sizeof(float)); t_kernel->data = ret; @@ -106,8 +106,8 @@ static void conv_trans_kernel_avx(struct csi_tensor* o_kernel, struct csi_tensor } } -static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* output, - struct csi_tensor* kernel_tm, struct csi_tensor* o_bias, +static void conv_im2col_sgemm_avx(struct csinn_tensor* input, struct csinn_tensor* output, + struct csinn_tensor* kernel_tm, struct csinn_tensor* o_bias, int64_t kernel_w, int64_t kernel_h, int64_t stride_w, int64_t stride_h) { @@ -124,9 +124,9 @@ static void
conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o } // im2col - struct csi_tensor* bottom_im2col = csi_alloc_tensor(NULL); - csi_tensor_copy(bottom_im2col, input); - bottom_im2col->data = csi_mem_alloc(outw * outh * kernel_h * kernel_w * inch * sizeof(float)); + struct csinn_tensor* bottom_im2col = csinn_alloc_tensor(NULL); + csinn_tensor_copy(bottom_im2col, input); + bottom_im2col->data = shl_mem_alloc(outw * outh * kernel_h * kernel_w * inch * sizeof(float)); bottom_im2col->dim[0] = 0; bottom_im2col->dim[1] = 0; bottom_im2col->dim[2] = kernel_h * kernel_w * inch; @@ -159,9 +159,9 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o int64_t out_size = outw * outh; // bottom_im2col memory packed 8 x 8 - struct csi_tensor* bottom_tm = csi_alloc_tensor(NULL); - csi_tensor_copy(bottom_tm, input); - bottom_tm->data = csi_mem_alloc(8 * kernel_size * inch * (out_size / 8 + out_size % 8) * 4); + struct csinn_tensor* bottom_tm = csinn_alloc_tensor(NULL); + csinn_tensor_copy(bottom_tm, input); + bottom_tm->data = shl_mem_alloc(8 * kernel_size * inch * (out_size / 8 + out_size % 8) * 4); bottom_tm->dim[0] = 0; bottom_tm->dim[1] = out_size / 8 + out_size % 8; bottom_tm->dim[2] = inch; @@ -180,7 +180,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o float* tmpptr = channel(bottom_tm, (i / 8)); for (int64_t q = 0; q < inch * kernel_size; q++) { -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT _mm256_storeu_ps(tmpptr, _mm256_loadu_ps(img0)); #else tmpptr[0] = img0[0]; @@ -245,7 +245,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (; j + 7 < N; j = j + 8) { const float* vb = channel(bottom_tm, (j / 8)); const float* va = channel(kernel_tm, (i / 8)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m256 _sum0 = _mm256_broadcast_ss(biasptr); __m256 _sum1 = _mm256_broadcast_ss(biasptr + 1); __m256 _sum2 = _mm256_broadcast_ss(biasptr + 2); @@ -499,7 +499,7 @@ static void 
conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o output6[n] = sum6[n] + biasptr[6]; output7[n] = sum7[n] + biasptr[7]; } -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0 += 8; output1 += 8; output2 += 8; @@ -514,7 +514,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o const float* vb = channel(bottom_tm, (j / 8 + j % 8)); const float* va = channel(kernel_tm, (i / 8)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m256 _sum0_7 = _mm256_loadu_ps(biasptr); __m256 _sum0 = _mm256_set1_ps(0.0); __m256 _sum1 = _mm256_set1_ps(0.0); @@ -599,7 +599,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o output5[0] = sum5; output6[0] = sum6; output7[0] = sum7; -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0++; output1++; output2++; @@ -629,7 +629,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (; j + 7 < N; j = j + 8) { const float* vb = channel(bottom_tm, (j / 8)); const float* va = channel(kernel_tm, (i / 8 + (i % 8) / 4)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m256 _sum0 = _mm256_broadcast_ss(biasptr); __m256 _sum1 = _mm256_broadcast_ss(biasptr + 1); __m256 _sum2 = _mm256_broadcast_ss(biasptr + 2); @@ -791,7 +791,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o output2[n] = sum2[n] + biasptr[2]; output3[n] = sum3[n] + biasptr[3]; } -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0 += 8; output1 += 8; output2 += 8; @@ -801,7 +801,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (; j < N; j++) { const float* vb = channel(bottom_tm, (j / 8 + j % 8)); const float* va = channel(kernel_tm, (i / 8 + (i % 8) / 4)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m128 _sum0_3 = _mm_loadu_ps(biasptr); __m128 _sum0 = _mm_set1_ps(0.0); __m128 _sum1 = _mm_set1_ps(0.0); @@ -869,7 +869,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o 
output1[0] = sum1; output2[0] = sum2; output3[0] = sum3; -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0++; output1++; output2++; @@ -889,7 +889,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (; j + 7 < N; j = j + 8) { const float* vb = channel(bottom_tm, (j / 8)); const float* va = channel(kernel_tm, (i / 8 + (i % 8) / 4 + i % 4)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m256 _sum0 = _mm256_broadcast_ss(&bias0); int64_t k = 0; @@ -957,7 +957,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (int64_t n = 0; n < 8; n++) { output0[n] = sum[n] + bias0; } -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0 += 8; } @@ -966,7 +966,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o const float* va = channel(kernel_tm, (i / 8 + (i % 8) / 4 + i % 4)); int64_t k = 0; -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m128 _sum0 = _mm_set1_ps(0.f); for (; k + 3 < L; k += 4) { @@ -987,7 +987,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o #else float sum0 = bias0; -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT for (; k < L; k++) { sum0 += va[0] * vb[0]; @@ -1000,8 +1000,8 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o } } } - csi_mem_free(bottom_tm->data); - csi_mem_free(bottom_tm); - csi_mem_free(bottom_im2col->data); - csi_mem_free(bottom_im2col); + shl_mem_free(bottom_tm->data); + shl_mem_free(bottom_tm); + shl_mem_free(bottom_im2col->data); + shl_mem_free(bottom_im2col); } diff --git a/source/reference/convolution.c b/source/reference/convolution.c index 03f0471e..8c1e4170 100644 --- a/source/reference/convolution.c +++ b/source/reference/convolution.c @@ -16,10 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#ifdef CSI_AVX_OPT +#include "shl_ref.h" +#ifdef SHL_AVX_OPT #include "conv_avx.h" #endif @@ -27,9 +27,9 @@ * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/conv.h */ -static int csi_ref_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_ref_conv2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -65,10 +65,10 @@ static int csi_ref_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor * // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32_t input_index = csi_ref_get_index(input->dim, batch, in_y, + int32_t input_index = shl_ref_get_index(input->dim, batch, in_y, in_x, in_channel); float input_val = input_data[input_index]; - int32_t filter_index = csi_ref_get_index( + int32_t filter_index = shl_ref_get_index( kernel->dim, out_channel, filter_y, filter_x, in_channel); float filter_val = kernel_data[filter_index]; acc += (input_val * filter_val); @@ -80,7 +80,7 @@ static int csi_ref_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor * if (bias_data && bias->dim_count != 0) { bias_value = bias_data[out_channel]; } - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, out_channel)] = + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc + bias_value; } } @@ -90,61 +90,60 @@ static int csi_ref_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor * return CSINN_TRUE; } -static int csi_ref_conv2d_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor 
*bias, - struct conv2d_params *params) +static int shl_ref_conv2d_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { -#ifdef CSI_AVX_OPT - struct csi_tensor *t_input = csi_alloc_tensor(NULL); - csi_tensor_copy(t_input, input); +#ifdef SHL_AVX_OPT + struct csinn_tensor *t_input = csinn_alloc_tensor(NULL); + csinn_tensor_copy(t_input, input); int32_t pad_b[4] = {0, 0, params->pad_top, params->pad_left}; int32_t pad_a[4] = {0, 0, params->pad_down, params->pad_right}; t_input->dim[2] = input->dim[2] + params->pad_top + params->pad_down; t_input->dim[3] = input->dim[3] + params->pad_left + params->pad_right; t_input->data = - csi_mem_alloc(t_input->dim[0] * t_input->dim[1] * t_input->dim[2] * t_input->dim[3] * 4); - struct pad_params pparams; + shl_mem_alloc(t_input->dim[0] * t_input->dim[1] * t_input->dim[2] * t_input->dim[3] * 4); + struct csinn_pad_params pparams; pparams.base.layout = CSINN_LAYOUT_NCHW; pparams.base.api = CSINN_REF; - pparams.base.run_mode = CSINN_RM_LAYER; pparams.pad_before = pad_b; pparams.pad_after = pad_a; pparams.pad_num = 4; pparams.pad_mode = 0; pparams.pad_value = 0; pparams.base.name = "tmp_pad"; - csi_pad_init(input, t_input, &pparams); - csi_pad(input, t_input, &pparams); + shl_ref_pad_f32(input, t_input, &pparams); - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); conv_trans_kernel_avx(kernel, t_kernel); conv_im2col_sgemm_avx(t_input, output, t_kernel, bias, kernel->dim[3], kernel->dim[2], params->stride_width, params->stride_height); - csi_mem_free(t_input->data); - csi_mem_free(t_kernel->data); + shl_mem_free(t_input->data); + shl_mem_free(t_kernel->data); #else - struct csi_tensor *t_input; - struct csi_tensor *t_output; - struct csi_tensor *t_kernel; - struct csi_tensor *t_bias = bias; - t_input = csi_ref_nchw_to_nhwc_f32(input); - t_kernel = 
csi_ref_nchw_to_nhwc_f32(kernel); - t_output = csi_ref_nchw_to_nhwc_f32(output); - csi_ref_conv2d_nhwc_f32(t_input, t_output, t_kernel, t_bias, params); - csi_ref_nhwc_to_nchw_f32(output, t_output); - csi_mem_free(t_input->data); - csi_mem_free(t_input); - csi_mem_free(t_kernel->data); - csi_mem_free(t_kernel); + struct csinn_tensor *t_input; + struct csinn_tensor *t_output; + struct csinn_tensor *t_kernel; + struct csinn_tensor *t_bias = bias; + t_input = shl_ref_nchw_to_nhwc_f32(input); + t_kernel = shl_ref_nchw_to_nhwc_f32(kernel); + t_output = shl_ref_nchw_to_nhwc_f32(output); + shl_ref_conv2d_nhwc_f32(t_input, t_output, t_kernel, t_bias, params); + shl_ref_nhwc_to_nchw_f32(output, t_output); + shl_mem_free(t_input->data); + shl_mem_free(t_input); + shl_mem_free(t_kernel->data); + shl_mem_free(t_kernel); #endif return CSINN_TRUE; } -static int csi_ref_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_ref_depthwise_conv2d_nhwc_f32(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -186,9 +185,9 @@ static int csi_ref_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, struct cs // use zero as a default value. 
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - float input_val = input_data[csi_ref_get_index(input->dim, b, + float input_val = input_data[shl_ref_get_index(input->dim, b, in_y, in_x, ic)]; - float filter_val = kernel_data[csi_ref_get_index( + float filter_val = kernel_data[shl_ref_get_index( kernel->dim, 0, filter_y, filter_x, oc)]; acc += (filter_val) * (input_val); } @@ -197,7 +196,7 @@ static int csi_ref_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, struct cs if (bias_data && bias->dim_count != 0) { acc += bias_data[oc]; } - output_data[csi_ref_get_index(output->dim, b, out_y, out_x, oc)] = acc; + output_data[shl_ref_get_index(output->dim, b, out_y, out_x, oc)] = acc; } } } @@ -206,9 +205,10 @@ static int csi_ref_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, struct cs return CSINN_TRUE; } -static int csi_ref_depthwise_conv2d_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_ref_depthwise_conv2d_nchw_f32(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -250,9 +250,9 @@ static int csi_ref_depthwise_conv2d_nchw_f32(struct csi_tensor *input, struct cs // use zero as a default value. 
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - float input_val = input_data[csi_ref_get_index(input->dim, b, + float input_val = input_data[shl_ref_get_index(input->dim, b, ic, in_y, in_x)]; - float filter_val = kernel_data[csi_ref_get_index( + float filter_val = kernel_data[shl_ref_get_index( kernel->dim, oc, 0, filter_y, filter_x)]; acc += (filter_val) * (input_val); } @@ -261,7 +261,7 @@ static int csi_ref_depthwise_conv2d_nchw_f32(struct csi_tensor *input, struct cs if (bias_data && bias->dim_count != 0) { acc += bias_data[oc]; } - output_data[csi_ref_get_index(output->dim, b, oc, out_y, out_x)] = acc; + output_data[shl_ref_get_index(output->dim, b, oc, out_y, out_x)] = acc; } } } @@ -269,27 +269,28 @@ static int csi_ref_depthwise_conv2d_nchw_f32(struct csi_tensor *input, struct cs } } -static int csi_ref_group_conv2d_nhwc_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_group_conv2d_nhwc_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); - csi_tensor_copy(input, o_input); - csi_tensor_copy(output, o_output); - csi_tensor_copy(kernel, o_kernel); - csi_tensor_copy(bias, o_bias); + csinn_tensor_copy(input, o_input); + csinn_tensor_copy(output, o_output); + csinn_tensor_copy(kernel, o_kernel); + csinn_tensor_copy(bias, o_bias); input->dim[3] /= 
params->group; output->dim[3] /= params->group; kernel->dim[0] /= params->group; - int input_size = csi_tensor_size(input); - int output_size = csi_tensor_size(output); - int kernel_size = csi_tensor_size(kernel); + int input_size = csinn_tensor_size(input); + int output_size = csinn_tensor_size(output); + int kernel_size = csinn_tensor_size(kernel); float *input_data = o_input->data; float *output_data = o_output->data; @@ -302,32 +303,33 @@ static int csi_ref_group_conv2d_nhwc_f32(struct csi_tensor *o_input, struct csi_ if (bias->data && bias->dim_count != 0) { bias->data = bias_data + i * o_output->dim[3] / params->group; } - csi_ref_conv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_conv2d_nhwc_f32(input, output, kernel, bias, params); } return CSINN_TRUE; } -static int csi_ref_group_conv2d_nchw_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_group_conv2d_nchw_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); - csi_tensor_copy(input, o_input); - csi_tensor_copy(output, o_output); - csi_tensor_copy(kernel, o_kernel); - csi_tensor_copy(bias, o_bias); + csinn_tensor_copy(input, o_input); + csinn_tensor_copy(output, o_output); + csinn_tensor_copy(kernel, o_kernel); + csinn_tensor_copy(bias, o_bias); input->dim[1] /= params->group; output->dim[1] /= params->group; kernel->dim[0] /= 
params->group; - int input_size = csi_tensor_size(input); - int output_size = csi_tensor_size(output); - int kernel_size = csi_tensor_size(kernel); + int input_size = csinn_tensor_size(input); + int output_size = csinn_tensor_size(output); + int kernel_size = csinn_tensor_size(kernel); float *input_data = o_input->data; float *output_data = o_output->data; @@ -340,37 +342,37 @@ static int csi_ref_group_conv2d_nchw_f32(struct csi_tensor *o_input, struct csi_ if (bias->data && bias->dim_count != 0) { bias->data = bias_data + i * o_output->dim[1] / params->group; } - csi_ref_conv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_conv2d_nchw_f32(input, output, kernel, bias, params); } return CSINN_TRUE; } -int csi_ref_conv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_conv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_conv2d_nhwc_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_conv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_conv2d_nchw_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int ret; if (params->conv_extra.fuse_zp2bias) { - struct csi_tensor *tmp_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *tmp_kernel = csi_ref_tensor_transform_f32(kernel); 
+ struct csinn_tensor *tmp_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *tmp_kernel = shl_ref_tensor_transform_f32(kernel); float *tmp_bias_data = tmp_bias->data; float *tmp_kernel_data = tmp_kernel->data; int k_len = kernel->dim[0]; - int k_inner = csi_tensor_size(kernel) / k_len; + int k_inner = csinn_tensor_size(kernel) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float t_k = 0; @@ -380,42 +382,42 @@ int csi_ref_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, } tmp_bias_data[i] += t_k; } - csi_ref_tensor_transform_free_f32(tmp_kernel); + shl_ref_tensor_transform_free_f32(tmp_kernel); ret = - csi_ref_conv_callback_base(input, output, kernel, tmp_bias, params, csi_ref_conv2d_f32); - csi_ref_tensor_transform_free_f32(tmp_bias); + shl_ref_conv_callback_base(input, output, kernel, tmp_bias, params, shl_ref_conv2d_f32); + shl_ref_tensor_transform_free_f32(tmp_bias); } else { - ret = csi_ref_conv_callback_base(input, output, kernel, bias, params, csi_ref_conv2d_f32); + ret = shl_ref_conv_callback_base(input, output, kernel, bias, params, shl_ref_conv2d_f32); } return ret; } -int csi_ref_depthwise_conv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_depthwise_conv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_nhwc_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_depthwise_conv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_nchw_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int 
csi_ref_depthwise_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int ret; if (params->conv_extra.fuse_zp2bias) { - struct csi_tensor *tmp_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *tmp_kernel = csi_ref_tensor_transform_f32(kernel); + struct csinn_tensor *tmp_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *tmp_kernel = shl_ref_tensor_transform_f32(kernel); float *tmp_bias_data = tmp_bias->data; float *tmp_kernel_data = tmp_kernel->data; if (params->base.layout == CSINN_LAYOUT_NCHW) { int k_len = kernel->dim[0]; - int k_inner = csi_tensor_size(kernel) / k_len; + int k_inner = csinn_tensor_size(kernel) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float t_k = tmp_bias_data[i]; @@ -427,7 +429,7 @@ int csi_ref_depthwise_conv2d_quant(struct csi_tensor *input, struct csi_tensor * } } else { int k_len = kernel->dim[3]; - int k_outer = csi_tensor_size(kernel) / k_len; + int k_outer = csinn_tensor_size(kernel) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float t_k = tmp_bias_data[i]; @@ -438,43 +440,43 @@ int csi_ref_depthwise_conv2d_quant(struct csi_tensor *input, struct csi_tensor * tmp_bias_data[i] = t_k; } } - csi_ref_tensor_transform_free_f32(tmp_kernel); - ret = csi_ref_conv_callback_base(input, output, kernel, tmp_bias, params, - csi_ref_depthwise_conv2d_f32); - csi_ref_tensor_transform_free_f32(tmp_bias); + shl_ref_tensor_transform_free_f32(tmp_kernel); + ret = shl_ref_conv_callback_base(input, output, kernel, tmp_bias, params, + shl_ref_depthwise_conv2d_f32); + shl_ref_tensor_transform_free_f32(tmp_bias); } else { - ret = 
csi_ref_conv_callback_base(input, output, kernel, bias, params, - csi_ref_depthwise_conv2d_f32); + ret = shl_ref_conv_callback_base(input, output, kernel, bias, params, + shl_ref_depthwise_conv2d_f32); } return ret; } -int csi_ref_group_conv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_group_conv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_group_conv2d_nhwc_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_group_conv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_group_conv2d_nchw_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_group_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int ret; if (params->conv_extra.fuse_zp2bias) { - struct csi_tensor *tmp_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *tmp_kernel = csi_ref_tensor_transform_f32(kernel); + struct csinn_tensor *tmp_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *tmp_kernel = shl_ref_tensor_transform_f32(kernel); float *tmp_bias_data = tmp_bias->data; float *tmp_kernel_data = tmp_kernel->data; int k_len = kernel->dim[0]; - int k_inner = csi_tensor_size(kernel) / k_len; + int k_inner = csinn_tensor_size(kernel) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float 
t_k = 0; @@ -484,13 +486,13 @@ int csi_ref_group_conv2d_quant(struct csi_tensor *input, struct csi_tensor *outp } tmp_bias_data[i] += t_k; } - csi_ref_tensor_transform_free_f32(tmp_kernel); - ret = csi_ref_conv_callback_base(input, output, kernel, tmp_bias, params, - csi_ref_group_conv2d_f32); - csi_ref_tensor_transform_free_f32(tmp_bias); + shl_ref_tensor_transform_free_f32(tmp_kernel); + ret = shl_ref_conv_callback_base(input, output, kernel, tmp_bias, params, + shl_ref_group_conv2d_f32); + shl_ref_tensor_transform_free_f32(tmp_bias); } else { - ret = csi_ref_conv_callback_base(input, output, kernel, bias, params, - csi_ref_group_conv2d_f32); + ret = shl_ref_conv_callback_base(input, output, kernel, bias, params, + shl_ref_group_conv2d_f32); } return ret; diff --git a/source/reference/convolution1d.c b/source/reference/convolution1d.c index af1a7e40..b27df52c 100644 --- a/source/reference/convolution1d.c +++ b/source/reference/convolution1d.c @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params) +int shl_ref_conv1d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { - struct conv2d_params params_conv2d; + struct csinn_conv2d_params params_conv2d; params_conv2d.base = params->base; params_conv2d.group = params->group; params_conv2d.stride_height = 1; @@ -43,16 +43,16 @@ int csi_ref_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, input->dim[3] = 1; output->dim_count = 4; output->dim[3] = 1; - csi_ref_conv2d_f32(input, output, kernel, bias, &params_conv2d); + shl_ref_conv2d_f32(input, output, kernel, bias, &params_conv2d); return CSINN_TRUE; } -int csi_ref_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params) +int shl_ref_conv1d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { - struct conv2d_params params_conv2d; + struct csinn_conv2d_params params_conv2d; params_conv2d.base = params->base; params_conv2d.group = params->group; params_conv2d.stride_height = 1; @@ -71,7 +71,7 @@ int csi_ref_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, input->dim[3] = 1; output->dim_count = 4; output->dim[3] = 1; - csi_ref_conv2d_quant(input, output, kernel, bias, &params_conv2d); + shl_ref_conv2d_quant(input, output, kernel, bias, &params_conv2d); return CSINN_TRUE; } diff --git a/source/reference/convolution3d.c b/source/reference/convolution3d.c index 49d742f8..b53fc537 100644 --- a/source/reference/convolution3d.c +++ b/source/reference/convolution3d.c @@ -16,13 +16,13 @@ *
limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params) +int shl_ref_conv3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -77,11 +77,11 @@ int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, if ((in_d >= 0) && (in_d < in_depth) && (in_h >= 0) && (in_h < in_height) && (in_w >= 0) && (in_w < in_width)) { - int32_t input_idx = csi_ref_get_index_5( + int32_t input_idx = shl_ref_get_index_5( input->dim, out_b, in_ch, in_d, in_h, in_w); float input_val = input_data[input_idx]; int32_t filter_idx = - csi_ref_get_index_5(kernel->dim, out_ch, in_ch, + shl_ref_get_index_5(kernel->dim, out_ch, in_ch, filter_d, filter_h, filter_w); float filter_val = kernel_data[filter_idx]; acc += input_val * filter_val; @@ -95,7 +95,7 @@ int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, bias_val = bias_data[out_ch]; } int32_t output_idx = - csi_ref_get_index_5(output->dim, out_b, out_ch, out_d, out_h, out_w); + shl_ref_get_index_5(output->dim, out_b, out_ch, out_d, out_h, out_w); output_data[output_idx] = acc + bias_val; } } @@ -105,9 +105,9 @@ int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_conv3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params) +int shl_ref_conv3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - return 
csi_ref_conv_callback_base(input, output, kernel, bias, params, csi_ref_conv3d_f32); + return shl_ref_conv_callback_base(input, output, kernel, bias, params, shl_ref_conv3d_f32); } diff --git a/source/reference/convolution_channel.c b/source/reference/convolution_channel.c index 92fa3912..b8d6d143 100644 --- a/source/reference/convolution_channel.c +++ b/source/reference/convolution_channel.c @@ -16,37 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -static float csi_ref_uint8_to_float_channel(uint8_t i, float scale, int32_t zero_point) +static float shl_ref_uint8_to_float_channel(uint8_t i, float scale, int32_t zero_point) { return ((float)i - zero_point) * scale; } -static float csi_ref_int8_to_float_channel(int8_t i, float scale, int32_t zero_point) +static float shl_ref_int8_to_float_channel(int8_t i, float scale, int32_t zero_point) { return ((float)i - zero_point) * scale; } -static int channel_kernel_to_common(struct csi_tensor *float_kernel, struct csi_tensor *o_kernel, - struct conv2d_params *params) +static int channel_kernel_to_common(struct csinn_tensor *float_kernel, + struct csinn_tensor *o_kernel, + struct csinn_conv2d_params *params) { float *float_kernel_data = float_kernel->data; - int kernel_size = csi_tensor_size(o_kernel); + int kernel_size = csinn_tensor_size(o_kernel); for (int i = 0; i < o_kernel->dim[0]; i++) { int per_cahnnel = kernel_size / o_kernel->dim[0]; for (int j = 0; j < per_cahnnel; j++) { int index = i * per_cahnnel + j; if (o_kernel->dtype == CSINN_DTYPE_UINT8) { uint8_t *kernel_data = o_kernel->data; - float_kernel_data[index] = csi_ref_uint8_to_float_channel(kernel_data[index], - o_kernel->qinfo[i].scale, o_kernel->qinfo[i].zero_point); + float_kernel_data[index] = shl_ref_uint8_to_float_channel( + kernel_data[index], o_kernel->qinfo[i].scale, o_kernel->qinfo[i].zero_point); } else if (o_kernel->dtype == CSINN_DTYPE_INT8) { 
int8_t *kernel_data = o_kernel->data; - float_kernel_data[index] = csi_ref_int8_to_float_channel(kernel_data[index], - o_kernel->qinfo[i].scale, o_kernel->qinfo[i].zero_point); + float_kernel_data[index] = shl_ref_int8_to_float_channel( + kernel_data[index], o_kernel->qinfo[i].scale, o_kernel->qinfo[i].zero_point); } else { return CSINN_FALSE; } @@ -54,49 +55,49 @@ static int channel_kernel_to_common(struct csi_tensor *float_kernel, struct csi_ } } -static void channel_bias_to_common(struct csi_tensor *float_bias, struct csi_tensor *bias, - struct csi_tensor *input, struct csi_tensor *kernel) +static void channel_bias_to_common(struct csinn_tensor *float_bias, struct csinn_tensor *bias, + struct csinn_tensor *input, struct csinn_tensor *kernel) { int32_t *bias_data = bias->data; float *float_bias_data = float_bias->data; - int bias_size = csi_tensor_size(bias); + int bias_size = csinn_tensor_size(bias); for (int i = 0; i < bias_size; i++) { float_bias_data[i] = bias_data[i] * kernel->qinfo[i].scale * input->qinfo->scale; } } -static int csi_ref_conv2d_channel_nchw_quant(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_conv2d_channel_nchw_quant(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *float_input = csi_ref_convert_float_tensor(o_input); - struct csi_tensor *float_kernel = csi_ref_alloc_float_tensor(o_kernel); - struct csi_tensor *float_bias = csi_ref_alloc_float_tensor(o_bias); - struct csi_tensor *float_output = csi_ref_alloc_float_tensor(o_output); + struct csinn_tensor *float_input = shl_ref_convert_float_tensor(o_input); + struct csinn_tensor *float_kernel = shl_ref_alloc_float_tensor(o_kernel); + struct csinn_tensor *float_bias = shl_ref_alloc_float_tensor(o_bias); + struct csinn_tensor 
*float_output = shl_ref_alloc_float_tensor(o_output); channel_kernel_to_common(float_kernel, o_kernel, params); channel_bias_to_common(float_bias, o_bias, o_input, o_kernel); - csi_ref_conv2d_f32(float_input, float_output, float_kernel, float_bias, params); - csi_tensor_data_convert(o_output, float_output); - csi_ref_conv_free_float_tensor(float_input, float_output, float_kernel, float_bias); + shl_ref_conv2d_f32(float_input, float_output, float_kernel, float_bias, params); + csinn_tensor_data_convert(o_output, float_output); + shl_ref_conv_free_float_tensor(float_input, float_output, float_kernel, float_bias); return CSINN_TRUE; } -static int csi_ref_depthwise_conv2d_channel_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_depthwise_conv2d_channel_nchw_u8(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor* input; - struct csi_tensor* output; - struct csi_tensor* kernel; - struct csi_tensor* bias = o_bias; - input = csi_ref_nchw_to_nhwc_8(o_input); - kernel = csi_ref_nchw_to_nhwc_8(o_kernel); - output = csi_ref_nchw_to_nhwc_8(o_output); + struct csinn_tensor *input; + struct csinn_tensor *output; + struct csinn_tensor *kernel; + struct csinn_tensor *bias = o_bias; + input = shl_ref_nchw_to_nhwc_8(o_input); + kernel = shl_ref_nchw_to_nhwc_8(o_kernel); + output = shl_ref_nchw_to_nhwc_8(o_output); uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -120,14 +121,16 @@ static int csi_ref_depthwise_conv2d_channel_nchw_u8(struct csi_tensor *o_input, const int32_t output_shift = output->qinfo->shift; for (int32_t b = 0; b < batches; ++b) { - #pragma omp parallel for num_threads(8) +#pragma omp parallel for num_threads(8) for (int32_t out_y = 0; out_y < output_height; ++out_y) { for 
(int32_t out_x = 0; out_x < output_width; ++out_x) { for (int32_t ic = 0; ic < input_depth; ++ic) { for (int32_t m = 0; m < depth_multiplier; m++) { const int32_t oc = m + ic * depth_multiplier; - const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; - const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + const int32_t in_x_origin = + (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = + (out_y * params->stride_height) - params->pad_top; int64_t acc = 0; for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { @@ -138,12 +141,12 @@ static int csi_ref_depthwise_conv2d_channel_nchw_u8(struct csi_tensor *o_input, // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32_t input_val = - input_data[csi_ref_get_index(input->dim, b, in_y, in_x, ic)]; - int32_t filter_val = kernel_data[csi_ref_get_index( + int32_t input_val = input_data[shl_ref_get_index( + input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[shl_ref_get_index( kernel->dim, ic, filter_y, filter_x, m)]; - acc += - (filter_val - o_kernel->qinfo[oc].zero_point) * (input_val - input_offset); + acc += (filter_val - o_kernel->qinfo[oc].zero_point) * + (input_val - input_offset); } } } @@ -151,34 +154,35 @@ static int csi_ref_depthwise_conv2d_channel_nchw_u8(struct csi_tensor *o_input, acc += bias_data[oc]; } - uint8_t out = csi_ref_quantize_channel_u8(acc, input, output, o_kernel->qinfo[oc].scale); - output_data[csi_ref_get_index(output->dim, b, out_y, out_x, oc)] = out; + uint8_t out = shl_ref_quantize_channel_u8(acc, input, output, + o_kernel->qinfo[oc].scale); + output_data[shl_ref_get_index(output->dim, b, out_y, out_x, oc)] = out; } } } } } - csi_ref_nhwc_to_nchw_8(o_output, output); - csi_mem_free(input->data); - csi_mem_free(input); - csi_mem_free(kernel->data); - 
csi_mem_free(kernel); + shl_ref_nhwc_to_nchw_8(o_output, output); + shl_mem_free(input->data); + shl_mem_free(input); + shl_mem_free(kernel->data); + shl_mem_free(kernel); return CSINN_TRUE; } -static int csi_ref_depthwise_conv2d_channel_nchw_i8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_depthwise_conv2d_channel_nchw_i8(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor* input; - struct csi_tensor* output; - struct csi_tensor* kernel; - struct csi_tensor* bias = o_bias; - input = csi_ref_nchw_to_nhwc_8(o_input); - kernel = csi_ref_nchw_to_nhwc_8(o_kernel); - output = csi_ref_nchw_to_nhwc_8(o_output); + struct csinn_tensor *input; + struct csinn_tensor *output; + struct csinn_tensor *kernel; + struct csinn_tensor *bias = o_bias; + input = shl_ref_nchw_to_nhwc_8(o_input); + kernel = shl_ref_nchw_to_nhwc_8(o_kernel); + output = shl_ref_nchw_to_nhwc_8(o_output); int8_t *input_data = input->data; int8_t *output_data = output->data; @@ -202,14 +206,16 @@ static int csi_ref_depthwise_conv2d_channel_nchw_i8(struct csi_tensor *o_input, const int32_t output_shift = output->qinfo->shift; for (int32_t b = 0; b < batches; ++b) { - #pragma omp parallel for num_threads(8) +#pragma omp parallel for num_threads(8) for (int32_t out_y = 0; out_y < output_height; ++out_y) { for (int32_t out_x = 0; out_x < output_width; ++out_x) { for (int32_t ic = 0; ic < input_depth; ++ic) { for (int32_t m = 0; m < depth_multiplier; m++) { const int32_t oc = m + ic * depth_multiplier; - const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; - const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + const int32_t in_x_origin = + (out_x * params->stride_width) - params->pad_left; + const int32_t 
in_y_origin = + (out_y * params->stride_height) - params->pad_top; int64_t acc = 0; for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { @@ -220,12 +226,12 @@ static int csi_ref_depthwise_conv2d_channel_nchw_i8(struct csi_tensor *o_input, // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32_t input_val = - input_data[csi_ref_get_index(input->dim, b, in_y, in_x, ic)]; - int32_t filter_val = kernel_data[csi_ref_get_index( + int32_t input_val = input_data[shl_ref_get_index( + input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[shl_ref_get_index( kernel->dim, ic, filter_y, filter_x, m)]; - acc += - (filter_val - o_kernel->qinfo[oc].zero_point) * (input_val - input_offset); + acc += (filter_val - o_kernel->qinfo[oc].zero_point) * + (input_val - input_offset); } } } @@ -233,38 +239,39 @@ static int csi_ref_depthwise_conv2d_channel_nchw_i8(struct csi_tensor *o_input, acc += bias_data[oc]; } - int8_t out = csi_ref_quantize_channel_i8(acc, input, output, o_kernel->qinfo[oc].scale); - output_data[csi_ref_get_index(output->dim, b, out_y, out_x, oc)] = out; + int8_t out = shl_ref_quantize_channel_i8(acc, input, output, + o_kernel->qinfo[oc].scale); + output_data[shl_ref_get_index(output->dim, b, out_y, out_x, oc)] = out; } } } } } - csi_ref_nhwc_to_nchw_8(o_output, output); - csi_mem_free(input->data); - csi_mem_free(input); - csi_mem_free(kernel->data); - csi_mem_free(kernel); + shl_ref_nhwc_to_nchw_8(o_output, output); + shl_mem_free(input->data); + shl_mem_free(input); + shl_mem_free(kernel->data); + shl_mem_free(kernel); return CSINN_TRUE; } -static int csi_ref_group_conv2d_channel_nchw_quant(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_group_conv2d_channel_nchw_quant(struct 
csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params pparams; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params pparams; - csi_tensor_copy(input, o_input); - csi_tensor_copy(output, o_output); - csi_tensor_copy(kernel, o_kernel); - csi_tensor_copy(bias, o_bias); - memcpy(&pparams, params, sizeof(struct conv2d_params)); + csinn_tensor_copy(input, o_input); + csinn_tensor_copy(output, o_output); + csinn_tensor_copy(kernel, o_kernel); + csinn_tensor_copy(bias, o_bias); + memcpy(&pparams, params, sizeof(struct csinn_conv2d_params)); input->dim[1] /= params->group; output->dim[1] /= params->group; @@ -272,9 +279,9 @@ static int csi_ref_group_conv2d_channel_nchw_quant(struct csi_tensor *o_input, bias->dim[0] /= params->group; pparams.group = 1; - int input_size = csi_tensor_size(input); - int output_size = csi_tensor_size(output); - int kernel_size = csi_tensor_size(kernel); + int input_size = csinn_tensor_size(input); + int output_size = csinn_tensor_size(output); + int kernel_size = csinn_tensor_size(kernel); int8_t *input_data = o_input->data; int8_t *output_data = o_output->data; @@ -289,64 +296,55 @@ static int csi_ref_group_conv2d_channel_nchw_quant(struct csi_tensor *o_input, } kernel->qinfo = o_kernel->qinfo + i * o_output->dim[1] / params->group; - csi_ref_conv2d_channel_nchw_quant(input, output, kernel, bias, &pparams); + shl_ref_conv2d_channel_nchw_quant(input, output, kernel, bias, &pparams); } return CSINN_TRUE; } 
-int csi_ref_conv2d_channel_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_conv2d_channel_nchw_quant(input, output, kernel, bias, params); + shl_ref_conv2d_channel_nchw_quant(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_conv2d_channel_relu_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_channel_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_channel_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); return CSINN_TRUE; } -int csi_ref_conv2d_channel_relu6_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_channel_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_channel_quant(input, 
output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); return CSINN_TRUE; } - -int csi_ref_depthwise_conv2d_channel_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { if (input->dtype == CSINN_DTYPE_UINT8) { - csi_ref_depthwise_conv2d_channel_nchw_u8(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_channel_nchw_u8(input, output, kernel, bias, params); } else if (input->dtype == CSINN_DTYPE_INT8) { - csi_ref_depthwise_conv2d_channel_nchw_i8(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_channel_nchw_i8(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_DTYPE; } @@ -355,54 +353,50 @@ int csi_ref_depthwise_conv2d_channel_quant(struct csi_tensor *input, } } -int csi_ref_depthwise_conv2d_channel_relu_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_channel_relu_quant(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_channel_quant(input, 
output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_depthwise_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); } -int csi_ref_depthwise_conv2d_channel_relu6_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_channel_relu6_quant(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_channel_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_depthwise_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); } -int csi_ref_group_conv2d_channel_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - 
csi_ref_group_conv2d_channel_nchw_quant(input, output, kernel, bias, params); + shl_ref_group_conv2d_channel_nchw_quant(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_group_conv2d_channel_relu_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_channel_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_group_conv2d_channel_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_group_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); } diff --git a/source/reference/convolution_relu.c b/source/reference/convolution_relu.c index 34d6880c..c05d0d3f 100644 --- a/source/reference/convolution_relu.c +++ b/source/reference/convolution_relu.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_conv2d_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_f32(input, output, kernel, bias, params); + shl_ref_conv2d_f32(input, output, kernel, bias, params); float *data = output->data; - int size = csi_tensor_size(output); + int size = csinn_tensor_size(output); for (int i = 0; i < size; i++) { data[i] = data[i] > 0 ? data[i] : 0; } return CSINN_TRUE; } -int csi_ref_conv2d_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); return CSINN_TRUE; } -int csi_ref_depthwise_conv2d_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_f32(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_f32(input, output, kernel, bias, params); float *data = output->data; - int size = csi_tensor_size(output); + int size = csinn_tensor_size(output); for (int i = 0; i < size; i++) { data[i] = data[i] > 0 ? data[i] : 0; } return CSINN_TRUE; } -int csi_ref_depthwise_conv2d_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_depthwise_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); return CSINN_TRUE; } -int csi_ref_group_conv2d_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_group_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - 
csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_group_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); return CSINN_TRUE; } diff --git a/source/reference/convolution_relu6.c b/source/reference/convolution_relu6.c index 9a5f2447..6b98d0fe 100644 --- a/source/reference/convolution_relu6.c +++ b/source/reference/convolution_relu6.c @@ -16,43 +16,43 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); return CSINN_TRUE; } -int csi_ref_depthwise_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor 
*kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_depthwise_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); return CSINN_TRUE; } -int csi_ref_group_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_group_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_group_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); return CSINN_TRUE; } diff --git a/source/reference/cos.c b/source/reference/cos.c index 01aca588..5887c17b 100644 --- a/source/reference/cos.c +++ b/source/reference/cos.c @@ -16,15 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cos_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_cos_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = cos(input_data[i]); @@ -32,8 +33,8 @@ int csi_ref_cos_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_cos_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_cos_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_cos_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_cos_f32); } diff --git a/source/reference/cosh.c b/source/reference/cosh.c index 5613cd13..ebaf7e2c 100644 --- a/source/reference/cosh.c +++ b/source/reference/cosh.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cosh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_cosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = cosh(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_cosh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_cosh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_cosh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_cosh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_cosh_f32); } diff --git a/source/reference/cumprod.c b/source/reference/cumprod.c index dade805a..e765efda 100644 --- a/source/reference/cumprod.c +++ b/source/reference/cumprod.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cumprod_f32(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params) +int shl_ref_cumprod_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -58,8 +58,8 @@ int csi_ref_cumprod_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_cumprod_quant(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params) +int shl_ref_cumprod_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_cumprod_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_cumprod_f32); } diff --git a/source/reference/cumsum.c b/source/reference/cumsum.c index 5320fe55..dec6d27f 100644 --- a/source/reference/cumsum.c +++ b/source/reference/cumsum.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cumsum_f32(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params) +int shl_ref_cumsum_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -58,8 +58,8 @@ int csi_ref_cumsum_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_cumsum_quant(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params) +int shl_ref_cumsum_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_cumsum_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_cumsum_f32); } diff --git a/source/reference/data_convert.c b/source/reference/data_convert.c index 50f091d7..6e2de713 100644 --- a/source/reference/data_convert.c +++ b/source/reference/data_convert.c @@ -18,21 +18,21 @@ /* CSI-NN2 version 1.11.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_data_convert_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_data_convert_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size_byte = csi_tensor_byte_size(input); + int size_byte = csinn_tensor_byte_size(input); memcpy(output_data, input_data, size_byte); return CSINN_TRUE; } -int csi_ref_data_convert_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_data_convert_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, 
csi_ref_data_convert_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_data_convert_f32); } diff --git a/source/reference/deconvolution.c b/source/reference/deconvolution.c index 23acf0d2..a412b384 100644 --- a/source/reference/deconvolution.c +++ b/source/reference/deconvolution.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_ref_deconv2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,7 +39,7 @@ static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor const int output_width = output->dim[2]; const int output_batch = output->dim[0]; - int num_elements = csi_tensor_size(output); + int num_elements = csinn_tensor_size(output); memset(output_data, 0, num_elements * sizeof(float)); // Loop through input elements one at a time. @@ -59,11 +59,11 @@ static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor // We cannot accumulate out of bounds. 
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && (out_y < output_height)) { - float input_value = input_data[csi_ref_get_index( + float input_value = input_data[shl_ref_get_index( input->dim, batch, in_y, in_x, in_channel)]; - float filter_value = filter_data[csi_ref_get_index( + float filter_value = filter_data[shl_ref_get_index( kernel->dim, out_channel, filter_y, filter_x, in_channel)]; - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, out_channel)] += input_value * filter_value; } @@ -80,7 +80,7 @@ static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor for (int o_y = 0; o_y < output_height; o_y++) { for (int o_x = 0; o_x < output_width; o_x++) { for (int o_channel = 0; o_channel < output_depth; ++o_channel) { - output_data[csi_ref_get_index(output->dim, batch, o_y, o_x, o_channel)] += + output_data[shl_ref_get_index(output->dim, batch, o_y, o_x, o_channel)] += bias_data[o_channel]; } } @@ -91,26 +91,26 @@ static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -static int csi_ref_deconv2d_nchw_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_deconv2d_nchw_f32(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_ref_nchw_to_nhwc_f32(o_input); - struct csi_tensor *output = csi_ref_nchw_to_nhwc_f32(o_output); + struct csinn_tensor *input = shl_ref_nchw_to_nhwc_f32(o_input); + struct csinn_tensor *output = shl_ref_nchw_to_nhwc_f32(o_output); int32_t permute[4] = {1, 2, 3, 0}; - struct csi_tensor *kernel = csi_ref_deconv_kernel_nchw_to_nhwc_f32(o_kernel, permute); - struct csi_tensor *bias = o_bias; + struct csinn_tensor *kernel = 
shl_ref_deconv_kernel_nchw_to_nhwc_f32(o_kernel, permute); + struct csinn_tensor *bias = o_bias; - csi_ref_deconv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_deconv2d_nhwc_f32(input, output, kernel, bias, params); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); return CSINN_TRUE; } -int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_deconv2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -127,7 +127,7 @@ int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_ten const int output_width = output->dim[2]; const int output_batch = output->dim[0]; - int num_elements = csi_tensor_size(output); + int num_elements = csinn_tensor_size(output); memset(output_data, 0, num_elements * sizeof(float)); // Loop through input elements one at a time. @@ -146,11 +146,11 @@ int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_ten // We cannot accumulate out of bounds. 
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && (out_y < output_height)) { - float input_value = input_data[csi_ref_get_index( + float input_value = input_data[shl_ref_get_index( input->dim, batch, in_y, in_x, in_channel)]; - float filter_value = filter_data[csi_ref_get_index( + float filter_value = filter_data[shl_ref_get_index( kernel->dim, 0, filter_y, filter_x, in_channel)]; - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, in_channel)] += input_value * filter_value; } @@ -165,7 +165,7 @@ int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_ten for (int o_y = 0; o_y < output_height; o_y++) { for (int o_x = 0; o_x < output_width; o_x++) { for (int o_channel = 0; o_channel < output_depth; ++o_channel) { - output_data[csi_ref_get_index(output->dim, batch, o_y, o_x, o_channel)] += + output_data[shl_ref_get_index(output->dim, batch, o_y, o_x, o_channel)] += bias_data[o_channel]; } } @@ -176,59 +176,59 @@ int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_ten return CSINN_TRUE; } -int csi_ref_depthwise_deconv2d_nchw_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params) +int shl_ref_depthwise_deconv2d_nchw_f32(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_ref_nchw_to_nhwc_f32(o_input); - struct csi_tensor *output = csi_ref_nchw_to_nhwc_f32(o_output); + struct csinn_tensor *input = shl_ref_nchw_to_nhwc_f32(o_input); + struct csinn_tensor *output = shl_ref_nchw_to_nhwc_f32(o_output); int32_t permute[4] = {1, 2, 3, 0}; - struct csi_tensor *kernel = csi_ref_deconv_kernel_nchw_to_nhwc_f32(o_kernel, permute); - struct csi_tensor *bias = o_bias; - csi_ref_depthwise_deconv2d_nhwc_f32(input, 
output, kernel, bias, params); + struct csinn_tensor *kernel = shl_ref_deconv_kernel_nchw_to_nhwc_f32(o_kernel, permute); + struct csinn_tensor *bias = o_bias; + shl_ref_depthwise_deconv2d_nhwc_f32(input, output, kernel, bias, params); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); return CSINN_TRUE; } -int csi_ref_depthwise_deconv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_deconv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_depthwise_deconv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_depthwise_deconv2d_nchw_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_depthwise_deconv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_depthwise_deconv2d_nhwc_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_depthwise_deconv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_deconv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - return csi_ref_conv_callback_base(input, output, kernel, bias, params, - csi_ref_depthwise_deconv2d_f32); + return shl_ref_conv_callback_base(input, output, kernel, bias, params, + shl_ref_depthwise_deconv2d_f32); } -int csi_ref_deconv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int 
shl_ref_deconv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_deconv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_deconv2d_nchw_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_deconv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_deconv2d_nhwc_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_deconv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_deconv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - return csi_ref_conv_callback_base(input, output, kernel, bias, params, csi_ref_deconv2d_f32); + return shl_ref_conv_callback_base(input, output, kernel, bias, params, shl_ref_deconv2d_f32); } diff --git a/source/reference/deconvolution3d.c b/source/reference/deconvolution3d.c index 35e5899f..10c0164d 100644 --- a/source/reference/deconvolution3d.c +++ b/source/reference/deconvolution3d.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" // input: NCDHW // kernel: IODHW // output: NODHW -int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params) +int shl_ref_deconv3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -55,7 +55,7 @@ int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, num_elements *= output->dim[i]; } // We need to initialize scratch_buffer to all 0s - float *scratch_buffer = csi_mem_alloc(num_elements * sizeof(float)); + float *scratch_buffer = shl_mem_alloc(num_elements * sizeof(float)); // Loop through input elements one at a time. for (int out_b = 0; out_b < batch; ++out_b) { @@ -80,14 +80,14 @@ int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, if ((out_d >= 0) && (out_d < output_depth) && (out_h >= 0) && (out_h < output_height) && (out_w >= 0) && (out_w < output_width)) { - int32_t input_idx = csi_ref_get_index_5( + int32_t input_idx = shl_ref_get_index_5( input->dim, out_b, in_ch, in_d, in_h, in_w); float input_val = input_data[input_idx]; int32_t filter_idx = - csi_ref_get_index_5(kernel->dim, in_ch, out_ch, + shl_ref_get_index_5(kernel->dim, in_ch, out_ch, filter_d, filter_h, filter_w); float filter_val = kernel_data[filter_idx]; - int32_t output_idx = csi_ref_get_index_5( + int32_t output_idx = shl_ref_get_index_5( output->dim, out_b, out_ch, out_d, out_h, out_w); scratch_buffer[output_idx] += input_val * filter_val; } @@ -107,7 +107,7 @@ int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, for (int out_d = 0; out_d < output_depth; ++out_d) { for (int out_h = 0; out_h < output_height; ++out_h) { 
for (int out_w = 0; out_w < output_width; ++out_w) { - int32_t out_idx = csi_ref_get_index_5(output->dim, out_b, out_ch, out_d, + int32_t out_idx = shl_ref_get_index_5(output->dim, out_b, out_ch, out_d, out_h, out_w); scratch_buffer[out_idx] += bias_data[out_ch]; } @@ -119,13 +119,13 @@ int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, for (int i = 0; i < num_elements; ++i) { output_data[i] = scratch_buffer[i]; } - csi_mem_free(scratch_buffer); + shl_mem_free(scratch_buffer); return CSINN_TRUE; } -int csi_ref_deconv3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params) +int shl_ref_deconv3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - return csi_ref_conv_callback_base(input, output, kernel, bias, params, csi_ref_deconv3d_f32); + return shl_ref_conv_callback_base(input, output, kernel, bias, params, shl_ref_deconv3d_f32); } \ No newline at end of file diff --git a/source/reference/depth_to_space.c b/source/reference/depth_to_space.c index 3057d2f6..05c3b62c 100644 --- a/source/reference/depth_to_space.c +++ b/source/reference/depth_to_space.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" // the input->data is a 4-D Tensor with shape [batch, depth, height, width]. 
-int csi_ref_depth_to_space_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_ref_depth_to_space_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { if (params->mode == CSINN_DEPTHTOSPACE_CRD) return CSINN_FALSE; float *input_data = (float *)input->data; @@ -45,13 +45,13 @@ int csi_ref_depth_to_space_nchw_f32(struct csi_tensor *input, struct csi_tensor for (int in_h = 0; in_h < in_height; ++in_h) { for (int in_w = 0; in_w < in_width; ++in_w) { for (int out_c = 0; out_c < out_channel; ++out_c) { - float *temp = (float *)csi_mem_alloc(block_size2 * sizeof(float)); - int in_start_addr = csi_ref_get_index(input->dim, out_b, out_c, in_h, in_w); + float *temp = (float *)shl_mem_alloc(block_size2 * sizeof(float)); + int in_start_addr = shl_ref_get_index(input->dim, out_b, out_c, in_h, in_w); for (int i = 0; i < block_size2; i++) { temp[i] = input_data[in_start_addr + i * out_channel * in_height * in_width]; } - int out_start_addr = csi_ref_get_index(output->dim, out_b, out_c, + int out_start_addr = shl_ref_get_index(output->dim, out_b, out_c, in_h * block_size, in_w * block_size); for (int h = 0; h < block_size; h++) { for (int w = 0; w < block_size; w++) { @@ -59,7 +59,7 @@ int csi_ref_depth_to_space_nchw_f32(struct csi_tensor *input, struct csi_tensor temp[h * block_size + w]; } } - csi_mem_free(temp); + shl_mem_free(temp); } } } @@ -67,64 +67,63 @@ int csi_ref_depth_to_space_nchw_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_depth_to_space_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_ref_depth_to_space_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - struct csi_tensor *t_input = csi_alloc_tensor(NULL); - csi_tensor_copy(t_input, input); + struct csinn_tensor *t_input = 
csinn_alloc_tensor(NULL); + csinn_tensor_copy(t_input, input); t_input->layout = CSINN_LAYOUT_NCHW; - t_input->data = malloc(csi_tensor_size(input) * sizeof(float)); + t_input->data = malloc(csinn_tensor_size(input) * sizeof(float)); t_input->dim[1] = input->dim[3]; t_input->dim[2] = input->dim[1]; t_input->dim[3] = input->dim[2]; - struct transpose_params pparams; + struct csinn_transpose_params pparams; pparams.permute_num = 4; pparams.base.layout = CSINN_LAYOUT_NCHW; pparams.base.api = CSINN_REF; - pparams.base.run_mode = CSINN_RM_LAYER; pparams.base.name = params->base.name; pparams.permute = malloc(pparams.permute_num * sizeof(int32_t)); pparams.permute[0] = 0; pparams.permute[1] = 3; pparams.permute[2] = 1; pparams.permute[3] = 2; - csi_ref_transpose(input, t_input, &pparams); + shl_ref_transpose(input, t_input, &pparams); - struct csi_tensor *t_output = csi_alloc_tensor(NULL); - csi_tensor_copy(t_output, output); + struct csinn_tensor *t_output = csinn_alloc_tensor(NULL); + csinn_tensor_copy(t_output, output); t_output->layout = CSINN_LAYOUT_NCHW; - t_output->data = malloc(csi_tensor_size(output) * sizeof(float)); + t_output->data = malloc(csinn_tensor_size(output) * sizeof(float)); t_output->dim[1] = output->dim[3]; t_output->dim[2] = output->dim[1]; t_output->dim[3] = output->dim[2]; - csi_ref_depth_to_space_nchw_f32(t_input, t_output, params); + shl_ref_depth_to_space_nchw_f32(t_input, t_output, params); pparams.permute[0] = 0; pparams.permute[1] = 2; pparams.permute[2] = 3; pparams.permute[3] = 1; - csi_ref_transpose(t_output, output, &pparams); + shl_ref_transpose(t_output, output, &pparams); - csi_free_tensor(t_input); - csi_free_tensor(t_output); + csinn_free_tensor(t_input); + csinn_free_tensor(t_output); free(pparams.permute); return CSINN_TRUE; } -int csi_ref_depth_to_space_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_ref_depth_to_space_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_depth_to_space_params *params) { if (input->layout == CSINN_LAYOUT_NCHW) { - return csi_ref_depth_to_space_nchw_f32(input, output, params); + return shl_ref_depth_to_space_nchw_f32(input, output, params); } else if (input->layout == CSINN_LAYOUT_NHWC) { - return csi_ref_depth_to_space_nhwc_f32(input, output, params); + return shl_ref_depth_to_space_nhwc_f32(input, output, params); } return CSINN_FALSE; } -int csi_ref_depth_to_space_quant(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_ref_depth_to_space_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_depth_to_space_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_depth_to_space_f32); } diff --git a/source/reference/div.c b/source/reference/div.c index cf7ac84b..cd162f00 100644 --- a/source/reference/div.c +++ b/source/reference/div.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" static void element_div_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = src0[output_idx] / src1[input_idx]; } -int csi_ref_div_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_div_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_div_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_div_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_div_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_div_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_div_f32); } diff --git a/source/reference/elu.c b/source/reference/elu.c index 4b1eadb0..96ca13ea 100644 --- a/source/reference/elu.c +++ b/source/reference/elu.c @@ -16,13 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" static float elu(float x) { return x < 0.0 ? 
exp(x) - 1 : x; } -int csi_ref_elu_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params) +int shl_ref_elu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +38,8 @@ int csi_ref_elu_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_elu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_elu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_elu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_elu_f32); } diff --git a/source/reference/equal.c b/source/reference/equal.c index 40cfd579..a20645a2 100644 --- a/source/reference/equal.c +++ b/source/reference/equal.c @@ -16,17 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; bool *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] == input1_data[i]; @@ -34,14 +34,14 @@ int csi_ref_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int ret; - struct csi_tensor *finput0 = csi_ref_tensor_transform_f32(input0); - struct csi_tensor *finput1 = csi_ref_tensor_transform_f32(input1); - ret = csi_ref_equal_f32(finput0, finput1, output, params); - csi_ref_tensor_transform_free_f32(finput0); - csi_ref_tensor_transform_free_f32(finput1); + struct csinn_tensor *finput0 = shl_ref_tensor_transform_f32(input0); + struct csinn_tensor *finput1 = shl_ref_tensor_transform_f32(input1); + ret = shl_ref_equal_f32(finput0, finput1, output, params); + shl_ref_tensor_transform_free_f32(finput0); + shl_ref_tensor_transform_free_f32(finput1); return ret; } diff --git a/source/reference/erf.c b/source/reference/erf.c index cdedfde4..52486b7c 100644 --- a/source/reference/erf.c +++ b/source/reference/erf.c @@ -16,11 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_erf_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_erf_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -35,8 +36,8 @@ int csi_ref_erf_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_erf_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_erf_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_erf_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_erf_f32); } diff --git a/source/reference/exp.c b/source/reference/exp.c index b75eb577..2cfe81ff 100644 --- a/source/reference/exp.c +++ b/source/reference/exp.c @@ -16,11 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_exp_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_exp_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -35,8 +36,8 @@ int csi_ref_exp_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_exp_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_exp_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_exp_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_exp_f32); } diff --git a/source/reference/expand_dims.c b/source/reference/expand_dims.c index 18c72aa2..8a30a045 100644 --- a/source/reference/expand_dims.c +++ b/source/reference/expand_dims.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_expand_dims_f32(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params) +int shl_ref_expand_dims_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -35,8 +35,8 @@ int csi_ref_expand_dims_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_expand_dims_quant(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params) +int shl_ref_expand_dims_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_expand_dims_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_expand_dims_f32); } diff --git a/source/reference/expm1.c b/source/reference/expm1.c index 9ffeb4f3..f53a546d 100644 --- a/source/reference/expm1.c +++ b/source/reference/expm1.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_expm1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_expm1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,8 +36,8 @@ int csi_ref_expm1_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_expm1_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_expm1_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_expm1_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_expm1_f32); } diff --git a/source/reference/flatten.c b/source/reference/flatten.c index 4df53721..acaee181 100644 --- a/source/reference/flatten.c +++ b/source/reference/flatten.c @@ -16,27 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_flatten_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_ref_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { + struct csinn_callback *cb = params->base.cb; if (input->quant_channel == output->quant_channel) { - int quant_size = input->quant_channel * sizeof(struct csi_quant_info); + int quant_size = input->quant_channel * sizeof(struct csinn_quant_info); int t = memcmp(input->qinfo, output->qinfo, quant_size); if (t == 0) { - params->base.bc = csi_ref_flatten; + cb->exec = shl_ref_flatten; return CSINN_TRUE; } } - params->base.bc = csi_ref_flatten_quant; + cb->exec = shl_ref_flatten_quant; return CSINN_TRUE; } -int csi_ref_flatten(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params) +int shl_ref_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -45,14 +46,14 @@ int csi_ref_flatten(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } - int size = csi_tensor_byte_size(input); + int size = csinn_tensor_byte_size(input); memcpy(output_data, input_data, size); return CSINN_TRUE; } -int csi_ref_flatten_quant(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params) +int shl_ref_flatten_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_flatten); + return shl_ref_siso_callback_base(input, output, params, shl_ref_flatten); } diff --git a/source/reference/floor.c b/source/reference/floor.c index 736a3efb..e3ea9602 100644 --- a/source/reference/floor.c +++ b/source/reference/floor.c @@ -16,12 +16,12 @@ * limitations 
under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_floor_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_floor_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -36,8 +36,8 @@ int csi_ref_floor_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_floor_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_floor_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_floor_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_floor_f32); } diff --git a/source/reference/floor_divide.c b/source/reference/floor_divide.c index b139b95e..30e130d6 100644 --- a/source/reference/floor_divide.c +++ b/source/reference/floor_divide.c @@ -16,17 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_floor_divide_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_floor_divide_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; float *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = floor(input0_data[i] / input1_data[i]); @@ -34,8 +34,8 @@ int csi_ref_floor_divide_f32(struct csi_tensor *input0, struct csi_tensor *input return CSINN_TRUE; } -int csi_ref_floor_divide_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_floor_divide_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_floor_divide_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_floor_divide_f32); } diff --git a/source/reference/floor_mod.c b/source/reference/floor_mod.c index b8d9796e..9722ea20 100644 --- a/source/reference/floor_mod.c +++ b/source/reference/floor_mod.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_floor_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_floor_mod_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -39,8 +39,8 @@ int csi_ref_floor_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_floor_mod_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_floor_mod_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_floor_mod_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_floor_mod_f32); } diff --git a/source/reference/fsmn.c b/source/reference/fsmn.c index 52ffe84f..dcb68b8f 100644 --- a/source/reference/fsmn.c +++ b/source/reference/fsmn.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float fsmn(float x) { return x > 0 ? 
x : 0; } -int csi_ref_fsmn_f32(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params) +int shl_ref_fsmn_f32(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { float *last_frame = frame->data; float *past_filter = l_filter->data; @@ -85,25 +84,25 @@ int csi_ref_fsmn_f32(struct csi_tensor *frame, struct csi_tensor *l_filter, return CSINN_TRUE; } -int csi_ref_fsmn_quant(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_count, struct csi_tensor *output, - struct fsmn_params *params) +int shl_ref_fsmn_quant(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_count, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { - struct csi_tensor *float_frame = csi_ref_tensor_transform_f32(frame); - struct csi_tensor *float_l_filter = csi_ref_tensor_transform_f32(l_filter); - struct csi_tensor *float_r_filter = csi_ref_tensor_transform_f32(r_filter); - struct csi_tensor *float_frame_sequence = csi_ref_tensor_transform_f32(frame_sequence); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *float_frame = shl_ref_tensor_transform_f32(frame); + struct csinn_tensor *float_l_filter = shl_ref_tensor_transform_f32(l_filter); + struct csinn_tensor *float_r_filter = shl_ref_tensor_transform_f32(r_filter); + struct csinn_tensor *float_frame_sequence = shl_ref_tensor_transform_f32(frame_sequence); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); - int ret = 
csi_ref_fsmn_f32(float_frame, float_l_filter, float_r_filter, float_frame_sequence, + int ret = shl_ref_fsmn_f32(float_frame, float_l_filter, float_r_filter, float_frame_sequence, frame_count, float_output, params); - csi_tensor_data_convert(output, float_output); - csi_tensor_data_convert(frame_sequence, float_frame_sequence); - csi_ref_tensor_transform_free_f32(float_frame); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_l_filter); - csi_ref_tensor_transform_free_f32(float_r_filter); - csi_ref_tensor_transform_free_f32(float_frame_sequence); + csinn_tensor_data_convert(output, float_output); + csinn_tensor_data_convert(frame_sequence, float_frame_sequence); + shl_ref_tensor_transform_free_f32(float_frame); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_l_filter); + shl_ref_tensor_transform_free_f32(float_r_filter); + shl_ref_tensor_transform_free_f32(float_frame_sequence); return ret; } diff --git a/source/reference/fullyconnected.c b/source/reference/fullyconnected.c index e2bec8af..ed3138d2 100644 --- a/source/reference/fullyconnected.c +++ b/source/reference/fullyconnected.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_fullyconnected_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) +int shl_ref_fullyconnected_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -53,20 +53,20 @@ int csi_ref_fullyconnected_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_fullyconnected_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) +int shl_ref_fullyconnected_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_kernel = csi_ref_tensor_transform_f32(weights); - struct csi_tensor *float_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_kernel = shl_ref_tensor_transform_f32(weights); + struct csinn_tensor *float_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); if (params->fc_extra.fuse_zp2bias) { float *float_bias_data = float_bias->data; float *float_kernel_data = float_kernel->data; int k_len = weights->dim[0]; - int k_inner = csi_tensor_size(weights) / k_len; + int k_inner = csinn_tensor_size(weights) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float t_k = 0; @@ -79,11 +79,11 @@ int 
csi_ref_fullyconnected_quant(struct csi_tensor *input, struct csi_tensor *ou } int ret = - csi_ref_fullyconnected_f32(float_input, float_output, float_kernel, float_bias, params); - csi_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_kernel); - csi_ref_tensor_transform_free_f32(float_bias); + shl_ref_fullyconnected_f32(float_input, float_output, float_kernel, float_bias, params); + csinn_tensor_data_convert(output, float_output); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_kernel); + shl_ref_tensor_transform_free_f32(float_bias); return ret; } diff --git a/source/reference/gather.c b/source/reference/gather.c index 44a5c223..1f2df02b 100644 --- a/source/reference/gather.c +++ b/source/reference/gather.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_gather_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params) +int shl_ref_gather_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -55,14 +55,14 @@ int csi_ref_gather_f32(struct csi_tensor *input, struct csi_tensor *indices, return CSINN_TRUE; } -int csi_ref_gather_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params) +int shl_ref_gather_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_gather_f32(finput, indices, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_gather_f32(finput, indices, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); } diff --git a/source/reference/gather_nd.c b/source/reference/gather_nd.c index 9632c807..af22b1b3 100644 --- a/source/reference/gather_nd.c +++ b/source/reference/gather_nd.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" static int Multiplication(int32_t *input, int s, int e) { @@ -29,8 +29,8 @@ static int Multiplication(int32_t *input, int s, int e) return res; } -int csi_ref_gather_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params) +int shl_ref_gather_nd_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -88,15 +88,15 @@ int csi_ref_gather_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, return CSINN_TRUE; } -int csi_ref_gather_nd_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params) +int shl_ref_gather_nd_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_gather_nd_f32(finput, indices, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_gather_nd_f32(finput, indices, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/global_averagepool.c b/source/reference/global_averagepool.c index 67138b6d..05df7a6d 100644 --- a/source/reference/global_averagepool.c +++ b/source/reference/global_averagepool.c @@ -16,12 +16,12 @@ * 
limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_global_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_global_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { params->stride_height = 1; params->stride_width = 1; @@ -41,11 +41,11 @@ int csi_ref_global_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *ou } else { return CSINN_UNSUPPORT_LAYOUT; } - csi_ref_avgpool2d_f32(input, output, params); + shl_ref_avgpool2d_f32(input, output, params); } -int csi_ref_global_avgpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_global_avgpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_global_avgpool2d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_global_avgpool2d_f32); } \ No newline at end of file diff --git a/source/reference/global_maxpool.c b/source/reference/global_maxpool.c index c681a213..5e75ea8d 100644 --- a/source/reference/global_maxpool.c +++ b/source/reference/global_maxpool.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_global_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { params->stride_height = 1; params->stride_width = 1; @@ -41,11 +41,11 @@ int csi_ref_global_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *ou } else { return CSINN_UNSUPPORT_LAYOUT; } - csi_ref_maxpool2d_f32(input, output, params); + shl_ref_maxpool2d_f32(input, output, params); } -int csi_ref_global_maxpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_global_maxpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_global_maxpool2d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_global_maxpool2d_f32); } \ No newline at end of file diff --git a/source/reference/greater.c b/source/reference/greater.c index 99467882..681922ff 100644 --- a/source/reference/greater.c +++ b/source/reference/greater.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_greater_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_greater_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_greater_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_greater_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_greater_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_greater_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_greater_f32); } diff --git a/source/reference/greater_equal.c b/source/reference/greater_equal.c index 9e5eb9b3..b1e33fb9 100644 --- a/source/reference/greater_equal.c +++ b/source/reference/greater_equal.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_greater_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_greater_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_greater_equal_f32(struct csi_tensor *input0, struct csi_tensor *inpu return CSINN_TRUE; } -int csi_ref_greater_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_greater_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_greater_equal_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_greater_equal_f32); } diff --git a/source/reference/hard_sigmoid.c b/source/reference/hard_sigmoid.c index 653b55dc..de946518 100644 --- a/source/reference/hard_sigmoid.c +++ b/source/reference/hard_sigmoid.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_hard_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_ref_hard_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -42,8 +41,8 @@ int csi_ref_hard_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output return CSINN_TRUE; } -int csi_ref_hard_sigmoid_quant(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_ref_hard_sigmoid_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_hard_sigmoid_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_hard_sigmoid_f32); } diff --git a/source/reference/im2col.c b/source/reference/im2col.c index 33e5dec8..4d448003 100644 --- a/source/reference/im2col.c +++ b/source/reference/im2col.c @@ -16,16 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" // input_data layout:NCHW // https://github.com/pjreddie/darknet/blob/master/src/im2col.c // output_data: row = channels*ksize_h*ksize_w, col = batch*height_col*width_col -static int csi_ref_im2col_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params) +static int shl_ref_im2col_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -60,7 +59,7 @@ static int csi_ref_im2col_nchw_f32(struct csi_tensor *input, struct csi_tensor * output_data[col_index] = 0.0f; } else { output_data[col_index] = - input_data[csi_ref_get_index(input->dim, b, c_im, im_row, im_col)]; + input_data[shl_ref_get_index(input->dim, b, c_im, im_row, im_col)]; } } } @@ -71,8 +70,8 @@ static int csi_ref_im2col_nchw_f32(struct csi_tensor *input, struct csi_tensor * // input_data layout:NHWC // output_data: row = batch*height_col*width_col, col = channels*ksize_h*ksize_w -static int csi_ref_im2col_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params) +static int shl_ref_im2col_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -108,7 +107,7 @@ static int csi_ref_im2col_nhwc_f32(struct csi_tensor *input, struct csi_tensor * output_data[col_index] = 0.0f; } else { output_data[col_index] = - input_data[csi_ref_get_index(input->dim, b, im_row, im_col, c_im)]; + input_data[shl_ref_get_index(input->dim, b, im_row, im_col, c_im)]; } } } @@ -118,21 +117,21 @@ static int csi_ref_im2col_nhwc_f32(struct csi_tensor *input, struct csi_tensor * return CSINN_TRUE; } -int csi_ref_im2col_f32(struct csi_tensor *input, struct 
csi_tensor *output, - struct im2col_params *params) +int shl_ref_im2col_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_im2col_nchw_f32(input, output, params); + shl_ref_im2col_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_im2col_nhwc_f32(input, output, params); + shl_ref_im2col_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } return CSINN_TRUE; } -int csi_ref_im2col_quant(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params) +int shl_ref_im2col_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_im2col_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_im2col_f32); } diff --git a/source/reference/isnan.c b/source/reference/isnan.c index 51c4bc72..9daa4a81 100644 --- a/source/reference/isnan.c +++ b/source/reference/isnan.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_isnan_bool_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_isnan_bool_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; bool *output_data = output->data; diff --git a/source/reference/l2_normalization.c b/source/reference/l2_normalization.c index 04e2bdc5..eca784dd 100644 --- a/source/reference/l2_normalization.c +++ b/source/reference/l2_normalization.c @@ -16,16 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" /* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/l2normalization.h */ -int csi_ref_l2_normalization_f32(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params) +int shl_ref_l2_normalization_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -51,8 +50,8 @@ int csi_ref_l2_normalization_f32(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_ref_l2_normalization_quant(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params) +int shl_ref_l2_normalization_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_l2_normalization_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_l2_normalization_f32); } diff --git a/source/reference/l2pool.c b/source/reference/l2pool.c index e328fb7a..12349551 100644 --- a/source/reference/l2pool.c +++ b/source/reference/l2pool.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_l2pool_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_l2pool_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -40,26 +39,26 @@ int csi_ref_l2pool_f32(struct csi_tensor *input, struct csi_tensor *output, const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float sum_squares = 0.f; int filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - const float val = input_data[csi_ref_get_index(input->dim, batch, in_y, + const float val = input_data[shl_ref_get_index(input->dim, batch, in_y, in_x, channel)]; sum_squares += val * val; filter_count++; } } const float l2pool_result = sqrt(sum_squares / filter_count); - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, channel)] = + 
output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, channel)] = l2pool_result; } } diff --git a/source/reference/layer_norm.c b/source/reference/layer_norm.c index aa409b34..c858c486 100644 --- a/source/reference/layer_norm.c +++ b/source/reference/layer_norm.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_layer_norm_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params) +int shl_ref_layer_norm_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { int flatten_size = 0; flatten_size *= input->dim[0] * input->dim[1] * input->dim[2]; @@ -68,23 +67,23 @@ int csi_ref_layer_norm_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_layer_norm_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params) +int shl_ref_layer_norm_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); - struct csi_tensor *float_gamma = csi_ref_tensor_transform_f32(gamma); - struct csi_tensor *float_beta = csi_ref_tensor_transform_f32(beta); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); + struct csinn_tensor *float_gamma = shl_ref_tensor_transform_f32(gamma); + struct csinn_tensor *float_beta = shl_ref_tensor_transform_f32(beta); - int ret = 
csi_ref_layer_norm_f32(float_input, float_output, float_gamma, float_beta, params); + int ret = shl_ref_layer_norm_f32(float_input, float_output, float_gamma, float_beta, params); - csi_tensor_data_convert(output, float_output); + csinn_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_gamma); - csi_ref_tensor_transform_free_f32(float_beta); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_gamma); + shl_ref_tensor_transform_free_f32(float_beta); return CSINN_TRUE; } diff --git a/source/reference/leaky_relu.c b/source/reference/leaky_relu.c index 6a089945..67a41404 100644 --- a/source/reference/leaky_relu.c +++ b/source/reference/leaky_relu.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_leaky_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -38,8 +37,8 @@ int csi_ref_leaky_relu_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_leaky_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_leaky_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_leaky_relu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_leaky_relu_f32); } diff --git a/source/reference/less.c b/source/reference/less.c index 08914b7a..44cf36b4 100644 --- 
a/source/reference/less.c +++ b/source/reference/less.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_less_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_less_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_less_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_less_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_less_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_less_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_less_f32); } diff --git a/source/reference/less_equal.c b/source/reference/less_equal.c index c1e70cbf..15da06f5 100644 --- a/source/reference/less_equal.c +++ b/source/reference/less_equal.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_less_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_less_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_less_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_less_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_less_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_less_equal_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_less_equal_f32); } diff --git a/source/reference/log.c b/source/reference/log.c index b77a95fc..2bc48f91 100644 --- a/source/reference/log.c +++ b/source/reference/log.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_log_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_log_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -36,8 +36,8 @@ int csi_ref_log_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_log_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_log_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_log_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_log_f32); } diff --git a/source/reference/log1p.c b/source/reference/log1p.c index 42cc5b89..4780e53f 100644 --- a/source/reference/log1p.c +++ b/source/reference/log1p.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_log1p_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_log1p_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,8 +36,8 @@ int csi_ref_log1p_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_log1p_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_log1p_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_log1p_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_log1p_f32); } diff --git a/source/reference/log_softmax.c b/source/reference/log_softmax.c index 9ef78be1..6b32415d 100644 --- a/source/reference/log_softmax.c +++ b/source/reference/log_softmax.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" /* logsoftmax = logits - log(reduce_sum(exp(logits), axis)) */ -int csi_ref_log_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_ref_log_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { // now only support 2D input assert(params->axis == 1 && input->dim_count == 2); @@ -65,8 +64,8 @@ int csi_ref_log_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_log_softmax_quant(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_ref_log_softmax_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_log_softmax_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_log_softmax_f32); } diff --git a/source/reference/logical_and.c b/source/reference/logical_and.c index 152578bc..da1c2007 100644 --- a/source/reference/logical_and.c +++ b/source/reference/logical_and.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_logical_and_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_and_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_logical_and_f32(struct csi_tensor *input0, struct csi_tensor *input1 return CSINN_TRUE; } -int csi_ref_logical_and_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_and_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_logical_and_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_logical_and_f32); } diff --git a/source/reference/logical_not.c b/source/reference/logical_not.c index 6b81bb8d..d0ce77ff 100644 --- a/source/reference/logical_not.c +++ b/source/reference/logical_not.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_logical_not_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_logical_not_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,8 +35,8 @@ int csi_ref_logical_not_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_logical_not_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_logical_not_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_logical_not_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_logical_not_f32); } diff --git a/source/reference/logical_or.c b/source/reference/logical_or.c index 8db0b883..13d391e4 100644 --- a/source/reference/logical_or.c +++ b/source/reference/logical_or.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_logical_or_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_or_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_logical_or_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_logical_or_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_or_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_logical_or_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_logical_or_f32); } diff --git a/source/reference/logical_xor.c b/source/reference/logical_xor.c index 4297cb11..04687243 100644 --- a/source/reference/logical_xor.c +++ b/source/reference/logical_xor.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_logical_xor_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_xor_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; @@ -38,8 +37,8 @@ int csi_ref_logical_xor_f32(struct csi_tensor *input0, struct csi_tensor *input1 return CSINN_TRUE; } -int csi_ref_logical_xor_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_xor_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_logical_xor_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_logical_xor_f32); } diff --git a/source/reference/lrn.c b/source/reference/lrn.c index 46e434fc..58a20d45 100644 --- a/source/reference/lrn.c +++ b/source/reference/lrn.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_lrn_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params) +static int shl_ref_lrn_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ static int csi_ref_lrn_nhwc_f32(struct csi_tensor *input, struct csi_tensor *out for (int i = 0; i < outer_size; ++i) { for (int c = 0; c < depth; ++c) { - const int begin_input_c = csi_ref_max_internal_s32(0, c - half_range); - const int end_input_c = csi_ref_min_internal_s32(depth, c + half_range + 1); + const int begin_input_c = shl_ref_max_internal_s32(0, c - half_range); + const int end_input_c = shl_ref_min_internal_s32(depth, c + half_range + 1); float accum = 0.f; for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { const float input_val = input_data[i * depth + input_c]; @@ -52,8 +51,8 @@ static int csi_ref_lrn_nhwc_f32(struct csi_tensor *input, struct csi_tensor *out return CSINN_TRUE; } -static int csi_ref_lrn_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params) +static int shl_ref_lrn_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -66,8 +65,8 @@ static int csi_ref_lrn_nchw_f32(struct csi_tensor *input, struct csi_tensor *out for (int j = 0; j < input->dim[0]; j++) { for (int c = 0; c < depth; ++c) { - const int begin_input_c = csi_ref_max_internal_s32(0, c - half_range); - const int end_input_c = csi_ref_min_internal_s32(depth, c + half_range + 1); + const int begin_input_c = shl_ref_max_internal_s32(0, c - half_range); + const int end_input_c = shl_ref_min_internal_s32(depth, c + half_range + 1); for (int i = 0; i < 
inner_size; ++i) { float accum = 0.f; for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { @@ -85,39 +84,40 @@ static int csi_ref_lrn_nchw_f32(struct csi_tensor *input, struct csi_tensor *out return CSINN_TRUE; } -int csi_ref_lrn_f32(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params) +int shl_ref_lrn_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_lrn_nchw_f32(input, output, params); + shl_ref_lrn_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_lrn_nhwc_f32(input, output, params); + shl_ref_lrn_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_lrn_quant(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params) +int shl_ref_lrn_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { double bias_f, alpha_f, beta_f; - struct csi_quant_info qinfo; + struct csinn_quant_info qinfo; qinfo.zero_point = 0; qinfo.multiplier = params->bias_multiplier; qinfo.shift = params->bias_shift; - bias_f = csi_ref_dequantize_u8_to_f32(1, &qinfo); + bias_f = shl_ref_dequantize_u8_to_f32(1, &qinfo); qinfo.zero_point = 0; qinfo.multiplier = params->alpha_multiplier; qinfo.shift = params->alpha_shift; - alpha_f = csi_ref_dequantize_u8_to_f32(1, &qinfo); + alpha_f = shl_ref_dequantize_u8_to_f32(1, &qinfo); qinfo.zero_point = 0; qinfo.multiplier = params->beta_multiplier; qinfo.shift = params->beta_shift; - beta_f = csi_ref_dequantize_u8_to_f32(1, &qinfo); + beta_f = shl_ref_dequantize_u8_to_f32(1, &qinfo); params->bias = bias_f; params->alpha = alpha_f; params->beta = beta_f; - return csi_ref_siso_callback_base(input, output, params, csi_ref_lrn_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_lrn_f32); } diff --git a/source/reference/matmul.c 
b/source/reference/matmul.c index 79f26002..d429056f 100644 --- a/source/reference/matmul.c +++ b/source/reference/matmul.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_matmul_f32(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params) +int shl_ref_matmul_f32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { float *mat0_data = mat0->data; float *mat1_data = mat1->data; @@ -103,8 +102,8 @@ int csi_ref_matmul_f32(struct csi_tensor *mat0, struct csi_tensor *mat1, struct return CSINN_TRUE; } -int csi_ref_matmul_quant(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params) +int shl_ref_matmul_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { - return csi_ref_diso_callback_base(mat0, mat1, output, params, csi_ref_matmul_f32); + return shl_ref_diso_callback_base(mat0, mat1, output, params, shl_ref_matmul_f32); } diff --git a/source/reference/max.c b/source/reference/max.c index 36a6087f..b56a759f 100644 --- a/source/reference/max.c +++ b/source/reference/max.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_max_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_max_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_max_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = -FLT_MAX; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result = fmax(result, val); @@ -55,8 +54,8 @@ int csi_ref_max_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_max_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_max_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_max_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_max_stride_f32); } diff --git a/source/reference/maximum.c b/source/reference/maximum.c index 56cdbcfb..e3d5c2f0 100644 --- a/source/reference/maximum.c +++ b/source/reference/maximum.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_maximum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_maximum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_maximum_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_maximum_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_maximum_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_maximum_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_maximum_f32); } diff --git a/source/reference/maxpool.c b/source/reference/maxpool.c index 085ba05b..7a39781a 100644 --- a/source/reference/maxpool.c +++ b/source/reference/maxpool.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_maxpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_maxpool2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,19 +40,19 @@ static int csi_ref_maxpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tenso const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float max = -FLT_MAX; int filter_cnt = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - max = fmax(max, input_data[csi_ref_get_index(input->dim, batch, in_y, + max = fmax(max, input_data[shl_ref_get_index(input->dim, batch, in_y, in_x, channel)]); filter_cnt++; } @@ -62,7 +61,7 @@ static int csi_ref_maxpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tenso if (filter_cnt != params->filter_height * 
params->filter_width) { max = fmax(max, 0); } - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, channel)] = max; + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, channel)] = max; } } } @@ -70,8 +69,8 @@ static int csi_ref_maxpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -static int csi_ref_maxpool2d_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_maxpool2d_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -90,19 +89,19 @@ static int csi_ref_maxpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float max = -FLT_MAX; int filter_cnt = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - max = fmax(max, input_data[csi_ref_get_index(input->dim, batch, channel, + max = fmax(max, 
input_data[shl_ref_get_index(input->dim, batch, channel, in_y, in_x)]); filter_cnt++; } @@ -111,7 +110,7 @@ static int csi_ref_maxpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso if (filter_cnt != params->filter_height * params->filter_width) { max = fmax(max, 0); } - output_data[csi_ref_get_index(output->dim, batch, channel, out_y, out_x)] = max; + output_data[shl_ref_get_index(output->dim, batch, channel, out_y, out_x)] = max; } } } @@ -119,20 +118,20 @@ static int csi_ref_maxpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_maxpool2d_nchw_f32(input, output, params); + shl_ref_maxpool2d_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_maxpool2d_nhwc_f32(input, output, params); + shl_ref_maxpool2d_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_maxpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_maxpool2d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_maxpool2d_f32); } diff --git a/source/reference/maxpool2d_locat.c b/source/reference/maxpool2d_locat.c index f4645888..a1307259 100644 --- a/source/reference/maxpool2d_locat.c +++ b/source/reference/maxpool2d_locat.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_maxpool2d_locat_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; int *output_data = output->data; @@ -41,12 +40,12 @@ static int csi_ref_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, struct csi const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float max = FLT_MIN; int locat = (in_y_origin + filter_y_start) * input->dim[2] + (in_x_origin + filter_x_start); @@ -55,14 +54,14 @@ static int csi_ref_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, struct csi const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; int in_index = - csi_ref_get_index(input->dim, batch, channel, in_y, in_x); + shl_ref_get_index(input->dim, batch, channel, in_y, in_x); if (input_data[in_index] > max) { max = input_data[in_index]; locat = in_y * input->dim[2] + in_x; } } } - output_data[csi_ref_get_index(output->dim, 
batch, out_y, out_x, channel)] = + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, channel)] = locat; } } @@ -71,8 +70,8 @@ static int csi_ref_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, struct csi return CSINN_TRUE; } -static int csi_ref_maxpool2d_locat_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_maxpool2d_locat_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; int *output_data = output->data; @@ -91,12 +90,12 @@ static int csi_ref_maxpool2d_locat_nchw_f32(struct csi_tensor *input, struct csi const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float max = FLT_MIN; int locat = (in_y_origin + filter_y_start) * input->dim[3] + (in_x_origin + filter_x_start); @@ -105,14 +104,14 @@ static int csi_ref_maxpool2d_locat_nchw_f32(struct csi_tensor *input, struct csi const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; int in_index = - csi_ref_get_index(input->dim, batch, channel, in_y, in_x); + shl_ref_get_index(input->dim, batch, channel, in_y, in_x); if (input_data[in_index] > max) { max = 
input_data[in_index]; locat = in_y * input->dim[3] + in_x; } } } - output_data[csi_ref_get_index(output->dim, batch, channel, out_y, out_x)] = + output_data[shl_ref_get_index(output->dim, batch, channel, out_y, out_x)] = locat; } } @@ -121,24 +120,24 @@ static int csi_ref_maxpool2d_locat_nchw_f32(struct csi_tensor *input, struct csi return CSINN_TRUE; } -int csi_ref_maxpool2d_locat_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool2d_locat_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_maxpool2d_locat_nchw_f32(input, output, params); + shl_ref_maxpool2d_locat_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_maxpool2d_locat_nhwc_f32(input, output, params); + shl_ref_maxpool2d_locat_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } return CSINN_TRUE; } -int csi_ref_maxpool2d_locat_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool2d_locat_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - csi_ref_maxpool2d_locat_f32(finput, output, params); - csi_ref_tensor_transform_free_f32(finput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + shl_ref_maxpool2d_locat_f32(finput, output, params); + shl_ref_tensor_transform_free_f32(finput); return CSINN_TRUE; } diff --git a/source/reference/maxpool3d.c b/source/reference/maxpool3d.c index 66eb3587..8fbd68c3 100644 --- a/source/reference/maxpool3d.c +++ b/source/reference/maxpool3d.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -46,15 +45,15 @@ int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, const int in_w_origin = (out_w * params->stride_width) - params->pad_left; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_d_begin = csi_ref_max_internal_s32(0, -in_d_origin); + const int filter_d_begin = shl_ref_max_internal_s32(0, -in_d_origin); const int filter_d_end = - csi_ref_min_internal_s32(params->filter_depth, in_depth - in_d_origin); - const int filter_h_begin = csi_ref_max_internal_s32(0, -in_h_origin); - const int filter_h_end = csi_ref_min_internal_s32(params->filter_height, + shl_ref_min_internal_s32(params->filter_depth, in_depth - in_d_origin); + const int filter_h_begin = shl_ref_max_internal_s32(0, -in_h_origin); + const int filter_h_end = shl_ref_min_internal_s32(params->filter_height, in_height - in_h_origin); - const int filter_w_begin = csi_ref_max_internal_s32(0, -in_w_origin); + const int filter_w_begin = shl_ref_max_internal_s32(0, -in_w_origin); const int filter_w_end = - csi_ref_min_internal_s32(params->filter_width, in_width - in_w_origin); + shl_ref_min_internal_s32(params->filter_width, in_width - in_w_origin); float max = -FLT_MAX; int filter_cnt = 0; @@ -67,7 +66,7 @@ int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, int in_h = in_h_origin + filter_h; int in_w = in_w_origin + filter_w; max = fmax(max, - input_data[csi_ref_get_index_5( + input_data[shl_ref_get_index_5( 
input->dim, in_ch, out_ch, in_d, in_h, in_w)]); filter_cnt++; } @@ -77,7 +76,7 @@ int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, params->filter_depth * params->filter_height * params->filter_width) { max = fmax(max, 0); } - output_data[csi_ref_get_index_5(output->dim, in_ch, out_ch, out_d, out_h, + output_data[shl_ref_get_index_5(output->dim, in_ch, out_ch, out_d, out_h, out_w)] = max; } } @@ -87,8 +86,8 @@ int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_maxpool3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_maxpool3d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_maxpool3d_f32); } diff --git a/source/reference/mean.c b/source/reference/mean.c index 19538b3d..b28f3a4e 100644 --- a/source/reference/mean.c +++ b/source/reference/mean.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_mean_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_mean_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_mean_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = 0; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result += val; @@ -55,23 +54,23 @@ int csi_ref_mean_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_mean_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_mean_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_mean_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_mean_stride_f32); } -int csi_ref_mean_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_mean_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { if (params->axis_count != 2 || params->axis[0] != 2 || params->axis[1] != 3 || input->dim_count != 4 || output->dim_count != 4) { 
assert(0); } - struct pool_params pparams; + struct csinn_pool_params pparams; pparams.base.layout = CSINN_LAYOUT_NCHW; pparams.base.api = CSINN_REF; - csi_global_avgpool2d_init(input, output, &pparams); - csi_global_avgpool2d(input, output, &pparams); + csinn_global_avgpool2d_init(input, output, &pparams); + csinn_global_avgpool2d(input, output, &pparams); return CSINN_TRUE; } diff --git a/source/reference/min.c b/source/reference/min.c index e9bf6201..dfbb31d2 100644 --- a/source/reference/min.c +++ b/source/reference/min.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_min_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_min_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_min_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = FLT_MAX; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result = fmin(result, val); @@ -55,8 +54,8 @@ int csi_ref_min_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_min_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_min_stride_quant(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_min_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_min_stride_f32); } diff --git a/source/reference/minimum.c b/source/reference/minimum.c index 592d67c4..38304d19 100644 --- a/source/reference/minimum.c +++ b/source/reference/minimum.c @@ -16,19 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_minimum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_minimum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; float *output_data = output->data; - int size0 = csi_tensor_size(input0); - int size1 = csi_tensor_size(input1); + int size0 = csinn_tensor_size(input0); + int size1 = csinn_tensor_size(input1); if (size0 == size1) { for (int i = 0; i < size0; i++) { @@ -43,8 +42,8 @@ int csi_ref_minimum_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_minimum_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_minimum_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_minimum_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_minimum_f32); } diff --git a/source/reference/mod.c b/source/reference/mod.c index e3eeb58f..028ed06a 100644 --- a/source/reference/mod.c +++ b/source/reference/mod.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static void element_mod_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { @@ -27,18 +26,18 @@ static void element_mod_f32(float *src0, float *src1, float *dest, int input_idx src0[output_idx] - floor(src0[output_idx] / src1[output_idx]) * src1[input_idx]; } -int csi_ref_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_mod_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_mod_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_mod_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_mod_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_mod_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_mod_f32); } diff --git a/source/reference/mul.c b/source/reference/mul.c index e5d4424b..9c3520a1 100644 --- a/source/reference/mul.c +++ b/source/reference/mul.c @@ -16,28 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static void element_mul_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = src0[output_idx] * src1[input_idx]; } -int csi_ref_mul_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_mul_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_mul_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_mul_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_mul_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_mul_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_mul_f32); } diff --git a/source/reference/ndarray_size.c b/source/reference/ndarray_size.c index a5fdba64..aa7ea872 100644 --- a/source/reference/ndarray_size.c +++ b/source/reference/ndarray_size.c @@ -16,39 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_ndarray_size_f32(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_ref_ndarray_size_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { float *output_data = output->data; - output_data[0] = csi_tensor_size(input); + output_data[0] = csinn_tensor_size(input); return CSINN_TRUE; } -int csi_ref_ndarray_size_u8(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_ref_ndarray_size_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { uint8_t *output_data = output->data; - output_data[0] = csi_tensor_size(input); + output_data[0] = csinn_tensor_size(input); return CSINN_TRUE; } -int csi_ref_ndarray_size_i8(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_ref_ndarray_size_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { int8_t *output_data = output->data; - output_data[0] = csi_tensor_size(input); + output_data[0] = csinn_tensor_size(input); return CSINN_TRUE; } -int csi_ref_ndarray_size_i32(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_ref_ndarray_size_i32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { int32_t *output_data = output->data; - output_data[0] = csi_tensor_size(input); + output_data[0] = csinn_tensor_size(input); return CSINN_TRUE; } diff --git a/source/reference/negative.c b/source/reference/negative.c index 8e69f492..d560eb51 100644 --- a/source/reference/negative.c +++ b/source/reference/negative.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_negative_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_negative_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ int csi_ref_negative_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_negative_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_negative_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_negative_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_negative_f32); } diff --git a/source/reference/non_max_suppression.c b/source/reference/non_max_suppression.c index f6243272..00d53565 100644 --- a/source/reference/non_max_suppression.c +++ b/source/reference/non_max_suppression.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static int find_max_score_idx(const float *scores, int *flag, int len) { @@ -54,9 +53,9 @@ static float get_iou(const float *box1, const float *box2) return iou; } -int csi_ref_non_max_suppression_std(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params) +int shl_ref_non_max_suppression_std(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params) { float *boxes = (float *)input0->data; float *scores = (float *)input1->data; @@ -68,7 +67,7 @@ int csi_ref_non_max_suppression_std(struct csi_tensor *input0, struct csi_tensor int box_num = input1->dim[0]; int box_num_exist = box_num; - int *flag = (int *)csi_mem_alloc(box_num * sizeof(int)); + int *flag = (int *)shl_mem_alloc(box_num * sizeof(int)); int box_cnt = 0; while (box_num_exist) { @@ -92,6 +91,6 @@ int csi_ref_non_max_suppression_std(struct csi_tensor *input0, struct csi_tensor } } } - csi_mem_free(flag); + shl_mem_free(flag); return CSINN_TRUE; } diff --git a/source/reference/not.c b/source/reference/not.c index e2428d53..8de1375f 100644 --- a/source/reference/not.c +++ b/source/reference/not.c @@ -16,16 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_not_u32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_not_u32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { uint32_t *input_data = input->data; uint32_t *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = ~(input_data[i]); @@ -33,11 +34,12 @@ int csi_ref_not_u32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_not_u8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_not_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = ~(input_data[i]); @@ -45,11 +47,12 @@ int csi_ref_not_u8(struct csi_tensor *input, struct csi_tensor *output, struct s return CSINN_TRUE; } -int csi_ref_not_i8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_not_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { int8_t *input_data = input->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = ~(input_data[i]); diff --git a/source/reference/not_equal.c b/source/reference/not_equal.c index fdf6ac9a..5619e30b 100644 --- a/source/reference/not_equal.c +++ b/source/reference/not_equal.c @@ -16,18 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_not_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_not_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; float *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] != input1_data[i]; @@ -35,8 +34,8 @@ int csi_ref_not_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_not_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_not_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_not_equal_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_not_equal_f32); } diff --git a/source/reference/or.c b/source/reference/or.c index ed692c4b..82d69943 100644 --- a/source/reference/or.c +++ b/source/reference/or.c @@ -16,17 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_or_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_or_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint32_t *input0_data = input0->data; uint32_t *input1_data = input1->data; uint32_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] | input1_data[i]; @@ -34,13 +34,13 @@ int csi_ref_or_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_or_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_or_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] | input1_data[i]; @@ -48,13 +48,13 @@ int csi_ref_or_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct c return CSINN_TRUE; } -int csi_ref_or_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_or_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = input0->data; int8_t *input1_data = input1->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] | input1_data[i]; diff --git 
a/source/reference/pad.c b/source/reference/pad.c index 1b732772..3369f932 100644 --- a/source/reference/pad.c +++ b/source/reference/pad.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_pad_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params) +static int shl_ref_pad_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { const int output_batch = output->dim[0]; const int output_height = output->dim[1]; @@ -72,8 +71,8 @@ static int csi_ref_pad_nhwc_f32(struct csi_tensor *input, struct csi_tensor *out return CSINN_TRUE; } -static int csi_ref_pad_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params) +static int shl_ref_pad_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { const int output_batch = output->dim[0]; const int output_depth = output->dim[1]; @@ -123,19 +122,20 @@ static int csi_ref_pad_nchw_f32(struct csi_tensor *input, struct csi_tensor *out return CSINN_TRUE; } -int csi_ref_pad_f32(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params) +int shl_ref_pad_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_pad_nchw_f32(input, output, params); + shl_ref_pad_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_pad_nhwc_f32(input, output, params); + shl_ref_pad_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_pad_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params) +int shl_ref_pad_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { - return 
csi_ref_siso_callback_base(input, output, params, csi_ref_pad_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_pad_f32); } diff --git a/source/reference/power.c b/source/reference/power.c index da4692cc..cabe88ca 100644 --- a/source/reference/power.c +++ b/source/reference/power.c @@ -16,28 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static void element_pow_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = powf(src0[output_idx], src1[input_idx]); } -int csi_ref_power_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_power_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_pow_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_power_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_power_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_power_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_power_f32); } diff --git a/source/reference/prelu.c b/source/reference/prelu.c index 8d417849..6bd9f5b5 100644 --- a/source/reference/prelu.c +++ b/source/reference/prelu.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params) +int shl_ref_prelu_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { float *input_data = (float *)input->data; float *alpha_data = (float *)alpha->data; @@ -35,7 +34,7 @@ int csi_ref_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, struct outer_size *= input->dim[i]; } - int64_t inner_size = (axis == 0 && input->dim_count == 1) ? csi_tensor_size(input) : 1; + int64_t inner_size = (axis == 0 && input->dim_count == 1) ? csinn_tensor_size(input) : 1; for (int i = axis + 1; i < input->dim_count; i++) { inner_size *= input->dim[i]; } @@ -56,8 +55,8 @@ int csi_ref_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, struct return CSINN_TRUE; } -int csi_ref_prelu_quant(struct csi_tensor *input, struct csi_tensor *alpha, - struct csi_tensor *output, struct prelu_params *params) +int shl_ref_prelu_quant(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { - return csi_ref_diso_callback_base(input, alpha, output, params, csi_ref_prelu_f32); + return shl_ref_diso_callback_base(input, alpha, output, params, shl_ref_prelu_f32); } diff --git a/source/reference/prod.c b/source/reference/prod.c index 0a5e5efe..e589a214 100644 --- a/source/reference/prod.c +++ b/source/reference/prod.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_prod_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_prod_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_prod_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = 1; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result *= val; @@ -55,8 +54,8 @@ int csi_ref_prod_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_prod_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_prod_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_prod_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_prod_stride_f32); } diff --git a/source/reference/proposal.c b/source/reference/proposal.c index 01cdc8d5..ad11c76d 100644 --- a/source/reference/proposal.c +++ b/source/reference/proposal.c @@ -16,12 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" #define MAX(a, b) (a > b ? a : b) #define MIN(a, b) (a > b ? b : a) @@ -83,8 +82,9 @@ static struct bbox generate_anchor(float ratio, float scale, int32_t base_size) return _bbox; } -static float *predict_bbox(struct csi_tensor *cls_prob_tensor, struct csi_tensor *bbox_pred_tensor, - struct csi_tensor *im_info_tensor, float *ratios, int32_t ratios_num, +static float *predict_bbox(struct csinn_tensor *cls_prob_tensor, + struct csinn_tensor *bbox_pred_tensor, + struct csinn_tensor *im_info_tensor, float *ratios, int32_t ratios_num, float *scales, int32_t scales_num, int32_t feature_stride, int32_t iou_loss, int32_t rpn_min_size) { @@ -100,7 +100,7 @@ static float *predict_bbox(struct csi_tensor *cls_prob_tensor, struct csi_tensor float *bbox_pred = bbox_pred_tensor->data; float *im_info = im_info_tensor->data; - float *output = csi_mem_alloc(batch * height * width * num_anchors * 5 * sizeof(float)); + float *output = shl_mem_alloc(batch * height * width * num_anchors * 5 * sizeof(float)); for (int i = 0; i < batch * height * width; i++) { int w = i % width; @@ -119,7 +119,7 @@ static float *predict_bbox(struct csi_tensor *cls_prob_tensor, struct csi_tensor int x2 = anchor.x2 + w * feature_stride; int y2 = anchor.y2 + h * feature_stride; - float *delta = csi_mem_alloc(4 * sizeof(float)); + float *delta = shl_mem_alloc(4 * sizeof(float)); for (int j = 0; j < 4; j++) { delta[j] = bbox_pred[(((b * num_anchors + k) * 4 + j) * height + h) * width + w]; } @@ -190,7 +190,7 @@ static float calculate_overlap(float *out_tensor, int box_a_idx, int box_b_idx) static float *compute_nms(int batch, int num_bbox, float *sorted_bbox, float threshold) { - float *out = csi_mem_alloc(batch * num_bbox * sizeof(float)); + float *out = shl_mem_alloc(batch * num_bbox * sizeof(float)); for (int b = 0; b < batch; b++) { int base_idx = b * num_bbox; for (int 
i = 0; i < num_bbox; i++) { @@ -216,9 +216,9 @@ static float *compute_nms(int batch, int num_bbox, float *sorted_bbox, float thr static float *prepare_output(float *sorted_bbox, float *remove_mask, int batch, int num_bbox, int rpn_post_nms_top_n) { - int *i = csi_mem_alloc(batch * sizeof(int)); - int *nkeep = csi_mem_alloc(batch * sizeof(int)); - float *output = csi_mem_alloc(batch * rpn_post_nms_top_n * 5 * sizeof(int)); + int *i = shl_mem_alloc(batch * sizeof(int)); + int *nkeep = shl_mem_alloc(batch * sizeof(int)); + float *output = shl_mem_alloc(batch * rpn_post_nms_top_n * 5 * sizeof(int)); for (int b = 0; b < batch; b++) { nkeep[b] = 0; @@ -252,9 +252,9 @@ static float *prepare_output(float *sorted_bbox, float *remove_mask, int batch, return output; } -int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params) +int shl_ref_proposal_f32(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { float *output_data = output->data; @@ -271,7 +271,7 @@ int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pr float *bbox = predict_bbox(cls_prob, bbox_pred, im_info, params->ratios, params->ratios_num, params->scales, params->scales_num, params->feature_stride, params->iou_loss, params->rpn_min_size); - index_value *score = csi_mem_alloc(batch * num_bbox * sizeof(index_value)); + index_value *score = shl_mem_alloc(batch * num_bbox * sizeof(index_value)); for (int i = 0; i < batch; i++) { for (int j = 0; j < num_bbox; j++) { int id = j + i * num_bbox; @@ -283,7 +283,7 @@ int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pr qsort(score, batch * num_bbox, sizeof(index_value), argsort); - float *sorted_bbox = csi_mem_alloc(batch * params->rpn_pre_nms_top_n * 5 * sizeof(float)); + float *sorted_bbox = 
shl_mem_alloc(batch * params->rpn_pre_nms_top_n * 5 * sizeof(float)); for (int b = 0; b < batch; b++) { for (int i = 0; i < params->rpn_pre_nms_top_n; i++) { int sorted_index = score[b * params->rpn_pre_nms_top_n + i].index; @@ -307,32 +307,32 @@ int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pr return CSINN_TRUE; } -int csi_ref_proposal_quant(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params) +int shl_ref_proposal_quant(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { - float *scales = (float *)csi_mem_alloc(params->scales_num * sizeof(float)); + float *scales = (float *)shl_mem_alloc(params->scales_num * sizeof(float)); for (int i = 0; i < params->scales_num; i++) { - scales[i] = csi_ref_get_scale(params->scale_multipliers[i], params->scale_shifts[i]); + scales[i] = shl_ref_get_scale(params->scale_multipliers[i], params->scale_shifts[i]); } - float *ratios = (float *)csi_mem_alloc(params->scales_num * sizeof(float)); + float *ratios = (float *)shl_mem_alloc(params->scales_num * sizeof(float)); for (int i = 0; i < params->ratios_num; i++) { - ratios[i] = csi_ref_get_scale(params->ratio_multipliers[i], params->ratio_shifts[i]); + ratios[i] = shl_ref_get_scale(params->ratio_multipliers[i], params->ratio_shifts[i]); } - float threshold = csi_ref_get_scale(params->threshold_multiplier, params->threshold_shift); + float threshold = shl_ref_get_scale(params->threshold_multiplier, params->threshold_shift); params->ratios = ratios; params->scales = scales; params->threshold = threshold; - struct csi_tensor *fcls = csi_ref_tensor_transform_f32(cls_prob); - struct csi_tensor *fbbox = csi_ref_tensor_transform_f32(bbox_pred); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - csi_ref_proposal_f32(fcls, fbbox, 
im_info, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(fcls); - csi_ref_tensor_transform_free_f32(fbbox); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *fcls = shl_ref_tensor_transform_f32(cls_prob); + struct csinn_tensor *fbbox = shl_ref_tensor_transform_f32(bbox_pred); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + shl_ref_proposal_f32(fcls, fbbox, im_info, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(fcls); + shl_ref_tensor_transform_free_f32(fbbox); + shl_ref_tensor_transform_free_f32(foutput); return CSINN_TRUE; } diff --git a/source/reference/psroipooling.c b/source/reference/psroipooling.c index 90edea23..d179667e 100644 --- a/source/reference/psroipooling.c +++ b/source/reference/psroipooling.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_psroipooling_f32(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params) +int shl_ref_psroipooling_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params) { float *output_data = output->data; float *bottom_data = data->data; @@ -88,17 +87,18 @@ int csi_ref_psroipooling_f32(struct csi_tensor *data, struct csi_tensor *rois, return CSINN_TRUE; } -int csi_ref_psroipooling_quant(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params) +int shl_ref_psroipooling_quant(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, + struct csinn_psroipooling_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(data); - struct csi_tensor *frois = 
csi_ref_tensor_transform_f32(rois); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_psroipooling_f32(finput, frois, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(frois); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(data); + struct csinn_tensor *frois = shl_ref_tensor_transform_f32(rois); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_psroipooling_f32(finput, frois, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(frois); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/reduce_logsumexp.c b/source/reference/reduce_logsumexp.c index 7cfa73af..14852d68 100644 --- a/source/reference/reduce_logsumexp.c +++ b/source/reference/reduce_logsumexp.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_logsumexp_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_logsumexp_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -66,8 +65,8 @@ int csi_ref_reduce_logsumexp_f32(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_ref_reduce_logsumexp_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_logsumexp_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_logsumexp_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_logsumexp_f32); } diff --git a/source/reference/reduce_max.c b/source/reference/reduce_max.c index 8ff1af2c..d4888392 100644 --- a/source/reference/reduce_max.c +++ b/source/reference/reduce_max.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_max_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_max_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_max_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_max_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_max_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_max_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_max_f32); } diff --git a/source/reference/reduce_mean.c b/source/reference/reduce_mean.c index 2c3be614..429c2897 100644 --- a/source/reference/reduce_mean.c +++ b/source/reference/reduce_mean.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_mean_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_mean_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_mean_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_mean_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_mean_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_mean_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_mean_f32); } diff --git a/source/reference/reduce_min.c b/source/reference/reduce_min.c index 2fef1a2a..0ebdedbe 100644 --- a/source/reference/reduce_min.c +++ b/source/reference/reduce_min.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_min_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_min_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_min_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_min_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_min_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_min_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_min_f32); } diff --git a/source/reference/reduce_prod.c b/source/reference/reduce_prod.c index a3f0f3f1..04b7b47b 100644 --- a/source/reference/reduce_prod.c +++ b/source/reference/reduce_prod.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_prod_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_prod_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_prod_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_prod_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_prod_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_prod_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_prod_f32); } diff --git a/source/reference/reduce_sum.c b/source/reference/reduce_sum.c index e4380715..2f3871a5 100644 --- a/source/reference/reduce_sum.c +++ b/source/reference/reduce_sum.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_sum_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_sum_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_sum_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_sum_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_sum_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_sum_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_sum_f32); } diff --git a/source/reference/relu.c b/source/reference/relu.c index 0a6712a4..7e14fe31 100644 --- a/source/reference/relu.c +++ b/source/reference/relu.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float relu(float x) { return x > 0 ? 
x : 0; } -int csi_ref_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_relu_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_relu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_relu_f32); } diff --git a/source/reference/relu1.c b/source/reference/relu1.c index 87f1985c..edbe43e6 100644 --- a/source/reference/relu1.c +++ b/source/reference/relu1.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float relu1(float x) { return fmin(x > 0 ? 
x : 0, 1); } -int csi_ref_relu1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_relu1_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_relu1_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu1_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_relu1_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_relu1_f32); } diff --git a/source/reference/relu6.c b/source/reference/relu6.c index 08343ac9..c4c91ced 100644 --- a/source/reference/relu6.c +++ b/source/reference/relu6.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float relu6(float x) { return fmin(x > 0 ? 
x : 0, 6); } -int csi_ref_relu6_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu6_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_relu6_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_relu6_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_relu6_f32); } diff --git a/source/reference/relun.c b/source/reference/relun.c index c4d0f715..1a7505de 100644 --- a/source/reference/relun.c +++ b/source/reference/relun.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float relun(float x, float y) { return fmin(x > 0.0 ? 
x : 0.0, y); } -int csi_ref_relun_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relun_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_relun_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_relun_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relun_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_relun_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_relun_f32); } diff --git a/source/reference/reshape.c b/source/reference/reshape.c index 3f7c1ad2..d6d0b9d3 100644 --- a/source/reference/reshape.c +++ b/source/reference/reshape.c @@ -16,40 +16,40 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reshape_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_ref_reshape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { + struct csinn_callback *cb = params->base.cb; if (input->quant_channel == output->quant_channel) { - int quant_size = input->quant_channel * sizeof(struct csi_quant_info); + int quant_size = input->quant_channel * sizeof(struct csinn_quant_info); int t = memcmp(input->qinfo, output->qinfo, quant_size); if (t == 0) { - params->base.bc = csi_ref_reshape; + cb->exec = shl_ref_reshape; return CSINN_TRUE; } } - params->base.bc = csi_ref_reshape_quant; + cb->exec = shl_ref_reshape_quant; return CSINN_TRUE; } -int csi_ref_reshape(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_ref_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_byte_size(input); + int size = csinn_tensor_byte_size(input); if (input_data != output_data) { memcpy(output_data, input_data, size); } return CSINN_TRUE; } -int csi_ref_reshape_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_ref_reshape_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reshape); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reshape); } diff --git a/source/reference/resize.c b/source/reference/resize.c index 2d23be1b..13c8334b 100644 --- a/source/reference/resize.c +++ b/source/reference/resize.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static void csi_ref_resize_bilinear_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - bool align_corners) +static void shl_ref_resize_bilinear_nhwc_f32(struct csinn_tensor *input, + struct csinn_tensor *output, bool align_corners) { float *input_data = input->data; float *output_data = output->data; @@ -49,22 +48,22 @@ static void csi_ref_resize_bilinear_nhwc_f32(struct csi_tensor *input, struct cs for (int y = 0; y < output_height; ++y) { float input_y = y * height_scale; int32_t y0 = (int32_t)(floor(input_y)); - int32_t y1 = csi_ref_min_internal_s32(y0 + 1, input_height - 1); + int32_t y1 = shl_ref_min_internal_s32(y0 + 1, input_height - 1); for (int x = 0; x < output_width; ++x) { float input_x = x * width_scale; int32_t x0 = (int32_t)(floor(input_x)); - int32_t x1 = csi_ref_min_internal_s32(x0 + 1, input_width - 1); + int32_t x1 = shl_ref_min_internal_s32(x0 + 1, input_width - 1); for (int c = 0; c < depth; ++c) { float interpolation = - (float)(input_data[csi_ref_get_index(input->dim, b, y0, x0, c)] * + (float)(input_data[shl_ref_get_index(input->dim, b, y0, x0, c)] * (1 - (input_y - y0)) * (1 - (input_x - x0)) + - input_data[csi_ref_get_index(input->dim, b, y1, x0, c)] * + input_data[shl_ref_get_index(input->dim, b, y1, x0, c)] * (input_y - y0) * (1 - (input_x - x0)) + - input_data[csi_ref_get_index(input->dim, b, y0, x1, c)] * + input_data[shl_ref_get_index(input->dim, b, y0, x1, c)] * (1 - (input_y - y0)) * (input_x - x0) + - input_data[csi_ref_get_index(input->dim, b, y1, x1, c)] * + input_data[shl_ref_get_index(input->dim, b, y1, x1, c)] * (input_y - y0) * (input_x - x0)); - output_data[csi_ref_get_index(output->dim, b, y, x, c)] = interpolation; + output_data[shl_ref_get_index(output->dim, b, y, x, c)] = interpolation; } } } @@ -74,8 +73,8 @@ static void csi_ref_resize_bilinear_nhwc_f32(struct csi_tensor *input, 
struct cs /*reference * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h */ -static void csi_ref_resize_nearest_neighbor_f32(struct csi_tensor *input, struct csi_tensor *output, - bool align_corners) +static void shl_ref_resize_nearest_neighbor_f32(struct csinn_tensor *input, + struct csinn_tensor *output, bool align_corners) { float *input_data = input->data; float *output_data = output->data; @@ -107,13 +106,13 @@ static void csi_ref_resize_nearest_neighbor_f32(struct csi_tensor *input, struct for (int b = 0; b < batches; ++b) { for (int y = 0; y < output_height; ++y) { int32_t in_y = - csi_ref_min_internal_s32(align_corners ? (int32_t)(round(y * height_scale)) + shl_ref_min_internal_s32(align_corners ? (int32_t)(round(y * height_scale)) : (int32_t)(floor(y * height_scale)), input_height - 1); const float *y_input_ptr = input_ptr + in_y * row_offset; for (int x = 0; x < output_width; ++x) { int32_t in_x = - csi_ref_min_internal_s32(align_corners ? (int32_t)(round(x * width_scale)) + shl_ref_min_internal_s32(align_corners ? 
(int32_t)(round(x * width_scale)) : (int32_t)(floor(x * width_scale)), input_width - 1); const float *x_input_ptr = y_input_ptr + in_x * col_offset; @@ -125,41 +124,41 @@ static void csi_ref_resize_nearest_neighbor_f32(struct csi_tensor *input, struct } } -static void csi_ref_resize_nearest_neighbor_nchw_f32(struct csi_tensor *o_input, - struct csi_tensor *o_output, +static void shl_ref_resize_nearest_neighbor_nchw_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, bool align_corners) { - struct csi_tensor *input = csi_ref_nchw_to_nhwc_f32(o_input); - struct csi_tensor *output = csi_ref_nchw_to_nhwc_f32(o_output); - csi_ref_resize_nearest_neighbor_f32(input, output, align_corners); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + struct csinn_tensor *input = shl_ref_nchw_to_nhwc_f32(o_input); + struct csinn_tensor *output = shl_ref_nchw_to_nhwc_f32(o_output); + shl_ref_resize_nearest_neighbor_f32(input, output, align_corners); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); } -static void csi_ref_resize_bilinear_nchw_f32(struct csi_tensor *o_input, - struct csi_tensor *o_output, bool align_corners) +static void shl_ref_resize_bilinear_nchw_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, bool align_corners) { - struct csi_tensor *input = csi_ref_nchw_to_nhwc_f32(o_input); - struct csi_tensor *output = csi_ref_nchw_to_nhwc_f32(o_output); - csi_ref_resize_bilinear_nhwc_f32(input, output, align_corners); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + struct csinn_tensor *input = shl_ref_nchw_to_nhwc_f32(o_input); + struct csinn_tensor *output = shl_ref_nchw_to_nhwc_f32(o_output); + shl_ref_resize_bilinear_nhwc_f32(input, output, align_corners); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); } -int csi_ref_resize_f32(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params 
*params) +int shl_ref_resize_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { if (params->resize_mode == CSINN_RESIZE_BILINEAR) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_resize_bilinear_nchw_f32(input, output, params->align_corners); + shl_ref_resize_bilinear_nchw_f32(input, output, params->align_corners); } else { - csi_ref_resize_bilinear_nhwc_f32(input, output, params->align_corners); + shl_ref_resize_bilinear_nhwc_f32(input, output, params->align_corners); } } else if (params->resize_mode == CSINN_RESIZE_NEAREST_NEIGHBOR) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_resize_nearest_neighbor_nchw_f32(input, output, params->align_corners); + shl_ref_resize_nearest_neighbor_nchw_f32(input, output, params->align_corners); } else { - csi_ref_resize_nearest_neighbor_f32(input, output, params->align_corners); + shl_ref_resize_nearest_neighbor_f32(input, output, params->align_corners); } } else { return CSINN_FALSE; @@ -167,8 +166,8 @@ int csi_ref_resize_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_resize_quant(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params) +int shl_ref_resize_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_resize_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_resize_f32); } diff --git a/source/reference/reverse.c b/source/reference/reverse.c index 0130e865..407d1341 100644 --- a/source/reference/reverse.c +++ b/source/reference/reverse.c @@ -16,12 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int Multiplication(struct csi_tensor *input, int s, int e) +static int Multiplication(struct csinn_tensor *input, int s, int e) { int res = 1; for (int i = s; i <= e; i++) { @@ -30,8 +29,8 @@ static int Multiplication(struct csi_tensor *input, int s, int e) return res; } -int csi_ref_reverse_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params) +int shl_ref_reverse_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -51,20 +50,20 @@ int csi_ref_reverse_f32(struct csi_tensor *input, struct csi_tensor *output, float *start_addr = output_data + i * step * (input->dim[axis]); float *end_addr = start_addr + step * (input->dim[axis]) - 1; for (int j = 0; j < cnt; j++) { - float *temp = (float *)csi_mem_alloc(step * sizeof(float)); + float *temp = (float *)shl_mem_alloc(step * sizeof(float)); memcpy(temp, start_addr, step * sizeof(float)); memcpy(start_addr, end_addr - step + 1, step * sizeof(float)); memcpy(end_addr - step + 1, temp, step * sizeof(float)); start_addr += step; end_addr -= step; - csi_mem_free(temp); + shl_mem_free(temp); } } return CSINN_TRUE; } -int csi_ref_reverse_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params) +int shl_ref_reverse_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reverse_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reverse_f32); } diff --git a/source/reference/roialign.c b/source/reference/roialign.c index 6f1783d8..2c07e0e4 100644 --- a/source/reference/roialign.c +++ b/source/reference/roialign.c @@ -16,9 +16,9 @@ * limitations under the 
License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" // https://github.com/AceCoooool/RoIAlign-RoIPool-pytorch/blob/master/roialign/roi_align_cpu.cpp @@ -74,8 +74,8 @@ static void pre_calc_for_bilinear(const int h, const int w, const int pool_h, co } } -int csi_ref_roi_align_f32(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_align_params *params) +int shl_ref_roi_align_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params) { float *bottom_rois = (float *)rois->data; float *input_data = (float *)data->data; diff --git a/source/reference/roipool.c b/source/reference/roipool.c index 0047475b..eb703dba 100644 --- a/source/reference/roipool.c +++ b/source/reference/roipool.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" // https://github.com/pytorch/pytorch/blob/master/caffe2/operators/roi_pool_op.cc // defalut input layout: NCHW -int csi_ref_roipool_f32(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params) +int shl_ref_roipool_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params) { float *output_data = (float *)output->data; float *bottom_data = (float *)data->data; @@ -95,17 +94,17 @@ int csi_ref_roipool_f32(struct csi_tensor *data, struct csi_tensor *rois, struct return CSINN_TRUE; } -int csi_ref_roipool_quant(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_pool_params *params) +int shl_ref_roipool_quant(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params) { int ret; - struct csi_tensor *finput = 
csi_ref_tensor_transform_f32(data); - struct csi_tensor *frois = csi_ref_tensor_transform_f32(rois); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_roipool_f32(finput, frois, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(frois); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(data); + struct csinn_tensor *frois = shl_ref_tensor_transform_f32(rois); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_roipool_f32(finput, frois, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(frois); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/round.c b/source/reference/round.c index 92c715d0..24b6653d 100644 --- a/source/reference/round.c +++ b/source/reference/round.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_round_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_round_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ int csi_ref_round_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_round_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_round_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_round_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_round_f32); } diff --git a/source/reference/rsqrt.c b/source/reference/rsqrt.c index b9e05475..fffe7c1f 100644 --- a/source/reference/rsqrt.c +++ b/source/reference/rsqrt.c @@ -16,17 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_rsqrt_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_rsqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = 1.0 / sqrt(input_data[i]); @@ -34,8 +33,8 @@ int csi_ref_rsqrt_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_rsqrt_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_rsqrt_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_rsqrt_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_rsqrt_f32); } diff --git a/source/reference/scatter.c b/source/reference/scatter.c index a207a3db..d8adc79c 100644 --- a/source/reference/scatter.c +++ b/source/reference/scatter.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_scatter_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params) +int shl_ref_scatter_nd_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { if (input->dim_count != 5 && indices->dim[indices->dim_count - 1] != 5) { return CSINN_FALSE; @@ -53,12 +52,12 @@ int csi_ref_scatter_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, m) * indices->dim[5]; - int output_index = csi_ref_get_index_5( + int output_index = shl_ref_get_index_5( input->dim, indices_data[indices_base], indices_data[indices_base + 1], indices_data[indices_base + 2], indices_data[indices_base + 3], indices_data[indices_base + 4]); - int updates_index = csi_ref_get_index_5(updates->dim, i, j, k, l, m); + int updates_index = shl_ref_get_index_5(updates->dim, i, j, k, l, m); output_data[output_index] = updates_data[updates_index]; } } @@ -69,17 +68,17 @@ int csi_ref_scatter_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, return CSINN_TRUE; } -int csi_ref_scatter_nd_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params) +int shl_ref_scatter_nd_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_updates = csi_ref_tensor_transform_f32(updates); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); - int ret = csi_ref_scatter_nd_f32(float_input, indices, float_updates, float_output, params); - 
csi_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_updates); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_updates = shl_ref_tensor_transform_f32(updates); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); + int ret = shl_ref_scatter_nd_f32(float_input, indices, float_updates, float_output, params); + csinn_tensor_data_convert(output, float_output); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_updates); return ret; } diff --git a/source/reference/segment_max.c b/source/reference/segment_max.c index 44598441..1e5eb23b 100644 --- a/source/reference/segment_max.c +++ b/source/reference/segment_max.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_max_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -36,7 +36,7 @@ int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = -FLT_MAX; } } @@ -50,8 +50,8 @@ int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct 
csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = input_data[input_index] > output_data[output_index] ? input_data[input_index] @@ -67,8 +67,8 @@ int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_max_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -83,7 +83,7 @@ int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = -FLT_MAX; } } @@ -99,8 +99,8 @@ int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = input_data[input_index] > output_data[output_index] ? 
input_data[input_index] @@ -116,28 +116,29 @@ int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment return CSINN_TRUE; } -int csi_ref_unsorted_segment_max_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_max_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_max_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_max_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_max_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_max_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_max_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_max_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/segment_mean.c b/source/reference/segment_mean.c index def9a277..b0bb53fe 100644 --- a/source/reference/segment_mean.c +++ b/source/reference/segment_mean.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_mean_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -37,7 +37,7 @@ int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tenso for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 0; } } @@ -55,9 +55,9 @@ int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tenso for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); for (int k = 0; k < num; k++) { - int32_t input_index = csi_ref_get_index(input->dim, index[k], h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, index[k], h, w, c); output_data[output_index] += 
input_data[input_index]; } output_data[output_index] /= mean_n; @@ -70,8 +70,8 @@ int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_mean_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -87,7 +87,7 @@ int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segmen for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 0; } } @@ -106,9 +106,9 @@ int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segmen for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); for (int k = 0; k < num; k++) { - int32_t input_index = csi_ref_get_index(input->dim, index[k], h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, index[k], h, w, c); output_data[output_index] += input_data[input_index]; } output_data[output_index] /= mean_n; @@ -121,28 +121,30 @@ int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segmen return CSINN_TRUE; } -int csi_ref_unsorted_segment_mean_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_mean_quant(struct csinn_tensor *input, + struct csinn_tensor *segment_ids, + struct 
csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_mean_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_mean_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_mean_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_mean_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_mean_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_mean_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/segment_min.c b/source/reference/segment_min.c index 8bdf984e..5cec7060 100644 --- a/source/reference/segment_min.c +++ 
b/source/reference/segment_min.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_min_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -36,7 +36,7 @@ int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = FLT_MAX; } } @@ -50,8 +50,8 @@ int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = input_data[input_index] < output_data[output_index] ? 
input_data[input_index] @@ -67,8 +67,8 @@ int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_min_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -83,7 +83,7 @@ int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = FLT_MAX; } } @@ -99,8 +99,8 @@ int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = input_data[input_index] < output_data[output_index] ? 
input_data[input_index] @@ -116,28 +116,29 @@ int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment return CSINN_TRUE; } -int csi_ref_unsorted_segment_min_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_min_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_min_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_min_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_min_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_min_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_min_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_min_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/segment_prod.c b/source/reference/segment_prod.c index 849cba97..725273f8 100644 --- a/source/reference/segment_prod.c +++ b/source/reference/segment_prod.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_prod_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -36,7 +36,7 @@ int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tenso for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 1; } } @@ -50,8 +50,8 @@ int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tenso for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] *= input_data[input_index]; } } @@ -64,8 +64,8 @@ 
int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_prod_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -80,7 +80,7 @@ int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segmen for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 1; } } @@ -96,8 +96,8 @@ int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segmen for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] *= input_data[input_index]; } } @@ -110,28 +110,30 @@ int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segmen return CSINN_TRUE; } -int csi_ref_unsorted_segment_prod_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_prod_quant(struct csinn_tensor *input, + struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = 
csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_prod_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_prod_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_prod_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_prod_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_prod_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_prod_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/segment_sum.c b/source/reference/segment_sum.c index fe88dc7a..bd7bd4c0 100644 --- a/source/reference/segment_sum.c +++ b/source/reference/segment_sum.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_sum_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -37,7 +37,7 @@ int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 0; } } @@ -54,8 +54,8 @@ int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, index[num], h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, index[num], h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] += input_data[input_index]; } } @@ -66,8 +66,8 @@ int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_sum_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ 
-83,7 +83,7 @@ int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 0; } } @@ -101,8 +101,8 @@ int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, index[num], h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, index[num], h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] += input_data[input_index]; } } @@ -113,28 +113,29 @@ int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment return CSINN_TRUE; } -int csi_ref_unsorted_segment_sum_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_sum_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_sum_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_sum_f32(finput, segment_ids, foutput, 
params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_sum_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_sum_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_sum_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_sum_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/select.c b/source/reference/select.c index cce83c71..46927113 100644 --- a/source/reference/select.c +++ b/source/reference/select.c @@ -16,20 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_select_f32(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params) +int shl_ref_select_f32(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { float *input_data0 = input0->data; float *input_data1 = input1->data; float *conlist_data = condition->data; float *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = conlist_data[i] ? input_data0[i] : input_data1[i]; @@ -37,15 +36,15 @@ int csi_ref_select_f32(struct csi_tensor *condition, struct csi_tensor *input0, return CSINN_TRUE; } -int csi_ref_select_u8(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params) +int shl_ref_select_u8(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { uint8_t *input_data0 = input0->data; uint8_t *input_data1 = input1->data; uint8_t *conlist_data = condition->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = conlist_data[i] ? 
input_data0[i] : input_data1[i]; @@ -53,15 +52,15 @@ int csi_ref_select_u8(struct csi_tensor *condition, struct csi_tensor *input0, return CSINN_TRUE; } -int csi_ref_select_i8(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params) +int shl_ref_select_i8(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { int8_t *input_data0 = input0->data; int8_t *input_data1 = input1->data; int8_t *conlist_data = condition->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = conlist_data[i] ? input_data0[i] : input_data1[i]; diff --git a/source/reference/setup.c b/source/reference/setup.c index 9ad91e4b..5e28e83a 100644 --- a/source/reference/setup.c +++ b/source/reference/setup.c @@ -16,28 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -void *csi_init_map_ref(int op, int dtype) -{ - if (op == CSINN_OP_FLATTEN) { - return csi_ref_flatten_init; - } else if (op == CSINN_OP_RESHAPE) { - return csi_ref_reshape_init; - } else if (op == CSINN_OP_TRANSPOSE) { - return csi_ref_transpose_init; - } else if (op == CSINN_OP_CACHE_MATMUL) { - return csi_ref_cache_matmul_init; - } else if (op == CSINN_OP_CACHE_CONV1D) { - return csi_ref_cache_conv1d_init; - } - - return NULL; -} - -void csi_ref_nn_init(struct csi_tensor *input, struct csi_tensor *output) +void shl_ref_nn_init(struct csinn_tensor *input, struct csinn_tensor *output) { int size = 1; for (int i = 0; i < input->dim_count; i++) { @@ -121,7 +104,7 @@ void csi_ref_nn_init(struct csi_tensor *input, struct csi_tensor *output) for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - output_data[index] = csi_ref_float32_to_float16(input_data[index]); + output_data[index] = shl_ref_float32_to_float16(input_data[index]); } } } else if (output->dtype == CSINN_DTYPE_BFLOAT16) { @@ -130,15 +113,19 @@ void csi_ref_nn_init(struct csi_tensor *input, struct csi_tensor *output) for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - output_data[index] = csi_ref_float32_to_bfloat16(input_data[index]); + output_data[index] = shl_ref_float32_to_bfloat16(input_data[index]); } } + } else if (output->dtype == CSINN_DTYPE_FLOAT32) { + float *input_data = input->data; + float *output_data = output->data; + memcpy(output_data, input_data, size * 4); } else { - csi_debug_error("csi_ref_nn_init: unsupport dtype\n"); + shl_debug_error("shl_ref_nn_init: unsupport dtype\n"); } } -void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output) +void shl_ref_nn_deinit(struct csinn_tensor *input, struct csinn_tensor *output) { int size = 1; for (int i = 0; i < 
input->dim_count; i++) { @@ -190,7 +177,7 @@ void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output) } } } else if (input->dtype == CSINN_DTYPE_INT32) { - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); memcpy(output->data, input->data, size * 4); } else if (input->dtype == CSINN_DTYPE_FLOAT16) { int16_t *input_data = input->data; @@ -198,7 +185,7 @@ void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output) for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - output_data[index] = csi_ref_float16_to_float32(input_data[index]); + output_data[index] = shl_ref_float16_to_float32(input_data[index]); } } } else if (input->dtype == CSINN_DTYPE_BFLOAT16) { @@ -207,366 +194,537 @@ void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output) for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - output_data[index] = csi_ref_bfloat16_to_float32(input_data[index]); + output_data[index] = shl_ref_bfloat16_to_float32(input_data[index]); } } } else if (input->dtype == CSINN_DTYPE_BOOL) { - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); memcpy(output->data, input->data, size); } else { - csi_debug_error("csi_ref_nn_deinit: unsupport dtype\n"); + shl_debug_error("shl_ref_nn_deinit: unsupport dtype\n"); } } -static void *setup_bc_map() +static void *setup_cb_map() { - static void *bc_map[CSINN_OP_AND_UTILS_SIZE][CSINN_DTYPE_SIZE]; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE][CSINN_DTYPE_SIZE]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE * CSINN_DTYPE_SIZE); + for (int i = CSINN_DTYPE_INT4; i <= CSINN_DTYPE_BFLOAT16; i++) { - bc_map[CSINN_OP_ABS][i] = csi_ref_abs_quant; - bc_map[CSINN_OP_ACOS][i] = csi_ref_acos_quant; - bc_map[CSINN_OP_ACOSH][i] = csi_ref_acosh_quant; - bc_map[CSINN_OP_ADD][i] = csi_ref_add_quant; - 
bc_map[CSINN_OP_ARANGE][i] = csi_ref_arange_quant; - bc_map[CSINN_OP_ARGMAX][i] = csi_ref_argmax_stride_quant; - bc_map[CSINN_OP_ARGMIN][i] = csi_ref_argmin_stride_quant; - bc_map[CSINN_OP_ASIN][i] = csi_ref_asin_quant; - bc_map[CSINN_OP_ASINH][i] = csi_ref_asinh_quant; - bc_map[CSINN_OP_ATAN][i] = csi_ref_atan_quant; - bc_map[CSINN_OP_ATANH][i] = csi_ref_atanh_quant; - bc_map[CSINN_OP_AVGPOOL2D][i] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_AVGPOOL3D][i] = csi_ref_avgpool3d_quant; - bc_map[CSINN_OP_BN][i] = csi_ref_batch_normalization_quant; - bc_map[CSINN_OP_BATCH_TO_SPACE][i] = csi_ref_batch_to_space_quant; - bc_map[CSINN_OP_BROADCOST][i] = csi_ref_broadcast_to_quant; - bc_map[CSINN_OP_CACHE_MATMUL][i] = csi_ref_cache_matmul_quant; - bc_map[CSINN_OP_CACHE_CONV1D][i] = csi_ref_cache_conv1d_quant; - bc_map[CSINN_OP_CEIL][i] = csi_ref_ceil_quant; - bc_map[CSINN_OP_CLIP][i] = csi_ref_clip_quant; - bc_map[CSINN_OP_CONCAT][i] = csi_ref_concat_quant; - bc_map[CSINN_OP_COS][i] = csi_ref_cos_quant; - bc_map[CSINN_OP_COSH][i] = csi_ref_cosh_quant; - bc_map[CSINN_OP_CUMPROD][i] = csi_ref_cumprod_quant; - bc_map[CSINN_OP_DATA_CONVERT][i] = csi_ref_data_convert_quant; - bc_map[CSINN_OP_CUMSUM][i] = csi_ref_cumsum_quant; - bc_map[CSINN_OP_DEPTH_TO_SPACE][i] = csi_ref_depth_to_space_quant; - bc_map[CSINN_OP_DIV][i] = csi_ref_div_quant; - bc_map[CSINN_OP_ELU][i] = csi_ref_elu_quant; - bc_map[CSINN_OP_EQUANL][i] = csi_ref_equal_quant; - bc_map[CSINN_OP_ERF][i] = csi_ref_erf_quant; - bc_map[CSINN_OP_EXP][i] = csi_ref_exp_quant; - bc_map[CSINN_OP_EXPAND_DIMS][i] = csi_ref_expand_dims_quant; - bc_map[CSINN_OP_EXPM1][i] = csi_ref_expm1_quant; - bc_map[CSINN_OP_FLATTEN][i] = csi_ref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE][i] = csi_ref_floor_divide_quant; - bc_map[CSINN_OP_FLOOR_MOD][i] = csi_ref_floor_mod_quant; - bc_map[CSINN_OP_FLOOR][i] = csi_ref_floor_quant; - bc_map[CSINN_OP_FSMN][i] = csi_ref_fsmn_quant; - bc_map[CSINN_OP_GATHER_ND][i] = csi_ref_gather_nd_quant; - 
bc_map[CSINN_OP_GATHER][i] = csi_ref_gather_quant; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][i] = csi_ref_global_avgpool2d_quant; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D][i] = csi_ref_global_maxpool2d_quant; - bc_map[CSINN_OP_GREATHER_EQUAL][i] = csi_ref_greater_equal_quant; - bc_map[CSINN_OP_GREATHER][i] = csi_ref_greater_quant; - bc_map[CSINN_OP_HARD_SIGMOID][i] = csi_ref_hard_sigmoid_quant; - bc_map[CSINN_OP_IM2COL][i] = csi_ref_im2col_quant; - bc_map[CSINN_OP_L2N][i] = csi_ref_l2_normalization_quant; - bc_map[CSINN_OP_LEAKY_RELU][i] = csi_ref_leaky_relu_quant; - bc_map[CSINN_OP_LESS_EQUAL][i] = csi_ref_less_equal_quant; - bc_map[CSINN_OP_LESS][i] = csi_ref_less_quant; - bc_map[CSINN_OP_LOG_SOFTMAX][i] = csi_ref_log_softmax_quant; - bc_map[CSINN_OP_LOG][i] = csi_ref_log_quant; - bc_map[CSINN_OP_LOG1P][i] = csi_ref_log1p_quant; - bc_map[CSINN_OP_LOGICAL_AND][i] = csi_ref_logical_and_quant; - bc_map[CSINN_OP_LOGICAL_NOT][i] = csi_ref_logical_not_quant; - bc_map[CSINN_OP_LOGICAL_OR][i] = csi_ref_logical_or_quant; - bc_map[CSINN_OP_LOGICAL_XOR][i] = csi_ref_logical_xor_quant; - bc_map[CSINN_OP_LRN][i] = csi_ref_lrn_quant; - bc_map[CSINN_OP_MATMUL][i] = csi_ref_matmul_quant; - bc_map[CSINN_OP_MAX][i] = csi_ref_max_stride_quant; - bc_map[CSINN_OP_MAXIMUM][i] = csi_ref_maximum_quant; - bc_map[CSINN_OP_MAXPOOL2D][i] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT][i] = csi_ref_maxpool2d_locat_quant; - bc_map[CSINN_OP_MAXPOOL3D][i] = csi_ref_maxpool3d_quant; - bc_map[CSINN_OP_MEAN][i] = csi_ref_mean_stride_quant; - bc_map[CSINN_OP_MEAN_STRIDE][i] = csi_ref_mean_stride_quant; - bc_map[CSINN_OP_MIN][i] = csi_ref_min_stride_quant; - bc_map[CSINN_OP_MINIMUM][i] = csi_ref_minimum_quant; - bc_map[CSINN_OP_MOD][i] = csi_ref_mod_quant; - bc_map[CSINN_OP_MUL][i] = csi_ref_mul_quant; - bc_map[CSINN_OP_NEGATIIVE][i] = csi_ref_negative_quant; - bc_map[CSINN_OP_NOT_EQUAL][i] = csi_ref_not_equal_quant; - bc_map[CSINN_OP_PAD][i] = csi_ref_pad_quant; - bc_map[CSINN_OP_POWER][i] = 
csi_ref_power_quant; - bc_map[CSINN_OP_PRELU][i] = csi_ref_prelu_quant; - bc_map[CSINN_OP_PROD][i] = csi_ref_prod_stride_quant; - bc_map[CSINN_OP_PROPOSAL][i] = csi_ref_proposal_quant; - bc_map[CSINN_OP_PSROIPOOLING][i] = csi_ref_psroipooling_quant; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP][i] = csi_ref_reduce_logsumexp_quant; - bc_map[CSINN_OP_REDUCE_MAX][i] = csi_ref_reduce_max_quant; - bc_map[CSINN_OP_REDUCE_MEAN][i] = csi_ref_reduce_mean_quant; - bc_map[CSINN_OP_REDUCE_MIN][i] = csi_ref_reduce_min_quant; - bc_map[CSINN_OP_REDUCE_PROD][i] = csi_ref_reduce_prod_quant; - bc_map[CSINN_OP_REDUCE_SUM][i] = csi_ref_reduce_sum_quant; - bc_map[CSINN_OP_RELU][i] = csi_ref_relu_quant; - bc_map[CSINN_OP_RELU1][i] = csi_ref_relu1_quant; - bc_map[CSINN_OP_RELU6][i] = csi_ref_relu6_quant; - bc_map[CSINN_OP_RELUN][i] = csi_ref_relun_quant; - bc_map[CSINN_OP_RESHAPE][i] = csi_ref_reshape; - bc_map[CSINN_OP_RESIZE][i] = csi_ref_resize_quant; - bc_map[CSINN_OP_REVERSE][i] = csi_ref_reverse_quant; - bc_map[CSINN_OP_ROIPOOL][i] = csi_ref_roipool_quant; - bc_map[CSINN_OP_ROUND][i] = csi_ref_round_quant; - bc_map[CSINN_OP_RSQRT][i] = csi_ref_rsqrt_quant; - bc_map[CSINN_OP_SEGMENT_MAX][i] = csi_ref_segment_max_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i] = csi_ref_unsorted_segment_max_quant; - bc_map[CSINN_OP_SEGMENT_MEAN][i] = csi_ref_segment_mean_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i] = csi_ref_unsorted_segment_mean_quant; - bc_map[CSINN_OP_SEGMENT_MIN][i] = csi_ref_segment_min_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i] = csi_ref_unsorted_segment_min_quant; - bc_map[CSINN_OP_SEGMENT_PROD][i] = csi_ref_segment_prod_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i] = csi_ref_unsorted_segment_prod_quant; - bc_map[CSINN_OP_SEGMENT_SUM][i] = csi_ref_segment_sum_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i] = csi_ref_unsorted_segment_sum_quant; - bc_map[CSINN_OP_SHUFFLE_CHANNEL][i] = csi_ref_shuffle_channel_quant; - bc_map[CSINN_OP_SIGMOID][i] = 
csi_ref_sigmoid_quant; - bc_map[CSINN_OP_SIGN][i] = csi_ref_sign_quant; - bc_map[CSINN_OP_SIN][i] = csi_ref_sin_quant; - bc_map[CSINN_OP_SINH][i] = csi_ref_sinh_quant; - bc_map[CSINN_OP_SLICE][i] = csi_ref_slice_quant; - bc_map[CSINN_OP_SOFTMAX][i] = csi_ref_softmax_quant; - bc_map[CSINN_OP_SOFTPLUS][i] = csi_ref_softplus_quant; - bc_map[CSINN_OP_SOFTRELU][i] = csi_ref_softrelu_quant; - bc_map[CSINN_OP_SOFTSIGN][i] = csi_ref_softsign_quant; - bc_map[CSINN_OP_SPACE_TO_BATCH][i] = csi_ref_space_to_batch_quant; - bc_map[CSINN_OP_SPACE_TO_DEPTH][i] = csi_ref_space_to_depth_quant; - bc_map[CSINN_OP_SQRT][i] = csi_ref_sqrt_quant; - bc_map[CSINN_OP_STACK][i] = csi_ref_stack_quant; - bc_map[CSINN_OP_STRIDED_SLICE][i] = csi_ref_strided_slice_quant; - bc_map[CSINN_OP_SUB][i] = csi_ref_sub_quant; - bc_map[CSINN_OP_SUM][i] = csi_ref_sum_stride_quant; - bc_map[CSINN_OP_TAN][i] = csi_ref_tan_quant; - bc_map[CSINN_OP_TANH][i] = csi_ref_tanh_quant; - bc_map[CSINN_OP_THRESHOLD_RELU][i] = csi_ref_threshold_relu_quant; - bc_map[CSINN_OP_TILE][i] = csi_ref_tile_quant; - bc_map[CSINN_OP_TOPK][i] = csi_ref_topk_quant; - bc_map[CSINN_OP_TRANSPOSE][i] = csi_ref_transpose; - bc_map[CSINN_OP_TRUNC][i] = csi_ref_trunc_quant; - bc_map[CSINN_OP_UNPOOLING][i] = csi_ref_unpooling_quant; - bc_map[CSINN_OP_YUV_RGB_SCALE][i] = csi_ref_yuv_rgb_scale_quant; - bc_map[CSINN_OP_CONV2D][i] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_CONV2D_RELU][i] = csi_ref_conv2d_relu_quant; - bc_map[CSINN_OP_CONV2D_RELU6][i] = csi_ref_conv2d_relu6_quant; - bc_map[CSINN_OP_CONV2D_CHANNEL][i] = csi_ref_conv2d_channel_quant; - bc_map[CSINN_OP_CONV2D_CHANNEL_RELU][i] = csi_ref_conv2d_channel_relu_quant; - bc_map[CSINN_OP_CONV2D_CHANNEL_RELU6][i] = csi_ref_conv2d_channel_relu6_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][i] = csi_ref_depthwise_conv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i] = csi_ref_depthwise_conv2d_relu_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i] = csi_ref_depthwise_conv2d_relu6_quant; - 
bc_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL][i] = csi_ref_depthwise_conv2d_channel_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU][i] = - csi_ref_depthwise_conv2d_channel_relu_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6][i] = - csi_ref_depthwise_conv2d_channel_relu6_quant; - bc_map[CSINN_OP_GROUP_CONV2D][i] = csi_ref_group_conv2d_quant; - bc_map[CSINN_OP_GROUP_CONV2D_RELU][i] = csi_ref_group_conv2d_relu_quant; - bc_map[CSINN_OP_GROUP_CONV2D_RELU6][i] = csi_ref_group_conv2d_relu6_quant; - bc_map[CSINN_OP_GROUP_CONV2D_CHANNEL][i] = csi_ref_group_conv2d_channel_quant; - bc_map[CSINN_OP_GROUP_CONV2D_CHANNEL_RELU][i] = csi_ref_group_conv2d_channel_relu_quant; - bc_map[CSINN_OP_CONV3D][i] = csi_ref_conv3d_quant; - bc_map[CSINN_OP_DECONV2D][i] = csi_ref_deconv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D][i] = csi_ref_depthwise_deconv2d_quant; - bc_map[CSINN_OP_DECONV3D][i] = csi_ref_deconv3d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][i] = csi_ref_fullyconnected_quant; - bc_map[CSINN_OP_SCATTER_ND][i] = csi_ref_scatter_nd_quant; - bc_map[CSINN_OP_SPLIT][i] = csi_ref_split_quant; + cb_map[CSINN_OP_ABS][i].exec = shl_ref_abs_quant; + cb_map[CSINN_OP_ACOS][i].exec = shl_ref_acos_quant; + cb_map[CSINN_OP_ACOSH][i].exec = shl_ref_acosh_quant; + cb_map[CSINN_OP_ADD][i].exec = shl_ref_add_quant; + cb_map[CSINN_OP_ARANGE][i].exec = shl_ref_arange_quant; + cb_map[CSINN_OP_ARGMAX][i].exec = shl_ref_argmax_stride_quant; + cb_map[CSINN_OP_ARGMIN][i].exec = shl_ref_argmin_stride_quant; + cb_map[CSINN_OP_ASIN][i].exec = shl_ref_asin_quant; + cb_map[CSINN_OP_ASINH][i].exec = shl_ref_asinh_quant; + cb_map[CSINN_OP_ATAN][i].exec = shl_ref_atan_quant; + cb_map[CSINN_OP_ATANH][i].exec = shl_ref_atanh_quant; + cb_map[CSINN_OP_AVGPOOL2D][i].exec = shl_ref_avgpool2d_quant; + cb_map[CSINN_OP_AVGPOOL3D][i].exec = shl_ref_avgpool3d_quant; + cb_map[CSINN_OP_BN][i].exec = shl_ref_batch_normalization_quant; + cb_map[CSINN_OP_BATCH_TO_SPACE][i].exec = shl_ref_batch_to_space_quant; + 
cb_map[CSINN_OP_BROADCOST][i].exec = shl_ref_broadcast_to_quant; + cb_map[CSINN_OP_CACHE_MATMUL][i].exec = shl_ref_cache_matmul_quant; + cb_map[CSINN_OP_CACHE_MATMUL][i].init = shl_ref_cache_matmul_init; + cb_map[CSINN_OP_CACHE_CONV1D][i].exec = shl_ref_cache_conv1d_quant; + cb_map[CSINN_OP_CACHE_CONV1D][i].init = shl_ref_cache_conv1d_init; + cb_map[CSINN_OP_CEIL][i].exec = shl_ref_ceil_quant; + cb_map[CSINN_OP_CLIP][i].exec = shl_ref_clip_quant; + cb_map[CSINN_OP_CONCAT][i].exec = shl_ref_concat_quant; + cb_map[CSINN_OP_COS][i].exec = shl_ref_cos_quant; + cb_map[CSINN_OP_COSH][i].exec = shl_ref_cosh_quant; + cb_map[CSINN_OP_CUMPROD][i].exec = shl_ref_cumprod_quant; + cb_map[CSINN_OP_DATA_CONVERT][i].exec = shl_ref_data_convert_quant; + cb_map[CSINN_OP_CUMSUM][i].exec = shl_ref_cumsum_quant; + cb_map[CSINN_OP_DEPTH_TO_SPACE][i].exec = shl_ref_depth_to_space_quant; + cb_map[CSINN_OP_DIV][i].exec = shl_ref_div_quant; + cb_map[CSINN_OP_ELU][i].exec = shl_ref_elu_quant; + cb_map[CSINN_OP_EQUANL][i].exec = shl_ref_equal_quant; + cb_map[CSINN_OP_ERF][i].exec = shl_ref_erf_quant; + cb_map[CSINN_OP_EXP][i].exec = shl_ref_exp_quant; + cb_map[CSINN_OP_EXPAND_DIMS][i].exec = shl_ref_expand_dims_quant; + cb_map[CSINN_OP_EXPM1][i].exec = shl_ref_expm1_quant; + cb_map[CSINN_OP_FLATTEN][i].exec = shl_ref_flatten; + cb_map[CSINN_OP_FLATTEN][i].init = shl_ref_flatten_init; + cb_map[CSINN_OP_FLOOR_DIVIDE][i].exec = shl_ref_floor_divide_quant; + cb_map[CSINN_OP_FLOOR_MOD][i].exec = shl_ref_floor_mod_quant; + cb_map[CSINN_OP_FLOOR][i].exec = shl_ref_floor_quant; + cb_map[CSINN_OP_FSMN][i].exec = shl_ref_fsmn_quant; + cb_map[CSINN_OP_GATHER_ND][i].exec = shl_ref_gather_nd_quant; + cb_map[CSINN_OP_GATHER][i].exec = shl_ref_gather_quant; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D][i].exec = shl_ref_global_avgpool2d_quant; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D][i].exec = shl_ref_global_maxpool2d_quant; + cb_map[CSINN_OP_GREATHER_EQUAL][i].exec = shl_ref_greater_equal_quant; + 
cb_map[CSINN_OP_GREATHER][i].exec = shl_ref_greater_quant; + cb_map[CSINN_OP_HARD_SIGMOID][i].exec = shl_ref_hard_sigmoid_quant; + cb_map[CSINN_OP_IM2COL][i].exec = shl_ref_im2col_quant; + cb_map[CSINN_OP_L2N][i].exec = shl_ref_l2_normalization_quant; + cb_map[CSINN_OP_LEAKY_RELU][i].exec = shl_ref_leaky_relu_quant; + cb_map[CSINN_OP_LESS_EQUAL][i].exec = shl_ref_less_equal_quant; + cb_map[CSINN_OP_LESS][i].exec = shl_ref_less_quant; + cb_map[CSINN_OP_LOG_SOFTMAX][i].exec = shl_ref_log_softmax_quant; + cb_map[CSINN_OP_LOG][i].exec = shl_ref_log_quant; + cb_map[CSINN_OP_LOG1P][i].exec = shl_ref_log1p_quant; + cb_map[CSINN_OP_LOGICAL_AND][i].exec = shl_ref_logical_and_quant; + cb_map[CSINN_OP_LOGICAL_NOT][i].exec = shl_ref_logical_not_quant; + cb_map[CSINN_OP_LOGICAL_OR][i].exec = shl_ref_logical_or_quant; + cb_map[CSINN_OP_LOGICAL_XOR][i].exec = shl_ref_logical_xor_quant; + cb_map[CSINN_OP_LRN][i].exec = shl_ref_lrn_quant; + cb_map[CSINN_OP_MATMUL][i].exec = shl_ref_matmul_quant; + cb_map[CSINN_OP_MAX][i].exec = shl_ref_max_stride_quant; + cb_map[CSINN_OP_MAXIMUM][i].exec = shl_ref_maximum_quant; + cb_map[CSINN_OP_MAXPOOL2D][i].exec = shl_ref_maxpool2d_quant; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT][i].exec = shl_ref_maxpool2d_locat_quant; + cb_map[CSINN_OP_MAXPOOL3D][i].exec = shl_ref_maxpool3d_quant; + cb_map[CSINN_OP_MEAN][i].exec = shl_ref_mean_stride_quant; + cb_map[CSINN_OP_MEAN_STRIDE][i].exec = shl_ref_mean_stride_quant; + cb_map[CSINN_OP_MIN][i].exec = shl_ref_min_stride_quant; + cb_map[CSINN_OP_MINIMUM][i].exec = shl_ref_minimum_quant; + cb_map[CSINN_OP_MOD][i].exec = shl_ref_mod_quant; + cb_map[CSINN_OP_MUL][i].exec = shl_ref_mul_quant; + cb_map[CSINN_OP_NEGATIIVE][i].exec = shl_ref_negative_quant; + cb_map[CSINN_OP_NOT_EQUAL][i].exec = shl_ref_not_equal_quant; + cb_map[CSINN_OP_PAD][i].exec = shl_ref_pad_quant; + cb_map[CSINN_OP_POWER][i].exec = shl_ref_power_quant; + cb_map[CSINN_OP_PRELU][i].exec = shl_ref_prelu_quant; + cb_map[CSINN_OP_PROD][i].exec = 
shl_ref_prod_stride_quant; + cb_map[CSINN_OP_PROPOSAL][i].exec = shl_ref_proposal_quant; + cb_map[CSINN_OP_PSROIPOOLING][i].exec = shl_ref_psroipooling_quant; + cb_map[CSINN_OP_REDUCE_LOGSUMEXP][i].exec = shl_ref_reduce_logsumexp_quant; + cb_map[CSINN_OP_REDUCE_MAX][i].exec = shl_ref_reduce_max_quant; + cb_map[CSINN_OP_REDUCE_MEAN][i].exec = shl_ref_reduce_mean_quant; + cb_map[CSINN_OP_REDUCE_MIN][i].exec = shl_ref_reduce_min_quant; + cb_map[CSINN_OP_REDUCE_PROD][i].exec = shl_ref_reduce_prod_quant; + cb_map[CSINN_OP_REDUCE_SUM][i].exec = shl_ref_reduce_sum_quant; + cb_map[CSINN_OP_RELU][i].exec = shl_ref_relu_quant; + cb_map[CSINN_OP_RELU1][i].exec = shl_ref_relu1_quant; + cb_map[CSINN_OP_RELU6][i].exec = shl_ref_relu6_quant; + cb_map[CSINN_OP_RELUN][i].exec = shl_ref_relun_quant; + cb_map[CSINN_OP_RESHAPE][i].exec = shl_ref_reshape; + cb_map[CSINN_OP_RESHAPE][i].init = shl_ref_reshape_init; + cb_map[CSINN_OP_RESIZE][i].exec = shl_ref_resize_quant; + cb_map[CSINN_OP_REVERSE][i].exec = shl_ref_reverse_quant; + cb_map[CSINN_OP_ROIPOOL][i].exec = shl_ref_roipool_quant; + cb_map[CSINN_OP_ROUND][i].exec = shl_ref_round_quant; + cb_map[CSINN_OP_RSQRT][i].exec = shl_ref_rsqrt_quant; + cb_map[CSINN_OP_SEGMENT_MAX][i].exec = shl_ref_segment_max_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i].exec = shl_ref_unsorted_segment_max_quant; + cb_map[CSINN_OP_SEGMENT_MEAN][i].exec = shl_ref_segment_mean_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i].exec = shl_ref_unsorted_segment_mean_quant; + cb_map[CSINN_OP_SEGMENT_MIN][i].exec = shl_ref_segment_min_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i].exec = shl_ref_unsorted_segment_min_quant; + cb_map[CSINN_OP_SEGMENT_PROD][i].exec = shl_ref_segment_prod_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i].exec = shl_ref_unsorted_segment_prod_quant; + cb_map[CSINN_OP_SEGMENT_SUM][i].exec = shl_ref_segment_sum_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i].exec = shl_ref_unsorted_segment_sum_quant; + 
cb_map[CSINN_OP_SHUFFLE_CHANNEL][i].exec = shl_ref_shuffle_channel_quant; + cb_map[CSINN_OP_SIGMOID][i].exec = shl_ref_sigmoid_quant; + cb_map[CSINN_OP_SIGN][i].exec = shl_ref_sign_quant; + cb_map[CSINN_OP_SIN][i].exec = shl_ref_sin_quant; + cb_map[CSINN_OP_SINH][i].exec = shl_ref_sinh_quant; + cb_map[CSINN_OP_SLICE][i].exec = shl_ref_slice_quant; + cb_map[CSINN_OP_SOFTMAX][i].exec = shl_ref_softmax_quant; + cb_map[CSINN_OP_SOFTPLUS][i].exec = shl_ref_softplus_quant; + cb_map[CSINN_OP_SOFTRELU][i].exec = shl_ref_softrelu_quant; + cb_map[CSINN_OP_SOFTSIGN][i].exec = shl_ref_softsign_quant; + cb_map[CSINN_OP_SPACE_TO_BATCH][i].exec = shl_ref_space_to_batch_quant; + cb_map[CSINN_OP_SPACE_TO_DEPTH][i].exec = shl_ref_space_to_depth_quant; + cb_map[CSINN_OP_SQRT][i].exec = shl_ref_sqrt_quant; + cb_map[CSINN_OP_STACK][i].exec = shl_ref_stack_quant; + cb_map[CSINN_OP_STRIDED_SLICE][i].exec = shl_ref_strided_slice_quant; + cb_map[CSINN_OP_SUB][i].exec = shl_ref_sub_quant; + cb_map[CSINN_OP_SUM][i].exec = shl_ref_sum_stride_quant; + cb_map[CSINN_OP_TAN][i].exec = shl_ref_tan_quant; + cb_map[CSINN_OP_TANH][i].exec = shl_ref_tanh_quant; + cb_map[CSINN_OP_THRESHOLD_RELU][i].exec = shl_ref_threshold_relu_quant; + cb_map[CSINN_OP_TILE][i].exec = shl_ref_tile_quant; + cb_map[CSINN_OP_TOPK][i].exec = shl_ref_topk_quant; + cb_map[CSINN_OP_TRANSPOSE][i].exec = shl_ref_transpose; + cb_map[CSINN_OP_TRANSPOSE][i].init = shl_ref_transpose_init; + cb_map[CSINN_OP_TRUNC][i].exec = shl_ref_trunc_quant; + cb_map[CSINN_OP_UNPOOLING][i].exec = shl_ref_unpooling_quant; + cb_map[CSINN_OP_YUV_RGB_SCALE][i].exec = shl_ref_yuv_rgb_scale_quant; + cb_map[CSINN_OP_CONV2D][i].exec = shl_ref_conv2d_quant; + cb_map[CSINN_OP_CONV2D_RELU][i].exec = shl_ref_conv2d_relu_quant; + cb_map[CSINN_OP_CONV2D_RELU6][i].exec = shl_ref_conv2d_relu6_quant; + cb_map[CSINN_OP_CONV2D_CHANNEL][i].exec = shl_ref_conv2d_channel_quant; + cb_map[CSINN_OP_CONV2D_CHANNEL_RELU][i].exec = shl_ref_conv2d_channel_relu_quant; + 
cb_map[CSINN_OP_CONV2D_CHANNEL_RELU6][i].exec = shl_ref_conv2d_channel_relu6_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][i].exec = shl_ref_depthwise_conv2d_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i].exec = shl_ref_depthwise_conv2d_relu_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i].exec = shl_ref_depthwise_conv2d_relu6_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL][i].exec = shl_ref_depthwise_conv2d_channel_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU][i].exec = + shl_ref_depthwise_conv2d_channel_relu_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6][i].exec = + shl_ref_depthwise_conv2d_channel_relu6_quant; + cb_map[CSINN_OP_GROUP_CONV2D][i].exec = shl_ref_group_conv2d_quant; + cb_map[CSINN_OP_GROUP_CONV2D_RELU][i].exec = shl_ref_group_conv2d_relu_quant; + cb_map[CSINN_OP_GROUP_CONV2D_RELU6][i].exec = shl_ref_group_conv2d_relu6_quant; + cb_map[CSINN_OP_GROUP_CONV2D_CHANNEL][i].exec = shl_ref_group_conv2d_channel_quant; + cb_map[CSINN_OP_GROUP_CONV2D_CHANNEL_RELU][i].exec = + shl_ref_group_conv2d_channel_relu_quant; + cb_map[CSINN_OP_CONV3D][i].exec = shl_ref_conv3d_quant; + cb_map[CSINN_OP_DECONV2D][i].exec = shl_ref_deconv2d_quant; + cb_map[CSINN_OP_DEPTHWISE_DECONV2D][i].exec = shl_ref_depthwise_deconv2d_quant; + cb_map[CSINN_OP_DECONV3D][i].exec = shl_ref_deconv3d_quant; + cb_map[CSINN_OP_FULLYCONNECTED][i].exec = shl_ref_fullyconnected_quant; + cb_map[CSINN_OP_SCATTER_ND][i].exec = shl_ref_scatter_nd_quant; + cb_map[CSINN_OP_SPLIT][i].exec = shl_ref_split_quant; } for (int i = CSINN_DTYPE_UINT8; i <= CSINN_DTYPE_FLOAT64; i++) { - bc_map[CSINN_OP_SQUEEZE][i] = csi_ref_squeeze; + cb_map[CSINN_OP_SQUEEZE][i].exec = shl_ref_squeeze; } - bc_map[CSINN_OP_AND][CSINN_DTYPE_UINT8] = csi_ref_and_u8; - bc_map[CSINN_OP_AND][CSINN_DTYPE_INT8] = csi_ref_and_i8; - bc_map[CSINN_OP_AND][CSINN_DTYPE_UINT32] = csi_ref_and_u32; - bc_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_UINT8] = csi_ref_ndarray_size_u8; - 
bc_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT8] = csi_ref_ndarray_size_i8; - bc_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT32] = csi_ref_ndarray_size_i32; - bc_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_FLOAT32] = csi_ref_ndarray_size_f32; - bc_map[CSINN_OP_NOT][CSINN_DTYPE_UINT8] = csi_ref_not_u8; - bc_map[CSINN_OP_NOT][CSINN_DTYPE_INT8] = csi_ref_not_i8; - bc_map[CSINN_OP_NOT][CSINN_DTYPE_UINT32] = csi_ref_not_u32; - bc_map[CSINN_OP_OR][CSINN_DTYPE_UINT8] = csi_ref_or_u8; - bc_map[CSINN_OP_OR][CSINN_DTYPE_INT8] = csi_ref_or_i8; - bc_map[CSINN_OP_OR][CSINN_DTYPE_UINT32] = csi_ref_or_u32; - bc_map[CSINN_OP_SELECT][CSINN_DTYPE_UINT8] = csi_ref_select_u8; - bc_map[CSINN_OP_SELECT][CSINN_DTYPE_INT8] = csi_ref_select_i8; - bc_map[CSINN_OP_SELECT][CSINN_DTYPE_FLOAT32] = csi_ref_select_f32; - bc_map[CSINN_OP_SHAPE][CSINN_DTYPE_UINT8] = csi_ref_shape_u8; - bc_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT8] = csi_ref_shape_i8; - bc_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT32] = csi_ref_shape_i32; - bc_map[CSINN_OP_XOR][CSINN_DTYPE_UINT8] = csi_ref_xor_u8; - bc_map[CSINN_OP_XOR][CSINN_DTYPE_INT8] = csi_ref_xor_i8; - bc_map[CSINN_OP_XOR][CSINN_DTYPE_UINT32] = csi_ref_xor_u32; + cb_map[CSINN_OP_AND][CSINN_DTYPE_UINT8].exec = shl_ref_and_u8; + cb_map[CSINN_OP_AND][CSINN_DTYPE_INT8].exec = shl_ref_and_i8; + cb_map[CSINN_OP_AND][CSINN_DTYPE_UINT32].exec = shl_ref_and_u32; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_UINT8].exec = shl_ref_ndarray_size_u8; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT8].exec = shl_ref_ndarray_size_i8; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT32].exec = shl_ref_ndarray_size_i32; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_FLOAT32].exec = shl_ref_ndarray_size_f32; + cb_map[CSINN_OP_NOT][CSINN_DTYPE_UINT8].exec = shl_ref_not_u8; + cb_map[CSINN_OP_NOT][CSINN_DTYPE_INT8].exec = shl_ref_not_i8; + cb_map[CSINN_OP_NOT][CSINN_DTYPE_UINT32].exec = shl_ref_not_u32; + cb_map[CSINN_OP_OR][CSINN_DTYPE_UINT8].exec = shl_ref_or_u8; + cb_map[CSINN_OP_OR][CSINN_DTYPE_INT8].exec 
= shl_ref_or_i8; + cb_map[CSINN_OP_OR][CSINN_DTYPE_UINT32].exec = shl_ref_or_u32; + cb_map[CSINN_OP_SELECT][CSINN_DTYPE_UINT8].exec = shl_ref_select_u8; + cb_map[CSINN_OP_SELECT][CSINN_DTYPE_INT8].exec = shl_ref_select_i8; + cb_map[CSINN_OP_SELECT][CSINN_DTYPE_FLOAT32].exec = shl_ref_select_f32; + cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_UINT8].exec = shl_ref_shape_u8; + cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT8].exec = shl_ref_shape_i8; + cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT32].exec = shl_ref_shape_i32; + cb_map[CSINN_OP_XOR][CSINN_DTYPE_UINT8].exec = shl_ref_xor_u8; + cb_map[CSINN_OP_XOR][CSINN_DTYPE_INT8].exec = shl_ref_xor_i8; + cb_map[CSINN_OP_XOR][CSINN_DTYPE_UINT32].exec = shl_ref_xor_u32; - bc_map[CSINN_OP_ABS][CSINN_DTYPE_FLOAT32] = csi_ref_abs_f32; - bc_map[CSINN_OP_ACOS][CSINN_DTYPE_FLOAT32] = csi_ref_acos_f32; - bc_map[CSINN_OP_ACOSH][CSINN_DTYPE_FLOAT32] = csi_ref_acosh_f32; - bc_map[CSINN_OP_ADD][CSINN_DTYPE_FLOAT32] = csi_ref_add_f32; - bc_map[CSINN_OP_ARANGE][CSINN_DTYPE_FLOAT32] = csi_ref_arange_f32; - bc_map[CSINN_OP_ARGMAX][CSINN_DTYPE_FLOAT32] = csi_ref_argmax_stride_i32_f32; - bc_map[CSINN_OP_ARGMIN][CSINN_DTYPE_FLOAT32] = csi_ref_argmin_stride_i32_f32; - bc_map[CSINN_OP_ASIN][CSINN_DTYPE_FLOAT32] = csi_ref_asin_f32; - bc_map[CSINN_OP_ASINH][CSINN_DTYPE_FLOAT32] = csi_ref_asinh_f32; - bc_map[CSINN_OP_ATAN][CSINN_DTYPE_FLOAT32] = csi_ref_atan_f32; - bc_map[CSINN_OP_ATANH][CSINN_DTYPE_FLOAT32] = csi_ref_atanh_f32; - bc_map[CSINN_OP_AVGPOOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_avgpool2d_f32; - bc_map[CSINN_OP_AVGPOOL3D][CSINN_DTYPE_FLOAT32] = csi_ref_avgpool3d_f32; - bc_map[CSINN_OP_BN][CSINN_DTYPE_FLOAT32] = csi_ref_batch_normalization_f32; - bc_map[CSINN_OP_BATCH_TO_SPACE][CSINN_DTYPE_FLOAT32] = csi_ref_batch_to_space_f32; - bc_map[CSINN_OP_BROADCOST][CSINN_DTYPE_FLOAT32] = csi_ref_broadcast_to_f32; - bc_map[CSINN_OP_CACHE_MATMUL][CSINN_DTYPE_FLOAT32] = csi_ref_cache_matmul_f32; - bc_map[CSINN_OP_CACHE_CONV1D][CSINN_DTYPE_FLOAT32] = 
csi_ref_cache_conv1d_f32; - bc_map[CSINN_OP_CEIL][CSINN_DTYPE_FLOAT32] = csi_ref_ceil_f32; - bc_map[CSINN_OP_CLIP][CSINN_DTYPE_FLOAT32] = csi_ref_clip_f32; - bc_map[CSINN_OP_CONCAT][CSINN_DTYPE_FLOAT32] = csi_ref_concat_f32; - bc_map[CSINN_OP_CONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_conv2d_f32; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_depthwise_conv2d_f32; - bc_map[CSINN_OP_GROUP_CONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_group_conv2d_f32; - bc_map[CSINN_OP_CONV3D][CSINN_DTYPE_FLOAT32] = csi_ref_conv3d_f32; - bc_map[CSINN_OP_DECONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_deconv2d_f32; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_depthwise_deconv2d_f32; - bc_map[CSINN_OP_DECONV3D][CSINN_DTYPE_FLOAT32] = csi_ref_deconv3d_f32; - bc_map[CSINN_OP_COS][CSINN_DTYPE_FLOAT32] = csi_ref_cos_f32; - bc_map[CSINN_OP_COSH][CSINN_DTYPE_FLOAT32] = csi_ref_cosh_f32; - bc_map[CSINN_OP_CUMPROD][CSINN_DTYPE_FLOAT32] = csi_ref_cumprod_f32; - bc_map[CSINN_OP_CUMSUM][CSINN_DTYPE_FLOAT32] = csi_ref_cumsum_f32; - bc_map[CSINN_OP_DEPTH_TO_SPACE][CSINN_DTYPE_FLOAT32] = csi_ref_depth_to_space_f32; - bc_map[CSINN_OP_DIV][CSINN_DTYPE_FLOAT32] = csi_ref_div_f32; - bc_map[CSINN_OP_ELU][CSINN_DTYPE_FLOAT32] = csi_ref_elu_f32; - bc_map[CSINN_OP_EQUANL][CSINN_DTYPE_FLOAT32] = csi_ref_equal_f32; - bc_map[CSINN_OP_ERF][CSINN_DTYPE_FLOAT32] = csi_ref_erf_f32; - bc_map[CSINN_OP_EXP][CSINN_DTYPE_FLOAT32] = csi_ref_exp_f32; - bc_map[CSINN_OP_EXPAND_DIMS][CSINN_DTYPE_FLOAT32] = csi_ref_expand_dims_f32; - bc_map[CSINN_OP_EXPM1][CSINN_DTYPE_FLOAT32] = csi_ref_expm1_f32; - bc_map[CSINN_OP_FLATTEN][CSINN_DTYPE_FLOAT32] = csi_ref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE][CSINN_DTYPE_FLOAT32] = csi_ref_floor_divide_f32; - bc_map[CSINN_OP_FLOOR_MOD][CSINN_DTYPE_FLOAT32] = csi_ref_floor_mod_f32; - bc_map[CSINN_OP_FLOOR][CSINN_DTYPE_FLOAT32] = csi_ref_floor_f32; - bc_map[CSINN_OP_FSMN][CSINN_DTYPE_FLOAT32] = csi_ref_fsmn_f32; - 
bc_map[CSINN_OP_FULLYCONNECTED][CSINN_DTYPE_FLOAT32] = csi_ref_fullyconnected_f32; - bc_map[CSINN_OP_GATHER_ND][CSINN_DTYPE_FLOAT32] = csi_ref_gather_nd_f32; - bc_map[CSINN_OP_GATHER][CSINN_DTYPE_FLOAT32] = csi_ref_gather_f32; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_global_avgpool2d_f32; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_global_maxpool2d_f32; - bc_map[CSINN_OP_GREATHER_EQUAL][CSINN_DTYPE_FLOAT32] = csi_ref_greater_equal_f32; - bc_map[CSINN_OP_GREATHER][CSINN_DTYPE_FLOAT32] = csi_ref_greater_f32; - bc_map[CSINN_OP_HARD_SIGMOID][CSINN_DTYPE_FLOAT32] = csi_ref_hard_sigmoid_f32; - bc_map[CSINN_OP_IM2COL][CSINN_DTYPE_FLOAT32] = csi_ref_im2col_f32; - bc_map[CSINN_OP_L2N][CSINN_DTYPE_FLOAT32] = csi_ref_l2_normalization_f32; - bc_map[CSINN_OP_LEAKY_RELU][CSINN_DTYPE_FLOAT32] = csi_ref_leaky_relu_f32; - bc_map[CSINN_OP_LESS_EQUAL][CSINN_DTYPE_FLOAT32] = csi_ref_less_equal_f32; - bc_map[CSINN_OP_LESS][CSINN_DTYPE_FLOAT32] = csi_ref_less_f32; - bc_map[CSINN_OP_LOG_SOFTMAX][CSINN_DTYPE_FLOAT32] = csi_ref_log_softmax_f32; - bc_map[CSINN_OP_LOG][CSINN_DTYPE_FLOAT32] = csi_ref_log_f32; - bc_map[CSINN_OP_LOG1P][CSINN_DTYPE_FLOAT32] = csi_ref_log1p_f32; - bc_map[CSINN_OP_LOGICAL_AND][CSINN_DTYPE_FLOAT32] = csi_ref_logical_and_f32; - bc_map[CSINN_OP_LOGICAL_NOT][CSINN_DTYPE_FLOAT32] = csi_ref_logical_not_f32; - bc_map[CSINN_OP_LOGICAL_OR][CSINN_DTYPE_FLOAT32] = csi_ref_logical_or_f32; - bc_map[CSINN_OP_LOGICAL_XOR][CSINN_DTYPE_FLOAT32] = csi_ref_logical_xor_f32; - bc_map[CSINN_OP_LRN][CSINN_DTYPE_FLOAT32] = csi_ref_lrn_f32; - bc_map[CSINN_OP_MATMUL][CSINN_DTYPE_FLOAT32] = csi_ref_matmul_f32; - bc_map[CSINN_OP_MAX][CSINN_DTYPE_FLOAT32] = csi_ref_max_stride_f32; - bc_map[CSINN_OP_MAXIMUM][CSINN_DTYPE_FLOAT32] = csi_ref_maximum_f32; - bc_map[CSINN_OP_MAXPOOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_maxpool2d_f32; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT][CSINN_DTYPE_FLOAT32] = csi_ref_maxpool2d_locat_f32; - 
bc_map[CSINN_OP_MAXPOOL3D][CSINN_DTYPE_FLOAT32] = csi_ref_maxpool3d_f32; - bc_map[CSINN_OP_MEAN][CSINN_DTYPE_FLOAT32] = csi_ref_mean_stride_f32; - bc_map[CSINN_OP_MEAN_STRIDE][CSINN_DTYPE_FLOAT32] = csi_ref_mean_stride_f32; - bc_map[CSINN_OP_MIN][CSINN_DTYPE_FLOAT32] = csi_ref_min_stride_f32; - bc_map[CSINN_OP_MINIMUM][CSINN_DTYPE_FLOAT32] = csi_ref_minimum_f32; - bc_map[CSINN_OP_MOD][CSINN_DTYPE_FLOAT32] = csi_ref_mod_f32; - bc_map[CSINN_OP_MUL][CSINN_DTYPE_FLOAT32] = csi_ref_mul_f32; - bc_map[CSINN_OP_NEGATIIVE][CSINN_DTYPE_FLOAT32] = csi_ref_negative_f32; - bc_map[CSINN_OP_NON_MAX_SUPPRESSION][CSINN_DTYPE_FLOAT32] = csi_ref_non_max_suppression_std; - bc_map[CSINN_OP_NOT_EQUAL][CSINN_DTYPE_FLOAT32] = csi_ref_not_equal_f32; - bc_map[CSINN_OP_PAD][CSINN_DTYPE_FLOAT32] = csi_ref_pad_f32; - bc_map[CSINN_OP_POWER][CSINN_DTYPE_FLOAT32] = csi_ref_power_f32; - bc_map[CSINN_OP_PRELU][CSINN_DTYPE_FLOAT32] = csi_ref_prelu_f32; - bc_map[CSINN_OP_PROD][CSINN_DTYPE_FLOAT32] = csi_ref_prod_stride_f32; - bc_map[CSINN_OP_PROPOSAL][CSINN_DTYPE_FLOAT32] = csi_ref_proposal_f32; - bc_map[CSINN_OP_PSROIPOOLING][CSINN_DTYPE_FLOAT32] = csi_ref_psroipooling_f32; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_logsumexp_f32; - bc_map[CSINN_OP_REDUCE_MAX][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_max_f32; - bc_map[CSINN_OP_REDUCE_MEAN][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_mean_f32; - bc_map[CSINN_OP_REDUCE_MIN][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_min_f32; - bc_map[CSINN_OP_REDUCE_PROD][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_prod_f32; - bc_map[CSINN_OP_REDUCE_SUM][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_sum_f32; - bc_map[CSINN_OP_RELU][CSINN_DTYPE_FLOAT32] = csi_ref_relu_f32; - bc_map[CSINN_OP_RELU1][CSINN_DTYPE_FLOAT32] = csi_ref_relu1_f32; - bc_map[CSINN_OP_RELU6][CSINN_DTYPE_FLOAT32] = csi_ref_relu6_f32; - bc_map[CSINN_OP_RELUN][CSINN_DTYPE_FLOAT32] = csi_ref_relun_f32; - bc_map[CSINN_OP_RESHAPE][CSINN_DTYPE_FLOAT32] = csi_ref_reshape; - 
bc_map[CSINN_OP_RESIZE][CSINN_DTYPE_FLOAT32] = csi_ref_resize_f32; - bc_map[CSINN_OP_REVERSE][CSINN_DTYPE_FLOAT32] = csi_ref_reverse_f32; - bc_map[CSINN_OP_ROIALIGN][CSINN_DTYPE_FLOAT32] = csi_ref_roi_align_f32; - bc_map[CSINN_OP_ROIPOOL][CSINN_DTYPE_FLOAT32] = csi_ref_roipool_f32; - bc_map[CSINN_OP_ROUND][CSINN_DTYPE_FLOAT32] = csi_ref_round_f32; - bc_map[CSINN_OP_RSQRT][CSINN_DTYPE_FLOAT32] = csi_ref_rsqrt_f32; - bc_map[CSINN_OP_SCATTER_ND][CSINN_DTYPE_FLOAT32] = csi_ref_scatter_nd_f32; - bc_map[CSINN_OP_SEGMENT_MAX][CSINN_DTYPE_FLOAT32] = csi_ref_segment_max_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_max_f32; - bc_map[CSINN_OP_SEGMENT_MEAN][CSINN_DTYPE_FLOAT32] = csi_ref_segment_mean_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_mean_f32; - bc_map[CSINN_OP_SEGMENT_MIN][CSINN_DTYPE_FLOAT32] = csi_ref_segment_min_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_min_f32; - bc_map[CSINN_OP_SEGMENT_PROD][CSINN_DTYPE_FLOAT32] = csi_ref_segment_prod_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_prod_f32; - bc_map[CSINN_OP_SEGMENT_SUM][CSINN_DTYPE_FLOAT32] = csi_ref_segment_sum_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_sum_f32; - bc_map[CSINN_OP_SHUFFLE_CHANNEL][CSINN_DTYPE_FLOAT32] = csi_ref_shuffle_channel_f32; - bc_map[CSINN_OP_SIGMOID][CSINN_DTYPE_FLOAT32] = csi_ref_sigmoid_f32; - bc_map[CSINN_OP_SIGN][CSINN_DTYPE_FLOAT32] = csi_ref_sign_f32; - bc_map[CSINN_OP_SIN][CSINN_DTYPE_FLOAT32] = csi_ref_sin_f32; - bc_map[CSINN_OP_SINH][CSINN_DTYPE_FLOAT32] = csi_ref_sinh_f32; - bc_map[CSINN_OP_SLICE][CSINN_DTYPE_FLOAT32] = csi_ref_slice_f32; - bc_map[CSINN_OP_SOFTMAX][CSINN_DTYPE_FLOAT32] = csi_ref_softmax_f32; - bc_map[CSINN_OP_SOFTPLUS][CSINN_DTYPE_FLOAT32] = csi_ref_softplus_f32; - bc_map[CSINN_OP_SOFTRELU][CSINN_DTYPE_FLOAT32] = 
csi_ref_softrelu_f32; - bc_map[CSINN_OP_SOFTSIGN][CSINN_DTYPE_FLOAT32] = csi_ref_softsign_f32; - bc_map[CSINN_OP_SPACE_TO_BATCH][CSINN_DTYPE_FLOAT32] = csi_ref_space_to_batch_f32; - bc_map[CSINN_OP_SPACE_TO_DEPTH][CSINN_DTYPE_FLOAT32] = csi_ref_space_to_depth_f32; - bc_map[CSINN_OP_SPLIT][CSINN_DTYPE_FLOAT32] = csi_ref_split_f32; - bc_map[CSINN_OP_SQRT][CSINN_DTYPE_FLOAT32] = csi_ref_sqrt_f32; - bc_map[CSINN_OP_SQUARE][CSINN_DTYPE_FLOAT32] = csi_ref_square_f32; - bc_map[CSINN_OP_STACK][CSINN_DTYPE_FLOAT32] = csi_ref_stack_f32; - bc_map[CSINN_OP_STRIDED_SLICE][CSINN_DTYPE_FLOAT32] = csi_ref_strided_slice_f32; - bc_map[CSINN_OP_SUB][CSINN_DTYPE_FLOAT32] = csi_ref_sub_f32; - bc_map[CSINN_OP_SUM][CSINN_DTYPE_FLOAT32] = csi_ref_sum_stride_f32; - bc_map[CSINN_OP_TAN][CSINN_DTYPE_FLOAT32] = csi_ref_tan_f32; - bc_map[CSINN_OP_TANH][CSINN_DTYPE_FLOAT32] = csi_ref_tanh_f32; - bc_map[CSINN_OP_THRESHOLD_RELU][CSINN_DTYPE_FLOAT32] = csi_ref_threshold_relu_f32; - bc_map[CSINN_OP_TILE][CSINN_DTYPE_FLOAT32] = csi_ref_tile_f32; - bc_map[CSINN_OP_TOPK][CSINN_DTYPE_FLOAT32] = csi_ref_topk_f32; - bc_map[CSINN_OP_TRANSPOSE][CSINN_DTYPE_FLOAT32] = csi_ref_transpose; - bc_map[CSINN_OP_TRUNC][CSINN_DTYPE_FLOAT32] = csi_ref_trunc_f32; - bc_map[CSINN_OP_UNPOOLING][CSINN_DTYPE_FLOAT32] = csi_ref_unpooling_f32; - bc_map[CSINN_OP_YUV_RGB_SCALE][CSINN_DTYPE_FLOAT32] = csi_ref_yuv_rgb_scale_f32; - bc_map[CSINN_OP_COL2IM][CSINN_DTYPE_FLOAT32] = csi_ref_col2im_f32; - bc_map[CSINN_OP_ISNAN][CSINN_DTYPE_FLOAT32] = csi_ref_isnan_bool_f32; - bc_map[CSINN_OP_L2POOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_l2pool_f32; + cb_map[CSINN_OP_ABS][CSINN_DTYPE_FLOAT32].exec = shl_ref_abs_f32; + cb_map[CSINN_OP_ACOS][CSINN_DTYPE_FLOAT32].exec = shl_ref_acos_f32; + cb_map[CSINN_OP_ACOSH][CSINN_DTYPE_FLOAT32].exec = shl_ref_acosh_f32; + cb_map[CSINN_OP_ADD][CSINN_DTYPE_FLOAT32].exec = shl_ref_add_f32; + cb_map[CSINN_OP_ARANGE][CSINN_DTYPE_FLOAT32].exec = shl_ref_arange_f32; + 
cb_map[CSINN_OP_ARGMAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_argmax_stride_i32_f32; + cb_map[CSINN_OP_ARGMIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_argmin_stride_i32_f32; + cb_map[CSINN_OP_ASIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_asin_f32; + cb_map[CSINN_OP_ASINH][CSINN_DTYPE_FLOAT32].exec = shl_ref_asinh_f32; + cb_map[CSINN_OP_ATAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_atan_f32; + cb_map[CSINN_OP_ATANH][CSINN_DTYPE_FLOAT32].exec = shl_ref_atanh_f32; + cb_map[CSINN_OP_AVGPOOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_avgpool2d_f32; + cb_map[CSINN_OP_AVGPOOL3D][CSINN_DTYPE_FLOAT32].exec = shl_ref_avgpool3d_f32; + cb_map[CSINN_OP_BN][CSINN_DTYPE_FLOAT32].exec = shl_ref_batch_normalization_f32; + cb_map[CSINN_OP_BATCH_TO_SPACE][CSINN_DTYPE_FLOAT32].exec = shl_ref_batch_to_space_f32; + cb_map[CSINN_OP_BROADCOST][CSINN_DTYPE_FLOAT32].exec = shl_ref_broadcast_to_f32; + cb_map[CSINN_OP_CACHE_MATMUL][CSINN_DTYPE_FLOAT32].exec = shl_ref_cache_matmul_f32; + cb_map[CSINN_OP_CACHE_MATMUL][CSINN_DTYPE_FLOAT32].init = shl_ref_cache_matmul_init; + cb_map[CSINN_OP_CACHE_CONV1D][CSINN_DTYPE_FLOAT32].exec = shl_ref_cache_conv1d_f32; + cb_map[CSINN_OP_CACHE_CONV1D][CSINN_DTYPE_FLOAT32].init = shl_ref_cache_conv1d_init; + cb_map[CSINN_OP_CEIL][CSINN_DTYPE_FLOAT32].exec = shl_ref_ceil_f32; + cb_map[CSINN_OP_CLIP][CSINN_DTYPE_FLOAT32].exec = shl_ref_clip_f32; + cb_map[CSINN_OP_CONCAT][CSINN_DTYPE_FLOAT32].exec = shl_ref_concat_f32; + cb_map[CSINN_OP_CONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_conv2d_f32; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_depthwise_conv2d_f32; + cb_map[CSINN_OP_GROUP_CONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_group_conv2d_f32; + cb_map[CSINN_OP_CONV3D][CSINN_DTYPE_FLOAT32].exec = shl_ref_conv3d_f32; + cb_map[CSINN_OP_DECONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_deconv2d_f32; + cb_map[CSINN_OP_DEPTHWISE_DECONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_depthwise_deconv2d_f32; + cb_map[CSINN_OP_DECONV3D][CSINN_DTYPE_FLOAT32].exec = 
shl_ref_deconv3d_f32; + cb_map[CSINN_OP_COS][CSINN_DTYPE_FLOAT32].exec = shl_ref_cos_f32; + cb_map[CSINN_OP_COSH][CSINN_DTYPE_FLOAT32].exec = shl_ref_cosh_f32; + cb_map[CSINN_OP_CUMPROD][CSINN_DTYPE_FLOAT32].exec = shl_ref_cumprod_f32; + cb_map[CSINN_OP_CUMSUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_cumsum_f32; + cb_map[CSINN_OP_DEPTH_TO_SPACE][CSINN_DTYPE_FLOAT32].exec = shl_ref_depth_to_space_f32; + cb_map[CSINN_OP_DIV][CSINN_DTYPE_FLOAT32].exec = shl_ref_div_f32; + cb_map[CSINN_OP_ELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_elu_f32; + cb_map[CSINN_OP_EQUANL][CSINN_DTYPE_FLOAT32].exec = shl_ref_equal_f32; + cb_map[CSINN_OP_ERF][CSINN_DTYPE_FLOAT32].exec = shl_ref_erf_f32; + cb_map[CSINN_OP_EXP][CSINN_DTYPE_FLOAT32].exec = shl_ref_exp_f32; + cb_map[CSINN_OP_EXPAND_DIMS][CSINN_DTYPE_FLOAT32].exec = shl_ref_expand_dims_f32; + cb_map[CSINN_OP_EXPM1][CSINN_DTYPE_FLOAT32].exec = shl_ref_expm1_f32; + cb_map[CSINN_OP_FLATTEN][CSINN_DTYPE_FLOAT32].exec = shl_ref_flatten; + cb_map[CSINN_OP_FLATTEN][CSINN_DTYPE_FLOAT32].init = shl_ref_flatten_init; + cb_map[CSINN_OP_FLOOR_DIVIDE][CSINN_DTYPE_FLOAT32].exec = shl_ref_floor_divide_f32; + cb_map[CSINN_OP_FLOOR_MOD][CSINN_DTYPE_FLOAT32].exec = shl_ref_floor_mod_f32; + cb_map[CSINN_OP_FLOOR][CSINN_DTYPE_FLOAT32].exec = shl_ref_floor_f32; + cb_map[CSINN_OP_FSMN][CSINN_DTYPE_FLOAT32].exec = shl_ref_fsmn_f32; + cb_map[CSINN_OP_FULLYCONNECTED][CSINN_DTYPE_FLOAT32].exec = shl_ref_fullyconnected_f32; + cb_map[CSINN_OP_GATHER_ND][CSINN_DTYPE_FLOAT32].exec = shl_ref_gather_nd_f32; + cb_map[CSINN_OP_GATHER][CSINN_DTYPE_FLOAT32].exec = shl_ref_gather_f32; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_global_avgpool2d_f32; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_global_maxpool2d_f32; + cb_map[CSINN_OP_GREATHER_EQUAL][CSINN_DTYPE_FLOAT32].exec = shl_ref_greater_equal_f32; + cb_map[CSINN_OP_GREATHER][CSINN_DTYPE_FLOAT32].exec = shl_ref_greater_f32; + 
cb_map[CSINN_OP_HARD_SIGMOID][CSINN_DTYPE_FLOAT32].exec = shl_ref_hard_sigmoid_f32; + cb_map[CSINN_OP_IM2COL][CSINN_DTYPE_FLOAT32].exec = shl_ref_im2col_f32; + cb_map[CSINN_OP_L2N][CSINN_DTYPE_FLOAT32].exec = shl_ref_l2_normalization_f32; + cb_map[CSINN_OP_LEAKY_RELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_leaky_relu_f32; + cb_map[CSINN_OP_LESS_EQUAL][CSINN_DTYPE_FLOAT32].exec = shl_ref_less_equal_f32; + cb_map[CSINN_OP_LESS][CSINN_DTYPE_FLOAT32].exec = shl_ref_less_f32; + cb_map[CSINN_OP_LOG_SOFTMAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_log_softmax_f32; + cb_map[CSINN_OP_LOG][CSINN_DTYPE_FLOAT32].exec = shl_ref_log_f32; + cb_map[CSINN_OP_LOG1P][CSINN_DTYPE_FLOAT32].exec = shl_ref_log1p_f32; + cb_map[CSINN_OP_LOGICAL_AND][CSINN_DTYPE_FLOAT32].exec = shl_ref_logical_and_f32; + cb_map[CSINN_OP_LOGICAL_NOT][CSINN_DTYPE_FLOAT32].exec = shl_ref_logical_not_f32; + cb_map[CSINN_OP_LOGICAL_OR][CSINN_DTYPE_FLOAT32].exec = shl_ref_logical_or_f32; + cb_map[CSINN_OP_LOGICAL_XOR][CSINN_DTYPE_FLOAT32].exec = shl_ref_logical_xor_f32; + cb_map[CSINN_OP_LRN][CSINN_DTYPE_FLOAT32].exec = shl_ref_lrn_f32; + cb_map[CSINN_OP_MATMUL][CSINN_DTYPE_FLOAT32].exec = shl_ref_matmul_f32; + cb_map[CSINN_OP_MAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_max_stride_f32; + cb_map[CSINN_OP_MAXIMUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_maximum_f32; + cb_map[CSINN_OP_MAXPOOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_maxpool2d_f32; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT][CSINN_DTYPE_FLOAT32].exec = shl_ref_maxpool2d_locat_f32; + cb_map[CSINN_OP_MAXPOOL3D][CSINN_DTYPE_FLOAT32].exec = shl_ref_maxpool3d_f32; + cb_map[CSINN_OP_MEAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_mean_stride_f32; + cb_map[CSINN_OP_MEAN_STRIDE][CSINN_DTYPE_FLOAT32].exec = shl_ref_mean_stride_f32; + cb_map[CSINN_OP_MIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_min_stride_f32; + cb_map[CSINN_OP_MINIMUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_minimum_f32; + cb_map[CSINN_OP_MOD][CSINN_DTYPE_FLOAT32].exec = shl_ref_mod_f32; + 
cb_map[CSINN_OP_MUL][CSINN_DTYPE_FLOAT32].exec = shl_ref_mul_f32; + cb_map[CSINN_OP_NEGATIIVE][CSINN_DTYPE_FLOAT32].exec = shl_ref_negative_f32; + cb_map[CSINN_OP_NON_MAX_SUPPRESSION][CSINN_DTYPE_FLOAT32].exec = + shl_ref_non_max_suppression_std; + cb_map[CSINN_OP_NOT_EQUAL][CSINN_DTYPE_FLOAT32].exec = shl_ref_not_equal_f32; + cb_map[CSINN_OP_PAD][CSINN_DTYPE_FLOAT32].exec = shl_ref_pad_f32; + cb_map[CSINN_OP_POWER][CSINN_DTYPE_FLOAT32].exec = shl_ref_power_f32; + cb_map[CSINN_OP_PRELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_prelu_f32; + cb_map[CSINN_OP_PROD][CSINN_DTYPE_FLOAT32].exec = shl_ref_prod_stride_f32; + cb_map[CSINN_OP_PROPOSAL][CSINN_DTYPE_FLOAT32].exec = shl_ref_proposal_f32; + cb_map[CSINN_OP_PSROIPOOLING][CSINN_DTYPE_FLOAT32].exec = shl_ref_psroipooling_f32; + cb_map[CSINN_OP_REDUCE_LOGSUMEXP][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_logsumexp_f32; + cb_map[CSINN_OP_REDUCE_MAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_max_f32; + cb_map[CSINN_OP_REDUCE_MEAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_mean_f32; + cb_map[CSINN_OP_REDUCE_MIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_min_f32; + cb_map[CSINN_OP_REDUCE_PROD][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_prod_f32; + cb_map[CSINN_OP_REDUCE_SUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_sum_f32; + cb_map[CSINN_OP_RELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_relu_f32; + cb_map[CSINN_OP_RELU1][CSINN_DTYPE_FLOAT32].exec = shl_ref_relu1_f32; + cb_map[CSINN_OP_RELU6][CSINN_DTYPE_FLOAT32].exec = shl_ref_relu6_f32; + cb_map[CSINN_OP_RELUN][CSINN_DTYPE_FLOAT32].exec = shl_ref_relun_f32; + cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_FLOAT32].exec = shl_ref_reshape; + cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_FLOAT32].init = shl_ref_reshape_init; + cb_map[CSINN_OP_RESIZE][CSINN_DTYPE_FLOAT32].exec = shl_ref_resize_f32; + cb_map[CSINN_OP_REVERSE][CSINN_DTYPE_FLOAT32].exec = shl_ref_reverse_f32; + cb_map[CSINN_OP_ROIALIGN][CSINN_DTYPE_FLOAT32].exec = shl_ref_roi_align_f32; + 
cb_map[CSINN_OP_ROIPOOL][CSINN_DTYPE_FLOAT32].exec = shl_ref_roipool_f32; + cb_map[CSINN_OP_ROUND][CSINN_DTYPE_FLOAT32].exec = shl_ref_round_f32; + cb_map[CSINN_OP_RSQRT][CSINN_DTYPE_FLOAT32].exec = shl_ref_rsqrt_f32; + cb_map[CSINN_OP_SCATTER_ND][CSINN_DTYPE_FLOAT32].exec = shl_ref_scatter_nd_f32; + cb_map[CSINN_OP_SEGMENT_MAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_max_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MAX][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_max_f32; + cb_map[CSINN_OP_SEGMENT_MEAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_mean_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_mean_f32; + cb_map[CSINN_OP_SEGMENT_MIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_min_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MIN][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_min_f32; + cb_map[CSINN_OP_SEGMENT_PROD][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_prod_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_PROD][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_prod_f32; + cb_map[CSINN_OP_SEGMENT_SUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_sum_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_SUM][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_sum_f32; + cb_map[CSINN_OP_SHUFFLE_CHANNEL][CSINN_DTYPE_FLOAT32].exec = shl_ref_shuffle_channel_f32; + cb_map[CSINN_OP_SIGMOID][CSINN_DTYPE_FLOAT32].exec = shl_ref_sigmoid_f32; + cb_map[CSINN_OP_SIGN][CSINN_DTYPE_FLOAT32].exec = shl_ref_sign_f32; + cb_map[CSINN_OP_SIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_sin_f32; + cb_map[CSINN_OP_SINH][CSINN_DTYPE_FLOAT32].exec = shl_ref_sinh_f32; + cb_map[CSINN_OP_SLICE][CSINN_DTYPE_FLOAT32].exec = shl_ref_slice_f32; + cb_map[CSINN_OP_SOFTMAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_softmax_f32; + cb_map[CSINN_OP_SOFTPLUS][CSINN_DTYPE_FLOAT32].exec = shl_ref_softplus_f32; + cb_map[CSINN_OP_SOFTRELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_softrelu_f32; + cb_map[CSINN_OP_SOFTSIGN][CSINN_DTYPE_FLOAT32].exec = 
shl_ref_softsign_f32; + cb_map[CSINN_OP_SPACE_TO_BATCH][CSINN_DTYPE_FLOAT32].exec = shl_ref_space_to_batch_f32; + cb_map[CSINN_OP_SPACE_TO_DEPTH][CSINN_DTYPE_FLOAT32].exec = shl_ref_space_to_depth_f32; + cb_map[CSINN_OP_SPLIT][CSINN_DTYPE_FLOAT32].exec = shl_ref_split_f32; + cb_map[CSINN_OP_SQRT][CSINN_DTYPE_FLOAT32].exec = shl_ref_sqrt_f32; + cb_map[CSINN_OP_SQUARE][CSINN_DTYPE_FLOAT32].exec = shl_ref_square_f32; + cb_map[CSINN_OP_STACK][CSINN_DTYPE_FLOAT32].exec = shl_ref_stack_f32; + cb_map[CSINN_OP_STRIDED_SLICE][CSINN_DTYPE_FLOAT32].exec = shl_ref_strided_slice_f32; + cb_map[CSINN_OP_SUB][CSINN_DTYPE_FLOAT32].exec = shl_ref_sub_f32; + cb_map[CSINN_OP_SUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_sum_stride_f32; + cb_map[CSINN_OP_TAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_tan_f32; + cb_map[CSINN_OP_TANH][CSINN_DTYPE_FLOAT32].exec = shl_ref_tanh_f32; + cb_map[CSINN_OP_THRESHOLD_RELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_threshold_relu_f32; + cb_map[CSINN_OP_TILE][CSINN_DTYPE_FLOAT32].exec = shl_ref_tile_f32; + cb_map[CSINN_OP_TOPK][CSINN_DTYPE_FLOAT32].exec = shl_ref_topk_f32; + cb_map[CSINN_OP_TRANSPOSE][CSINN_DTYPE_FLOAT32].exec = shl_ref_transpose; + cb_map[CSINN_OP_TRANSPOSE][CSINN_DTYPE_FLOAT32].init = shl_ref_transpose_init; + cb_map[CSINN_OP_TRUNC][CSINN_DTYPE_FLOAT32].exec = shl_ref_trunc_f32; + cb_map[CSINN_OP_UNPOOLING][CSINN_DTYPE_FLOAT32].exec = shl_ref_unpooling_f32; + cb_map[CSINN_OP_YUV_RGB_SCALE][CSINN_DTYPE_FLOAT32].exec = shl_ref_yuv_rgb_scale_f32; + cb_map[CSINN_OP_COL2IM][CSINN_DTYPE_FLOAT32].exec = shl_ref_col2im_f32; + cb_map[CSINN_OP_ISNAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_isnan_bool_f32; + cb_map[CSINN_OP_L2POOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_l2pool_f32; - return bc_map; +#ifdef SHL_BUILD_GREF +#include "shl_gref.h" + shl_register_runtime_callback(CSINN_REF, shl_gref_runtime_callback); + for (int i = 0; i < CSINN_DTYPE_SIZE; i++) { + cb_map[CSINN_OP_ABS][i].est = shl_gref_abs; + cb_map[CSINN_OP_ACOS][i].est = shl_gref_acos; + 
cb_map[CSINN_OP_ACOSH][i].est = shl_gref_acosh; + cb_map[CSINN_OP_ADD][i].est = shl_gref_add; + cb_map[CSINN_OP_ARANGE][i].est = shl_gref_arange; + cb_map[CSINN_OP_ARGMAX][i].est = shl_gref_argmax; + cb_map[CSINN_OP_ARGMIN][i].est = shl_gref_argmin; + cb_map[CSINN_OP_ASIN][i].est = shl_gref_asin; + cb_map[CSINN_OP_ASINH][i].est = shl_gref_asinh; + cb_map[CSINN_OP_ATAN][i].est = shl_gref_atan; + cb_map[CSINN_OP_ATANH][i].est = shl_gref_atanh; + cb_map[CSINN_OP_AVGPOOL2D][i].est = shl_gref_avgpool2d; + cb_map[CSINN_OP_AVGPOOL3D][i].est = shl_gref_avgpool3d; + cb_map[CSINN_OP_BN][i].est = shl_gref_batch_normalization; + cb_map[CSINN_OP_BATCH_TO_SPACE][i].est = shl_gref_batch_to_space; + cb_map[CSINN_OP_BROADCOST][i].est = shl_gref_broadcast_to; + cb_map[CSINN_OP_CACHE_MATMUL][i].est = shl_gref_cache_matmul; + cb_map[CSINN_OP_CACHE_CONV1D][i].est = shl_gref_cache_conv1d; + cb_map[CSINN_OP_CEIL][i].est = shl_gref_ceil; + cb_map[CSINN_OP_CLIP][i].est = shl_gref_clip; + cb_map[CSINN_OP_CONCAT][i].est = shl_gref_concat; + cb_map[CSINN_OP_COS][i].est = shl_gref_cos; + cb_map[CSINN_OP_COSH][i].est = shl_gref_cosh; + cb_map[CSINN_OP_CUMPROD][i].est = shl_gref_cumprod; + cb_map[CSINN_OP_DATA_CONVERT][i].est = shl_gref_data_convert; + cb_map[CSINN_OP_CUMSUM][i].est = shl_gref_cumsum; + cb_map[CSINN_OP_DEPTH_TO_SPACE][i].est = shl_gref_depth_to_space; + cb_map[CSINN_OP_DIV][i].est = shl_gref_div; + cb_map[CSINN_OP_ELU][i].est = shl_gref_elu; + cb_map[CSINN_OP_EQUANL][i].est = shl_gref_equal; + cb_map[CSINN_OP_ERF][i].est = shl_gref_erf; + cb_map[CSINN_OP_EXP][i].est = shl_gref_exp; + cb_map[CSINN_OP_EXPAND_DIMS][i].est = shl_gref_expand_dims; + cb_map[CSINN_OP_EXPM1][i].est = shl_gref_expm1; + cb_map[CSINN_OP_FLATTEN][i].est = shl_gref_flatten; + cb_map[CSINN_OP_FLOOR_DIVIDE][i].est = shl_gref_floor_divide; + cb_map[CSINN_OP_FLOOR_MOD][i].est = shl_gref_floor_mod; + cb_map[CSINN_OP_FLOOR][i].est = shl_gref_floor; + cb_map[CSINN_OP_FSMN][i].est = shl_gref_fsmn; + 
cb_map[CSINN_OP_GATHER_ND][i].est = shl_gref_gather_nd; + cb_map[CSINN_OP_GATHER][i].est = shl_gref_gather; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D][i].est = shl_gref_global_avgpool2d; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D][i].est = shl_gref_global_maxpool2d; + cb_map[CSINN_OP_GREATHER_EQUAL][i].est = shl_gref_greater_equal; + cb_map[CSINN_OP_GREATHER][i].est = shl_gref_greater; + cb_map[CSINN_OP_HARD_SIGMOID][i].est = shl_gref_hard_sigmoid; + cb_map[CSINN_OP_IM2COL][i].est = shl_gref_im2col; + cb_map[CSINN_OP_L2N][i].est = shl_gref_l2_normalization; + cb_map[CSINN_OP_LEAKY_RELU][i].est = shl_gref_leaky_relu; + cb_map[CSINN_OP_LESS_EQUAL][i].est = shl_gref_less_equal; + cb_map[CSINN_OP_LESS][i].est = shl_gref_less; + cb_map[CSINN_OP_LOG_SOFTMAX][i].est = shl_gref_log_softmax; + cb_map[CSINN_OP_LOG][i].est = shl_gref_log; + cb_map[CSINN_OP_LOG1P][i].est = shl_gref_log1p; + cb_map[CSINN_OP_LOGICAL_AND][i].est = shl_gref_logical_and; + cb_map[CSINN_OP_LOGICAL_NOT][i].est = shl_gref_logical_not; + cb_map[CSINN_OP_LOGICAL_OR][i].est = shl_gref_logical_or; + cb_map[CSINN_OP_LOGICAL_XOR][i].est = shl_gref_logical_xor; + cb_map[CSINN_OP_LRN][i].est = shl_gref_lrn; + cb_map[CSINN_OP_MATMUL][i].est = shl_gref_matmul; + cb_map[CSINN_OP_MAX][i].est = shl_gref_max; + cb_map[CSINN_OP_MAXIMUM][i].est = shl_gref_maximum; + cb_map[CSINN_OP_MAXPOOL2D][i].est = shl_gref_maxpool2d; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT][i].est = shl_gref_maxpool2d_locat; + cb_map[CSINN_OP_MAXPOOL3D][i].est = shl_gref_maxpool3d; + cb_map[CSINN_OP_MEAN][i].est = shl_gref_mean; + cb_map[CSINN_OP_MEAN_STRIDE][i].est = shl_gref_mean; + cb_map[CSINN_OP_MIN][i].est = shl_gref_min; + cb_map[CSINN_OP_MINIMUM][i].est = shl_gref_minimum; + cb_map[CSINN_OP_MOD][i].est = shl_gref_mod; + cb_map[CSINN_OP_MUL][i].est = shl_gref_mul; + cb_map[CSINN_OP_NEGATIIVE][i].est = shl_gref_negative; + cb_map[CSINN_OP_NOT_EQUAL][i].est = shl_gref_not_equal; + cb_map[CSINN_OP_PAD][i].est = shl_gref_pad; + cb_map[CSINN_OP_POWER][i].est = 
shl_gref_power; + cb_map[CSINN_OP_PRELU][i].est = shl_gref_prelu; + cb_map[CSINN_OP_PROD][i].est = shl_gref_prod; + cb_map[CSINN_OP_PROPOSAL][i].est = shl_gref_proposal; + cb_map[CSINN_OP_PSROIPOOLING][i].est = shl_gref_psroipooling; + cb_map[CSINN_OP_REDUCE_LOGSUMEXP][i].est = shl_gref_reduce_logsumexp; + cb_map[CSINN_OP_REDUCE_MAX][i].est = shl_gref_reduce_max; + cb_map[CSINN_OP_REDUCE_MEAN][i].est = shl_gref_reduce_mean; + cb_map[CSINN_OP_REDUCE_MIN][i].est = shl_gref_reduce_min; + cb_map[CSINN_OP_REDUCE_PROD][i].est = shl_gref_reduce_prod; + cb_map[CSINN_OP_REDUCE_SUM][i].est = shl_gref_reduce_sum; + cb_map[CSINN_OP_RELU][i].est = shl_gref_relu; + cb_map[CSINN_OP_RELU1][i].est = shl_gref_relu1; + cb_map[CSINN_OP_RELU6][i].est = shl_gref_relu6; + cb_map[CSINN_OP_RELUN][i].est = shl_gref_relun; + cb_map[CSINN_OP_RESHAPE][i].est = shl_gref_reshape; + cb_map[CSINN_OP_RESIZE][i].est = shl_gref_resize; + cb_map[CSINN_OP_REVERSE][i].est = shl_gref_reverse; + cb_map[CSINN_OP_ROIPOOL][i].est = shl_gref_roipool; + cb_map[CSINN_OP_ROUND][i].est = shl_gref_round; + cb_map[CSINN_OP_RSQRT][i].est = shl_gref_rsqrt; + cb_map[CSINN_OP_SEGMENT_MAX][i].est = shl_gref_segment_max; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i].est = shl_gref_segment_max; + cb_map[CSINN_OP_SEGMENT_MEAN][i].est = shl_gref_segment_mean; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i].est = shl_gref_segment_mean; + cb_map[CSINN_OP_SEGMENT_MIN][i].est = shl_gref_segment_min; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i].est = shl_gref_segment_min; + cb_map[CSINN_OP_SEGMENT_PROD][i].est = shl_gref_segment_prod; + cb_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i].est = shl_gref_segment_prod; + cb_map[CSINN_OP_SEGMENT_SUM][i].est = shl_gref_segment_sum; + cb_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i].est = shl_gref_segment_sum; + cb_map[CSINN_OP_SHUFFLE_CHANNEL][i].est = shl_gref_shuffle_channel; + cb_map[CSINN_OP_SIGMOID][i].est = shl_gref_sigmoid; + cb_map[CSINN_OP_SIGN][i].est = shl_gref_sign; + cb_map[CSINN_OP_SIN][i].est = 
shl_gref_sin; + cb_map[CSINN_OP_SINH][i].est = shl_gref_sinh; + cb_map[CSINN_OP_SLICE][i].est = shl_gref_slice; + cb_map[CSINN_OP_SOFTMAX][i].est = shl_gref_softmax; + cb_map[CSINN_OP_SOFTPLUS][i].est = shl_gref_softplus; + cb_map[CSINN_OP_SOFTRELU][i].est = shl_gref_softrelu; + cb_map[CSINN_OP_SOFTSIGN][i].est = shl_gref_softsign; + cb_map[CSINN_OP_SPACE_TO_BATCH][i].est = shl_gref_space_to_batch; + cb_map[CSINN_OP_SPACE_TO_DEPTH][i].est = shl_gref_space_to_depth; + cb_map[CSINN_OP_SQRT][i].est = shl_gref_sqrt; + cb_map[CSINN_OP_STACK][i].est = shl_gref_stack; + cb_map[CSINN_OP_STRIDED_SLICE][i].est = shl_gref_strided_slice; + cb_map[CSINN_OP_SUB][i].est = shl_gref_sub; + cb_map[CSINN_OP_SUM][i].est = shl_gref_sum; + cb_map[CSINN_OP_TAN][i].est = shl_gref_tan; + cb_map[CSINN_OP_TANH][i].est = shl_gref_tanh; + cb_map[CSINN_OP_THRESHOLD_RELU][i].est = shl_gref_threshold_relu; + cb_map[CSINN_OP_TILE][i].est = shl_gref_tile; + cb_map[CSINN_OP_TOPK][i].est = shl_gref_topk; + cb_map[CSINN_OP_TRANSPOSE][i].est = shl_gref_transpose; + cb_map[CSINN_OP_TRUNC][i].est = shl_gref_trunc; + cb_map[CSINN_OP_UNPOOLING][i].est = shl_gref_unpooling; + cb_map[CSINN_OP_YUV_RGB_SCALE][i].est = shl_gref_yuv_rgb_scale; + cb_map[CSINN_OP_CONV2D][i].est = shl_gref_conv2d; + cb_map[CSINN_OP_CONV2D_RELU][i].est = shl_gref_conv2d_relu; + cb_map[CSINN_OP_CONV2D_RELU6][i].est = shl_gref_conv2d_relu6; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][i].est = shl_gref_depthwise_conv2d; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i].est = shl_gref_depthwise_conv2d_relu; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i].est = shl_gref_depthwise_conv2d_relu6; + cb_map[CSINN_OP_GROUP_CONV2D][i].est = shl_gref_group_conv2d; + cb_map[CSINN_OP_CONV3D][i].est = shl_gref_conv3d; + cb_map[CSINN_OP_DECONV2D][i].est = shl_gref_deconv2d; + cb_map[CSINN_OP_DEPTHWISE_DECONV2D][i].est = shl_gref_depthwise_deconv2d; + cb_map[CSINN_OP_DECONV3D][i].est = shl_gref_deconv3d; + cb_map[CSINN_OP_FULLYCONNECTED][i].est = 
shl_gref_fullyconnected; + cb_map[CSINN_OP_SCATTER_ND][i].est = shl_gref_scatter_nd; + cb_map[CSINN_OP_SPLIT][i].est = shl_gref_split; + } +#endif + return cb_map; } -static int get_bc_map_index(int op, int dtype) { return op * CSINN_DTYPE_SIZE + dtype; } +static int get_cb_map_index(int op, int dtype) { return op * CSINN_DTYPE_SIZE + dtype; } +static struct csinn_callback *__cb_map_table_ref; +struct csinn_callback *shl_cb_map_ref(int op, int dtype) +{ + return &__cb_map_table_ref[get_cb_map_index(op, dtype)]; +} -void *csi_bc_map_ref(int op, int dtype) +void shl_target_init_ref() { - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + __cb_map_table_ref = setup_cb_map(); + shl_register_runtime_callback(CSINN_REF, NULL); + shl_register_op_callback(CSINN_REF, shl_cb_map_ref); } diff --git a/source/reference/shape.c b/source/reference/shape.c index 9c16cd56..ec109d06 100644 --- a/source/reference/shape.c +++ b/source/reference/shape.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_shape_i32(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params) +int shl_ref_shape_i32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { int32_t *data = output->data; for (int i = 0; i < input->dim_count; i++) { @@ -31,8 +30,8 @@ int csi_ref_shape_i32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_shape_u8(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params) +int shl_ref_shape_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { uint8_t *data = output->data; for (int i = 0; i < input->dim_count; i++) { @@ -41,8 +40,8 @@ int csi_ref_shape_u8(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_shape_i8(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params) +int shl_ref_shape_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { uint8_t *data = output->data; for (int i = 0; i < input->dim_count; i++) { diff --git a/source/reference/shuffle_channel.c b/source/reference/shuffle_channel.c index 60d38381..b89cfc76 100644 --- a/source/reference/shuffle_channel.c +++ b/source/reference/shuffle_channel.c @@ -1,4 +1,4 @@ - /* +/* * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_shuffle_channel_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params) +static int shl_ref_shuffle_channel_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -51,33 +50,34 @@ static int csi_ref_shuffle_channel_nhwc_f32(struct csi_tensor *input, struct csi return CSINN_TRUE; } -static int csi_ref_shuffle_channel_nchw_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct shuffle_channel_params *params) +static int shl_ref_shuffle_channel_nchw_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_shuffle_channel_params *params) { - struct csi_tensor *input; - struct csi_tensor *output; - input = csi_ref_nchw_to_nhwc_f32(o_input); - output = csi_ref_nchw_to_nhwc_f32(o_output); - csi_ref_shuffle_channel_nhwc_f32(input, output, params); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + struct csinn_tensor *input; + struct csinn_tensor *output; + input = shl_ref_nchw_to_nhwc_f32(o_input); + output = shl_ref_nchw_to_nhwc_f32(o_output); + shl_ref_shuffle_channel_nhwc_f32(input, output, params); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); return CSINN_TRUE; } -int csi_ref_shuffle_channel_f32(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params) +int shl_ref_shuffle_channel_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_shuffle_channel_nchw_f32(input, output, params); + shl_ref_shuffle_channel_nchw_f32(input, output, params); } else if (params->base.layout == 
CSINN_LAYOUT_NHWC) { - csi_ref_shuffle_channel_nhwc_f32(input, output, params); + shl_ref_shuffle_channel_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_shuffle_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params) +int shl_ref_shuffle_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_shuffle_channel_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_shuffle_channel_f32); } diff --git a/source/reference/sigmoid.c b/source/reference/sigmoid.c index c2dd538d..875edee4 100644 --- a/source/reference/sigmoid.c +++ b/source/reference/sigmoid.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_ref_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -38,8 +37,8 @@ int csi_ref_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sigmoid_quant(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_ref_sigmoid_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sigmoid_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sigmoid_f32); } diff --git a/source/reference/sign.c b/source/reference/sign.c index 2a009035..02bdce4c 100644 --- a/source/reference/sign.c +++ b/source/reference/sign.c @@ -16,10 +16,9 @@ * limitations under 
the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" float sign(float v) { @@ -31,8 +30,8 @@ float sign(float v) return -1; } -int csi_ref_sign_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sign_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -47,8 +46,8 @@ int csi_ref_sign_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sign_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sign_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sign_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sign_f32); } diff --git a/source/reference/sin.c b/source/reference/sin.c index eb52de1f..139bbbcd 100644 --- a/source/reference/sin.c +++ b/source/reference/sin.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sin_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_sin_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = sin(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_sin_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_sin_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sin_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sin_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sin_f32); } diff --git a/source/reference/sinh.c b/source/reference/sinh.c index 8faf61ee..b85893e1 100644 --- a/source/reference/sinh.c +++ b/source/reference/sinh.c @@ -16,17 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sinh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = sinh(input_data[i]); @@ -34,8 +33,8 @@ int csi_ref_sinh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sinh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sinh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sinh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sinh_f32); } diff --git a/source/reference/slice.c b/source/reference/slice.c index 1fc048db..d6e661ed 100644 --- a/source/reference/slice.c +++ b/source/reference/slice.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params) +int shl_ref_slice_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -31,9 +30,9 @@ int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, for (int c = params->begin[1]; c < params->end[1]; c++) { for (int h = params->begin[2]; h < params->end[2]; h++) { for (int w = params->begin[3]; w < params->end[3]; w++) { - int32_t input_index = csi_ref_get_index(input->dim, b, c, h, w); + int32_t input_index = shl_ref_get_index(input->dim, b, c, h, w); float out_val = input_data[input_index]; - int32_t out_index = csi_ref_get_index( + int32_t out_index = shl_ref_get_index( output->dim, b - params->begin[0], c - params->begin[1], h - params->begin[2], w - params->begin[3]); output_data[out_index] = out_val; @@ -47,9 +46,9 @@ int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, for (int k = params->begin[2]; k < params->end[2]; k++) { for (int l = params->begin[3]; l < params->end[3]; l++) { for (int m = params->begin[4]; m < params->end[4]; m++) { - int32_t input_index = csi_ref_get_index_5(input->dim, i, j, k, l, m); + int32_t input_index = shl_ref_get_index_5(input->dim, i, j, k, l, m); float out_val = input_data[input_index]; - int32_t out_index = csi_ref_get_index_5( + int32_t out_index = shl_ref_get_index_5( output->dim, i - params->begin[0], j - params->begin[1], k - params->begin[2], l - params->begin[3], m - params->begin[4]); output_data[out_index] = out_val; @@ -62,8 +61,8 @@ int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_slice_quant(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params 
*params) +int shl_ref_slice_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_slice_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_slice_f32); } \ No newline at end of file diff --git a/source/reference/softmax.c b/source/reference/softmax.c index 7199fbd8..de1bee0e 100644 --- a/source/reference/softmax.c +++ b/source/reference/softmax.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_ref_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -69,8 +68,8 @@ int csi_ref_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_softmax_quant(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_ref_softmax_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_softmax_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_softmax_f32); } diff --git a/source/reference/softplus.c b/source/reference/softplus.c index 7f57def9..edeab93b 100644 --- a/source/reference/softplus.c +++ b/source/reference/softplus.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_softplus_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_softplus_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,8 +35,8 @@ int csi_ref_softplus_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_softplus_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_softplus_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_softplus_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_softplus_f32); } diff --git a/source/reference/softrelu.c b/source/reference/softrelu.c index eeee842c..2776b21a 100644 --- a/source/reference/softrelu.c +++ b/source/reference/softrelu.c @@ -16,15 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float softrelu(float x, float y) { return log(1 + exp(fmax(fmin(x, y), y))); } -int csi_ref_softrelu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_softrelu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_softrelu_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_softrelu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_softrelu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_softrelu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_softrelu_f32); } diff --git a/source/reference/softsign.c b/source/reference/softsign.c index 84ed6e55..814d18bb 100644 --- a/source/reference/softsign.c +++ b/source/reference/softsign.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_softsign_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_softsign_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,8 +35,8 @@ int csi_ref_softsign_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_softsign_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_softsign_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_softsign_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_softsign_f32); } diff --git a/source/reference/space_to_batch.c b/source/reference/space_to_batch.c index d54b2a70..74fcf542 100644 --- a/source/reference/space_to_batch.c +++ b/source/reference/space_to_batch.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" // tf.nn.space_to_batch:the input mast a 4-D Tensor with shape [batch, height, width, depth]. 
-int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params) +int shl_ref_space_to_batch_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -46,7 +45,7 @@ int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct csi_tensor *outp for (int out_h = 0; out_h < out_height * block_size; out_h = out_h + block_size) { for (int out_w = 0; out_w < out_width * block_size; out_w = out_w + block_size) { for (int out_c = 0; out_c < in_channel; ++out_c) { - float *temp = (float *)csi_mem_alloc(block_size2 * sizeof(float)); + float *temp = (float *)shl_mem_alloc(block_size2 * sizeof(float)); int h_origin = out_h - params->pad_top; int w_origin = out_w - params->pad_left; for (int h = 0; h < block_size; ++h) { @@ -55,18 +54,18 @@ int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct csi_tensor *outp int w_now = w_origin + w; if (h_now >= 0 && h_now < in_height && w_now >= 0 && w_now < in_width) { int in_addr = - csi_ref_get_index(input->dim, in_b, out_c, h_now, w_now); + shl_ref_get_index(input->dim, in_b, out_c, h_now, w_now); temp[h * block_size + w] = input_data[in_addr]; } } } - int out_start_addr = csi_ref_get_index(output->dim, in_b, out_c, + int out_start_addr = shl_ref_get_index(output->dim, in_b, out_c, out_h / block_size, out_w / block_size); for (int i = 0; i < block_size2; ++i) { output_data[out_start_addr + i * batch * out_channel * out_height * out_width] = temp[i]; } - csi_mem_free(temp); + shl_mem_free(temp); } } } @@ -74,8 +73,8 @@ int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_space_to_batch_quant(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params) +int shl_ref_space_to_batch_quant(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_space_to_batch_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_space_to_batch_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_space_to_batch_f32); } diff --git a/source/reference/space_to_depth.c b/source/reference/space_to_depth.c index 08889470..549ad2c9 100644 --- a/source/reference/space_to_depth.c +++ b/source/reference/space_to_depth.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" // the input->data is a 4-D Tensor with shape [batch, depth, height, width]. -int csi_ref_space_to_depth_f32(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params) +int shl_ref_space_to_depth_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -45,20 +44,20 @@ int csi_ref_space_to_depth_f32(struct csi_tensor *input, struct csi_tensor *outp for (int out_h = 0; out_h < out_height; ++out_h) { for (int out_w = 0; out_w < out_width; ++out_w) { for (int in_c = 0; in_c < in_channel; ++in_c) { - float *temp = (float *)csi_mem_alloc(block_size2 * sizeof(float)); - int in_start_addr = csi_ref_get_index(input->dim, out_b, in_c, + float *temp = (float *)shl_mem_alloc(block_size2 * sizeof(float)); + int in_start_addr = shl_ref_get_index(input->dim, out_b, in_c, out_h * block_size, out_w * block_size); for (int h = 0; h < block_size; h++) { for (int w = 0; w < block_size; w++) { temp[h * block_size + w] = input_data[in_start_addr + h * in_width + w]; } } - int out_start_addr = csi_ref_get_index(output->dim, out_b, in_c, out_h, out_w); + int out_start_addr = shl_ref_get_index(output->dim, out_b, in_c, out_h, out_w); for (int i = 0; i < block_size2; i++) { output_data[out_start_addr + i * in_channel 
* out_height * out_width] = temp[i]; } - csi_mem_free(temp); + shl_mem_free(temp); } } } @@ -66,8 +65,8 @@ int csi_ref_space_to_depth_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_space_to_depth_quant(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params) +int shl_ref_space_to_depth_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_space_to_depth_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_space_to_depth_f32); } diff --git a/source/reference/split.c b/source/reference/split.c index f8d0d6f6..652b86d3 100644 --- a/source/reference/split.c +++ b/source/reference/split.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_split_f32(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params) +int shl_ref_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { int32_t inner_size = 1; int32_t out_size = 1; @@ -62,22 +61,22 @@ int csi_ref_split_f32(struct csi_tensor *input, struct csi_tensor **output, return CSINN_TRUE; } -int csi_ref_split_quant(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params) +int shl_ref_split_quant(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); - struct csi_tensor *foutput[params->output_num]; + struct csinn_tensor *foutput[params->output_num]; for (int i = 0; i < params->output_num; i++) { - foutput[i] = csi_ref_tensor_transform_f32(output[i]); + foutput[i] = 
shl_ref_tensor_transform_f32(output[i]); } - int ret = csi_ref_split_f32(finput, foutput, params); + int ret = shl_ref_split_f32(finput, foutput, params); for (int i = 0; i < params->output_num; i++) { - csi_tensor_data_convert(output[i], foutput[i]); - csi_ref_tensor_transform_free_f32(foutput[i]); + csinn_tensor_data_convert(output[i], foutput[i]); + shl_ref_tensor_transform_free_f32(foutput[i]); } - csi_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(finput); return ret; } \ No newline at end of file diff --git a/source/reference/sqrt.c b/source/reference/sqrt.c index 4ee9fef8..9d722faa 100644 --- a/source/reference/sqrt.c +++ b/source/reference/sqrt.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sqrt_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ int csi_ref_sqrt_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sqrt_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sqrt_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sqrt_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sqrt_f32); } diff --git a/source/reference/square.c b/source/reference/square.c index e01eb739..99d3381c 100644 --- a/source/reference/square.c +++ b/source/reference/square.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_square_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_square_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; diff --git a/source/reference/squeeze.c b/source/reference/squeeze.c index e3346cba..529d9365 100644 --- a/source/reference/squeeze.c +++ b/source/reference/squeeze.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params) +int shl_ref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_byte_size(input); + int size = csinn_tensor_byte_size(input); if (input_data != output_data) { memcpy(output_data, input_data, size); } diff --git a/source/reference/stack.c b/source/reference/stack.c index 7f879d06..f98e0a30 100644 --- a/source/reference/stack.c +++ b/source/reference/stack.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_stack_f32(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params) +int shl_ref_stack_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { int input_count = params->inputs_count; int axis = params->axis; @@ -42,7 +41,7 @@ int csi_ref_stack_f32(struct csi_tensor **input, struct csi_tensor *output, float *output_data = (float *)output->data; for (int i = 0; i < outer_size; ++i) { for (int j = 0; j < input_count; ++j) { - struct csi_tensor *input_item = input[j]; + struct csinn_tensor *input_item = input[j]; float *input_item_data = (float *)input_item->data; const float *input_ptr = input_item_data + i * copy_size; memcpy(output_data, input_ptr, copy_size * sizeof(float)); @@ -52,8 +51,8 @@ int csi_ref_stack_f32(struct csi_tensor **input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_stack_quant(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params) +int shl_ref_stack_quant(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { if (params->axis == -1) { params->axis = input[0]->dim_count - 1; @@ -61,19 +60,19 @@ int csi_ref_stack_quant(struct csi_tensor **input, struct csi_tensor *output, int input_count = params->inputs_count; int ret; - struct csi_tensor *finput[input_count]; - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *finput[input_count]; + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); for (int i = 0; i < input_count; i++) { - finput[i] = csi_ref_tensor_transform_f32(input[i]); + finput[i] = shl_ref_tensor_transform_f32(input[i]); } - ret = csi_ref_stack_f32(finput, foutput, params); + ret = shl_ref_stack_f32(finput, foutput, params); - csi_tensor_data_convert(output, foutput); + 
csinn_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(foutput); + shl_ref_tensor_transform_free_f32(foutput); for (int i = 0; i < input_count; i++) { - csi_ref_tensor_transform_free_f32(finput[i]); + shl_ref_tensor_transform_free_f32(finput[i]); } return ret; } diff --git a/source/reference/strided_slice.c b/source/reference/strided_slice.c index 4d20692a..592c216a 100644 --- a/source/reference/strided_slice.c +++ b/source/reference/strided_slice.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params) +int shl_ref_strided_slice_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -40,6 +39,7 @@ int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *outpu inner_size *= input->dim[i]; } + float *temp_copy = NULL; for (int slice_dim = 0; slice_dim < slice_dim_count; slice_dim++) { int begin = params->begin[slice_dim]; int end = params->end[slice_dim]; @@ -59,8 +59,7 @@ int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *outpu out_size *= inner_size_copy_num; float *temp = - (float *)csi_mem_alloc(outer_size * inner_size * inner_size_copy_num * sizeof(float)); - float *temp_copy = NULL; + (float *)shl_mem_alloc(outer_size * inner_size * inner_size_copy_num * sizeof(float)); float *temp_addr = temp; for (int n = 0; n < outer_size; n++) { for (int i = begin; i < end; i = i + stride) { @@ -70,23 +69,23 @@ int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *outpu input_data += inner_size * input->dim[slice_dim]; } if (temp != NULL) { - csi_mem_free(temp_copy); + shl_mem_free(temp_copy); } 
temp_copy = - (float *)csi_mem_alloc(outer_size * inner_size * inner_size_copy_num * sizeof(float)); + (float *)shl_mem_alloc(outer_size * inner_size * inner_size_copy_num * sizeof(float)); memcpy(temp_copy, temp, outer_size * inner_size * inner_size_copy_num * sizeof(float)); input_data = temp_copy; - csi_mem_free(temp); + shl_mem_free(temp); temp = NULL; } out_size = out_size * inner_size; memcpy(output_data, input_data, out_size * sizeof(float)); - csi_mem_free(input_data); + shl_mem_free(input_data); return CSINN_TRUE; } -int csi_ref_strided_slice_quant(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params) +int shl_ref_strided_slice_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_strided_slice_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_strided_slice_f32); } diff --git a/source/reference/sub.c b/source/reference/sub.c index 119124c2..fcdc9b76 100644 --- a/source/reference/sub.c +++ b/source/reference/sub.c @@ -16,28 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static void element_sub_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = src0[output_idx] - src1[input_idx]; } -int csi_ref_sub_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_sub_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_sub_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_sub_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_sub_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_sub_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_sub_f32); } diff --git a/source/reference/sum.c b/source/reference/sum.c index c68353f5..4f7d6294 100644 --- a/source/reference/sum.c +++ b/source/reference/sum.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sum_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_sum_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_sum_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = 0; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result += val; @@ -55,8 +54,8 @@ int csi_ref_sum_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sum_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_sum_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sum_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sum_stride_f32); } diff --git a/source/reference/tan.c b/source/reference/tan.c index c3d78fd8..46021c7c 100644 --- a/source/reference/tan.c +++ b/source/reference/tan.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_tan_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_tan_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -36,8 +36,8 @@ int csi_ref_tan_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_tan_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_tan_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_tan_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_tan_f32); } diff --git a/source/reference/tanh.c b/source/reference/tanh.c index 243ae068..7948834b 100644 --- a/source/reference/tanh.c +++ b/source/reference/tanh.c @@ -16,17 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_tanh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_tanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = tanh(input_data[i]); @@ -34,12 +33,12 @@ int csi_ref_tanh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_tanh_f64(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_tanh_f64(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { double *input_data = input->data; double *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = tanh(input_data[i]); @@ -47,8 +46,8 @@ int csi_ref_tanh_f64(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_tanh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_tanh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_tanh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_tanh_f32); } diff --git a/source/reference/threshold_relu.c b/source/reference/threshold_relu.c index a5234652..cc81bf06 100644 --- a/source/reference/threshold_relu.c +++ b/source/reference/threshold_relu.c @@ -16,15 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float threshold_relu(float x, float theta) { return x > theta ? x : 0; } -int csi_ref_threshold_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_threshold_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -40,8 +39,8 @@ int csi_ref_threshold_relu_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_threshold_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_threshold_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_threshold_relu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_threshold_relu_f32); } diff --git a/source/reference/tile.c b/source/reference/tile.c index 650ce851..adb93786 100644 --- a/source/reference/tile.c +++ b/source/reference/tile.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static int Multiplication(int32_t *dim, int s, int e) { @@ -30,8 +29,8 @@ static int Multiplication(int32_t *dim, int s, int e) return res; } -int csi_ref_tile_f32(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params) +int shl_ref_tile_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -54,7 +53,7 @@ int csi_ref_tile_f32(struct csi_tensor *input, struct csi_tensor *output, int num = Multiplication(input->dim, 0, dim_idx) / (input->dim[dim_idx]); int step = Multiplication(input->dim, dim_idx, input->dim_count - 1) * Multiplication(params->reps, dim_idx, reps_count - 1) / (params->reps[dim_idx]); - float *temp = (float *)csi_mem_alloc(reps_num * num * step * sizeof(float)); + float *temp = (float *)shl_mem_alloc(reps_num * num * step * sizeof(float)); float *temp_cpy_addr = temp; for (int input_pre_i = 0; input_pre_i < num; input_pre_i++) { for (int rep_i = 0; rep_i < reps_num; rep_i++) { @@ -65,15 +64,15 @@ int csi_ref_tile_f32(struct csi_tensor *input, struct csi_tensor *output, } memcpy(output_data, temp, reps_num * num * step * sizeof(float)); input_data = output_data; - csi_mem_free(temp); + shl_mem_free(temp); temp = NULL; } memcpy(output_data, input_data, out_size * sizeof(float)); return CSINN_TRUE; } -int csi_ref_tile_quant(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params) +int shl_ref_tile_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_tile_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_tile_f32); } diff --git a/source/reference/topk.c b/source/reference/topk.c index 1571dceb..5856e5a7 
100644 --- a/source/reference/topk.c +++ b/source/reference/topk.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, - struct csi_tensor *output2, struct topk_params *params) +int shl_ref_topk_f32(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params) { float *input_data = (float *)input->data; float *values_data = (float *)output1->data; @@ -36,7 +35,7 @@ int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, } float *input_sort_addr = input_data; for (int n = 0; n < inner_size; n++) { - int *flag = (int *)csi_mem_alloc(last_dim * sizeof(int)); + int *flag = (int *)shl_mem_alloc(last_dim * sizeof(int)); for (int i = 0; i < k; i++) { values_data[i] = -FLT_MAX; for (int j = 0; j < last_dim; j++) { @@ -47,7 +46,7 @@ int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, } flag[indices_data[i]] = 1; } - csi_mem_free(flag); + shl_mem_free(flag); flag = NULL; input_sort_addr += last_dim; values_data += k; @@ -56,15 +55,15 @@ int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, return CSINN_TRUE; } -int csi_ref_topk_quant(struct csi_tensor *input, struct csi_tensor *output0, - struct csi_tensor *output1, struct topk_params *params) +int shl_ref_topk_quant(struct csinn_tensor *input, struct csinn_tensor *output0, + struct csinn_tensor *output1, struct csinn_topk_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput0 = csi_ref_tensor_transform_f32(output0); - ret = csi_ref_topk_f32(finput, foutput0, output1, params); - csi_tensor_data_convert(output0, foutput0); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput0); + struct 
csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput0 = shl_ref_tensor_transform_f32(output0); + ret = shl_ref_topk_f32(finput, foutput0, output1, params); + csinn_tensor_data_convert(output0, foutput0); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput0); return ret; } diff --git a/source/reference/transpose.c b/source/reference/transpose.c index 9eba342b..48b63424 100644 --- a/source/reference/transpose.c +++ b/source/reference/transpose.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_transpose_init(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params) +int shl_ref_transpose_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { + struct csinn_callback *cb = params->base.cb; if (input->quant_channel == output->quant_channel) { - int quant_size = input->quant_channel * sizeof(struct csi_quant_info); + int quant_size = input->quant_channel * sizeof(struct csinn_quant_info); int t = memcmp(input->qinfo, output->qinfo, quant_size); if (t == 0) { - params->base.bc = csi_ref_transpose; + cb->exec = shl_ref_transpose; return CSINN_TRUE; } } - params->base.bc = csi_ref_transpose_quant; + cb->exec = shl_ref_transpose_quant; return CSINN_TRUE; } -static void copy_element(struct csi_tensor *input, struct csi_tensor *output, int input_idx, +static void copy_element(struct csinn_tensor *input, struct csinn_tensor *output, int input_idx, int output_idx) { if (input->dtype == CSINN_DTYPE_FLOAT32) { @@ -47,22 +47,24 @@ static void copy_element(struct csi_tensor *input, struct csi_tensor *output, in int8_t *src8 = input->data; int8_t *dest8 = output->data; dest8[output_idx] = src8[input_idx]; - } else if (input->dtype == CSINN_DTYPE_INT16) { + } else if (input->dtype == 
CSINN_DTYPE_INT16 || input->dtype == CSINN_DTYPE_FLOAT16) { int16_t *src16 = input->data; int16_t *dest16 = output->data; dest16[output_idx] = src16[input_idx]; + } else { + shl_debug_error("Transpose unsupport dtype\n"); } } -static void swap(int32_t *out_idx, int32_t *in_idx, struct csi_tensor *input, - struct csi_tensor *output, int32_t *perm, int iter_count) +static void swap(int32_t *out_idx, int32_t *in_idx, struct csinn_tensor *input, + struct csinn_tensor *output, int32_t *perm, int iter_count) { for (out_idx[iter_count] = 0; out_idx[iter_count] < output->dim[iter_count]; out_idx[iter_count]++) { in_idx[perm[iter_count]] = out_idx[iter_count]; if (iter_count == 0) { - int input_idx = csi_ref_get_index_iter(input->dim, input->dim_count - 1, in_idx); - int output_idx = csi_ref_get_index_iter(output->dim, output->dim_count - 1, out_idx); + int input_idx = shl_ref_get_index_iter(input->dim, input->dim_count - 1, in_idx); + int output_idx = shl_ref_get_index_iter(output->dim, output->dim_count - 1, out_idx); copy_element(input, output, input_idx, output_idx); } else { swap(out_idx, in_idx, input, output, perm, iter_count - 1); @@ -70,31 +72,31 @@ static void swap(int32_t *out_idx, int32_t *in_idx, struct csi_tensor *input, } } -int csi_ref_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params) +int shl_ref_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { const int unextended_output_size = output->dim_count; - int32_t *o = csi_mem_alloc(unextended_output_size * sizeof(int32_t)); - int32_t *i = csi_mem_alloc(unextended_output_size * sizeof(int32_t)); + int32_t *o = shl_mem_alloc(unextended_output_size * sizeof(int32_t)); + int32_t *i = shl_mem_alloc(unextended_output_size * sizeof(int32_t)); if (input->dtype != CSINN_DTYPE_FLOAT32 && input->qinfo->scale != output->qinfo->scale && input->qinfo->zero_point != output->qinfo->zero_point) { int ret; - struct 
csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_transpose(finput, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_transpose(finput, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); } else { swap(o, i, input, output, params->permute, unextended_output_size - 1); } - csi_mem_free(o); - csi_mem_free(i); + shl_mem_free(o); + shl_mem_free(i); return CSINN_TRUE; } -int csi_ref_transpose_quant(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params) +int shl_ref_transpose_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_transpose); + return shl_ref_siso_callback_base(input, output, params, shl_ref_transpose); } diff --git a/source/reference/trunc.c b/source/reference/trunc.c index 81f2694a..5f710ef3 100644 --- a/source/reference/trunc.c +++ b/source/reference/trunc.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_trunc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_trunc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ int csi_ref_trunc_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_trunc_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_trunc_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_trunc_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_trunc_f32); } diff --git a/source/reference/unpooling.c b/source/reference/unpooling.c index 7f7eae61..6fad3d05 100644 --- a/source/reference/unpooling.c +++ b/source/reference/unpooling.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_unpooling_nhwc_f32(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params) +static int shl_ref_unpooling_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, + struct csinn_unpooling_params *params) { float *input_data = input->data; int *mask_data = mask->data; @@ -36,19 +36,19 @@ static int csi_ref_unpooling_nhwc_f32(struct csi_tensor *input, struct csi_tenso const int output_height = output->dim[1]; const int output_width = output->dim[2]; - int size = csi_tensor_size(output); + int size = csinn_tensor_size(output); memset(output_data, 0, size * sizeof(float)); for (int b = 0; b < batches; b++) { for (int h = 0; h < input_height; h++) { for (int w = 0; w < input_width; w++) { for (int c = 0; c < depth; c++) { - int index = csi_ref_get_index(input->dim, b, h, w, c); + int index = shl_ref_get_index(input->dim, b, h, w, c); int id = mask_data[index]; if (id < output_height * output_width) { int id_h = id / output_width; int id_w = id % output_width; - int o_index = csi_ref_get_index(output->dim, b, id_h, id_w, c); + int o_index = shl_ref_get_index(output->dim, b, id_h, id_w, c); output_data[o_index] = input_data[index]; } } @@ -58,8 +58,9 @@ static int csi_ref_unpooling_nhwc_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -static int csi_ref_unpooling_nchw_f32(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params) +static int shl_ref_unpooling_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, + struct csinn_unpooling_params *params) { float *input_data = input->data; int *mask_data = mask->data; @@ -73,19 +74,19 @@ static int csi_ref_unpooling_nchw_f32(struct csi_tensor *input, struct csi_tenso const 
int output_height = output->dim[2]; const int output_width = output->dim[3]; - int size = csi_tensor_size(output); + int size = csinn_tensor_size(output); memset(output_data, 0, size * sizeof(float)); for (int b = 0; b < batches; b++) { for (int c = 0; c < depth; c++) { for (int h = 0; h < input_height; h++) { for (int w = 0; w < input_width; w++) { - int index = csi_ref_get_index(input->dim, b, c, h, w); + int index = shl_ref_get_index(input->dim, b, c, h, w); int id = mask_data[index]; if (id < output_height * output_width) { int id_h = id / output_width; int id_w = id % output_width; - int o_index = csi_ref_get_index(output->dim, b, c, id_h, id_w); + int o_index = shl_ref_get_index(output->dim, b, c, id_h, id_w); output_data[o_index] = input_data[index]; } } @@ -95,27 +96,27 @@ static int csi_ref_unpooling_nchw_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_unpooling_f32(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params) +int shl_ref_unpooling_f32(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_unpooling_nchw_f32(input, mask, output, params); + shl_ref_unpooling_nchw_f32(input, mask, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_unpooling_nhwc_f32(input, mask, output, params); + shl_ref_unpooling_nhwc_f32(input, mask, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } return CSINN_TRUE; } -int csi_ref_unpooling_quant(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params) +int shl_ref_unpooling_quant(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = 
csi_ref_tensor_transform_f32(output); - csi_ref_unpooling_f32(finput, mask, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + shl_ref_unpooling_f32(finput, mask, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return CSINN_TRUE; } diff --git a/source/reference/unstack.c b/source/reference/unstack.c index 15dfa7df..9e7eb45e 100644 --- a/source/reference/unstack.c +++ b/source/reference/unstack.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unstack_f32(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params) +int shl_ref_unstack_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params) { int axis = params->axis; int output_count = input->dim[axis]; @@ -42,7 +41,7 @@ int csi_ref_unstack_f32(struct csi_tensor *input, struct csi_tensor **output, float *input_data = (float *)input->data; for (int i = 0; i < outer_size; i++) { for (int j = 0; j < output_count; j++) { - struct csi_tensor *output_item = output[j]; + struct csinn_tensor *output_item = output[j]; float *output_item_data = (float *)output_item->data; float *output_ptr = output_item_data + i * copy_size; memcpy(output_ptr, input_data, copy_size * sizeof(float)); @@ -52,27 +51,27 @@ int csi_ref_unstack_f32(struct csi_tensor *input, struct csi_tensor **output, return CSINN_TRUE; } -int csi_ref_unstack_qunat(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params) +int shl_ref_unstack_qunat(struct csinn_tensor *input, 
struct csinn_tensor **output, + struct csinn_unstack_params *params) { int ret; int axis = params->axis; int output_count = input->dim[axis]; - struct csi_tensor *foutput[output_count]; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput[output_count]; + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); for (int i = 0; i < output_count; i++) { - foutput[i] = csi_ref_tensor_transform_f32(output[i]); + foutput[i] = shl_ref_tensor_transform_f32(output[i]); } - ret = csi_ref_unstack_f32(finput, foutput, params); + ret = shl_ref_unstack_f32(finput, foutput, params); for (int i = 0; i < output_count; i++) { - csi_tensor_data_convert(output[i], foutput[i]); + csinn_tensor_data_convert(output[i], foutput[i]); } - csi_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(finput); for (int i = 0; i < output_count; i++) { - csi_ref_tensor_transform_free_f32(foutput[i]); + shl_ref_tensor_transform_free_f32(foutput[i]); } return ret; } diff --git a/source/reference/utils.c b/source/reference/utils.c index a010c2e7..20b64abb 100644 --- a/source/reference/utils.c +++ b/source/reference/utils.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int32_t csi_ref_max_internal_s32(int32_t a, int32_t b) +int32_t shl_ref_max_internal_s32(int32_t a, int32_t b) { if (a >= b) { return a; @@ -32,7 +31,7 @@ int32_t csi_ref_max_internal_s32(int32_t a, int32_t b) } } -int32_t csi_ref_min_internal_s32(int32_t a, int32_t b) +int32_t shl_ref_min_internal_s32(int32_t a, int32_t b) { if (a <= b) { return a; @@ -41,24 +40,24 @@ int32_t csi_ref_min_internal_s32(int32_t a, int32_t b) } } -int32_t csi_ref_get_index(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, +int32_t shl_ref_get_index(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, int32_t index3) { return ((index0 * dim[1] + index1) * dim[2] + index2) * dim[3] + index3; } -int32_t csi_ref_get_index_5(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, +int32_t shl_ref_get_index_5(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, int32_t index3, int32_t index4) { return dim[4] * (dim[3] * (dim[2] * (dim[1] * index0 + index1) + index2) + index3) + index4; } /* iteration to calculate index */ -int32_t csi_ref_get_index_iter(int32_t *dim, int dim_idx, int32_t *index) +int32_t shl_ref_get_index_iter(int32_t *dim, int dim_idx, int32_t *index) { int32_t ret; if (dim_idx > 0) { - ret = csi_ref_get_index_iter(dim, dim_idx - 1, index) * dim[dim_idx] + index[dim_idx]; + ret = shl_ref_get_index_iter(dim, dim_idx - 1, index) * dim[dim_idx] + index[dim_idx]; } else { ret = index[dim_idx]; } @@ -66,11 +65,11 @@ int32_t csi_ref_get_index_iter(int32_t *dim, int dim_idx, int32_t *index) return ret; } -int32_t *csi_ref_get_input_dim(struct csi_tensor *input, int dim_count, int32_t *axis, +int32_t *shl_ref_get_input_dim(struct csinn_tensor *input, int dim_count, int32_t *axis, int axis_size) { int8_t alloc_size = dim_count * sizeof(int32_t *); - int32_t *ret = csi_mem_alloc(alloc_size); + int32_t *ret = 
shl_mem_alloc(alloc_size); for (int i = 0; i < dim_count; i++) { ret[i] = 1; @@ -83,21 +82,9 @@ int32_t *csi_ref_get_input_dim(struct csi_tensor *input, int dim_count, int32_t return ret; } -int csi_check_rhs_shape(struct csi_tensor *input) -{ - int axis = -1; - int in_size = csi_tensor_size(input); - for (int i = 0; i < input->dim_count; i++) { - if (input->dim[i] == in_size) { - axis = i; - } - } - return axis; -} - -int csi_ref_diso_broadcast_base(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params, - struct csi_ref_diso_callback *cb) +int shl_ref_diso_broadcast_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct shl_ref_diso_callback *cb) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -105,30 +92,30 @@ int csi_ref_diso_broadcast_base(struct csi_tensor *input0, struct csi_tensor *in cb->output = output; - int out_size = csi_tensor_size(output); - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); + int out_size = csinn_tensor_size(output); + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - if (csi_ref_broadcast_to_shape(input0, b_input0, output->dim, output->dim_count) == + if (shl_ref_broadcast_to_shape(input0, b_input0, output->dim, output->dim_count) == CSINN_FALSE) { - CSI_DEBUG_CALL(csi_debug_info("%s: broadcast input0 failed.", __func__)); + 
SHL_DEBUG_CALL(shl_debug_info("%s: broadcast input0 failed.", __func__)); return CSINN_FALSE; }; - if (csi_ref_broadcast_to_shape(input1, b_input1, output->dim, output->dim_count) == + if (shl_ref_broadcast_to_shape(input1, b_input1, output->dim, output->dim_count) == CSINN_FALSE) { - CSI_DEBUG_CALL(csi_debug_info("%s: broadcast input1 failed.", __func__)); + SHL_DEBUG_CALL(shl_debug_info("%s: broadcast input1 failed.", __func__)); return CSINN_FALSE; }; - int size0 = csi_tensor_size(b_input0); - int size1 = csi_tensor_size(b_input1); + int size0 = csinn_tensor_size(b_input0); + int size1 = csinn_tensor_size(b_input1); if (size0 == size1) { for (int i = 0; i < size0; i++) { @@ -137,12 +124,12 @@ int csi_ref_diso_broadcast_base(struct csi_tensor *input0, struct csi_tensor *in } else { return CSINN_FALSE; } - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); return CSINN_TRUE; } -float csi_ref_get_scale(int32_t multiplier, int32_t shift) +float shl_ref_get_scale(int32_t multiplier, int32_t shift) { float scale = multiplier / pow(2, 31) * pow(2, shift); @@ -178,57 +165,58 @@ static int32_t high_mul_sat_round_double(int32_t a, int32_t b) return overflow ? 
INT32_MAX : ab_x2_high32; } -uint8_t csi_ref_quantize_channel_u8(int32_t data, struct csi_tensor *input, - struct csi_tensor *output, float wscale) +uint8_t shl_ref_quantize_channel_u8(int32_t data, struct csinn_tensor *input, + struct csinn_tensor *output, float wscale) { float out = data * input->qinfo->scale * wscale; - return csi_ref_quantize_f32_to_u8(out, output->qinfo); + return shl_ref_quantize_f32_to_u8(out, output->qinfo); } -int8_t csi_ref_quantize_channel_i8(int32_t data, struct csi_tensor *input, - struct csi_tensor *output, float wscale) +int8_t shl_ref_quantize_channel_i8(int32_t data, struct csinn_tensor *input, + struct csinn_tensor *output, float wscale) { float out = data * input->qinfo->scale * wscale; - return csi_ref_quantize_f32_to_i8(out, output->qinfo); + return shl_ref_quantize_f32_to_i8(out, output->qinfo); } -float csi_ref_dequantize_u8_to_f32(uint8_t input, struct csi_quant_info *qinfo) +float shl_ref_dequantize_u8_to_f32(uint8_t input, struct csinn_quant_info *qinfo) { float x = input; x -= qinfo->zero_point; - float scale = csi_ref_get_scale(qinfo->multiplier, qinfo->shift); + float scale = shl_ref_get_scale(qinfo->multiplier, qinfo->shift); return x * scale; } -float csi_ref_dequantize_i8_to_f32(int8_t input, struct csi_quant_info *qinfo) +float shl_ref_dequantize_i8_to_f32(int8_t input, struct csinn_quant_info *qinfo) { float x = input; x -= qinfo->zero_point; - float scale = csi_ref_get_scale(qinfo->multiplier, qinfo->shift); + float scale = shl_ref_get_scale(qinfo->multiplier, qinfo->shift); return x * scale; } -uint8_t csi_ref_quantize_f32_to_u8(float input, struct csi_quant_info *qinfo) +uint8_t shl_ref_quantize_f32_to_u8(float input, struct csinn_quant_info *qinfo) { - float scale = csi_ref_get_scale(qinfo->multiplier, qinfo->shift); + float scale = shl_ref_get_scale(qinfo->multiplier, qinfo->shift); float output = round(input / scale + qinfo->zero_point); return fmin(255, fmax(0, output)); } -int8_t 
csi_ref_quantize_f32_to_i8(float input, struct csi_quant_info *qinfo) +int8_t shl_ref_quantize_f32_to_i8(float input, struct csinn_quant_info *qinfo) { - float scale = csi_ref_get_scale(qinfo->multiplier, qinfo->shift); + float scale = shl_ref_get_scale(qinfo->multiplier, qinfo->shift); float output = round(input / scale + qinfo->zero_point); return fmin(127, fmax(-127, output)); } -struct csi_tensor *csi_ref_deconv_kernel_nchw_to_nhwc_f32(struct csi_tensor *t, int32_t *permute) +struct csinn_tensor *shl_ref_deconv_kernel_nchw_to_nhwc_f32(struct csinn_tensor *t, + int32_t *permute) { - struct csi_tensor *nt = csi_alloc_tensor(NULL); + struct csinn_tensor *nt = csinn_alloc_tensor(NULL); assert(t->dim_count < 5); - int size = csi_tensor_byte_size(t); + int size = csinn_tensor_byte_size(t); for (int i = t->dim_count; i < 4; i++) { t->dim[i] = 1; @@ -237,26 +225,26 @@ struct csi_tensor *csi_ref_deconv_kernel_nchw_to_nhwc_f32(struct csi_tensor *t, int t_dim = t->dim_count; t->dim_count = 4; t->quant_channel = 0; - csi_tensor_copy(nt, t); + csinn_tensor_copy(nt, t); nt->dim[0] = t->dim[permute[0]]; nt->dim[1] = t->dim[permute[1]]; nt->dim[2] = t->dim[permute[2]]; nt->dim[3] = t->dim[permute[3]]; - nt->data = csi_mem_alloc(size); + nt->data = shl_mem_alloc(size); - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.base.api = CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); t->dim_count = t_dim; return nt; } -struct csi_tensor *csi_ref_nchw_to_nhwc_8(struct csi_tensor *t) +struct csinn_tensor *shl_ref_nchw_to_nhwc_8(struct csinn_tensor *t) { - struct csi_tensor *nt = csi_alloc_tensor(NULL); + struct csinn_tensor *nt = csinn_alloc_tensor(NULL); assert(t->dim_count < 5); @@ -271,24 +259,24 @@ struct csi_tensor *csi_ref_nchw_to_nhwc_8(struct csi_tensor *t) int t_dim = t->dim_count; t->dim_count = 4; - csi_tensor_copy(nt, t); + 
csinn_tensor_copy(nt, t); nt->dim[1] = t->dim[2]; nt->dim[2] = t->dim[3]; nt->dim[3] = t->dim[1]; - nt->data = csi_mem_alloc(size); + nt->data = shl_mem_alloc(size); int32_t permute[4] = {0, 2, 3, 1}; - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.base.api = CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); t->dim_count = t_dim; return nt; } -void csi_ref_nhwc_to_nchw_8(struct csi_tensor *nt, struct csi_tensor *t) +void shl_ref_nhwc_to_nchw_8(struct csinn_tensor *nt, struct csinn_tensor *t) { nt->dim[1] = t->dim[3]; nt->dim[2] = t->dim[1]; @@ -299,21 +287,21 @@ void csi_ref_nhwc_to_nchw_8(struct csi_tensor *nt, struct csi_tensor *t) int32_t permute[4] = {0, 3, 1, 2}; - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.base.api = CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); nt->dim_count = nt_dim; - csi_mem_free(t->data); - csi_mem_free(t); + shl_mem_free(t->data); + shl_mem_free(t); } -struct csi_tensor *csi_ref_nchw_to_nhwc_f32(struct csi_tensor *t) +struct csinn_tensor *shl_ref_nchw_to_nhwc_f32(struct csinn_tensor *t) { - struct csi_tensor *nt = csi_alloc_tensor(NULL); + struct csinn_tensor *nt = csinn_alloc_tensor(NULL); assert(t->dim_count < 5); @@ -329,25 +317,25 @@ struct csi_tensor *csi_ref_nchw_to_nhwc_f32(struct csi_tensor *t) int t_dim = t->dim_count; t->dim_count = 4; t->quant_channel = 0; - csi_tensor_copy(nt, t); + csinn_tensor_copy(nt, t); nt->dim[1] = t->dim[2]; nt->dim[2] = t->dim[3]; nt->dim[3] = t->dim[1]; - nt->data = csi_mem_alloc(size * 4); + nt->data = shl_mem_alloc(size * 4); int32_t permute[4] = {0, 2, 3, 1}; - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.permute_num = 4; tparams.base.api = 
CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); t->dim_count = t_dim; return nt; } -void csi_ref_nhwc_to_nchw_f32(struct csi_tensor *nt, struct csi_tensor *t) +void shl_ref_nhwc_to_nchw_f32(struct csinn_tensor *nt, struct csinn_tensor *t) { nt->dim[1] = t->dim[3]; nt->dim[2] = t->dim[1]; @@ -358,24 +346,24 @@ void csi_ref_nhwc_to_nchw_f32(struct csi_tensor *nt, struct csi_tensor *t) int32_t permute[4] = {0, 3, 1, 2}; - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.permute_num = 4; tparams.base.api = CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); nt->dim_count = nt_dim; if (t->qinfo != NULL) { - csi_mem_free(t->qinfo); + shl_mem_free(t->qinfo); t->qinfo = NULL; } - csi_mem_free(t->data); - csi_mem_free(t); + shl_mem_free(t->data); + shl_mem_free(t); } -int32_t csi_ref_get_reduction_index(int32_t k, const int32_t *strides, const int32_t *extents, +int32_t shl_ref_get_reduction_index(int32_t k, const int32_t *strides, const int32_t *extents, int32_t n) { int32_t index = 0; @@ -392,17 +380,17 @@ int32_t csi_ref_get_reduction_index(int32_t k, const int32_t *strides, const int return index; } -float csi_ref_uint8_to_float(uint8_t i, struct csi_tensor *t) +float shl_ref_uint8_to_float(uint8_t i, struct csinn_tensor *t) { return ((float)i - t->qinfo->zero_point) * t->qinfo->scale; } -float csi_ref_int8_to_float(int8_t i, struct csi_tensor *t) +float shl_ref_int8_to_float(int8_t i, struct csinn_tensor *t) { return ((float)i - t->qinfo->zero_point) * t->qinfo->scale; } -int16_t csi_ref_float32_to_float16(float value) +int16_t shl_ref_float32_to_float16(float value) { int16_t ret; if (value > -6.1e-5 && value < 6.1e-5) { @@ -410,7 +398,7 @@ int16_t csi_ref_float32_to_float16(float value) return 0; } if (value > 65504) { - csi_debug_error("too large f32 
to f16\n"); + shl_debug_error("too large f32 to f16\n"); /* saturate to f16 max value: 65504 */ value = 65504; } @@ -422,7 +410,7 @@ int16_t csi_ref_float32_to_float16(float value) return ret; } -float csi_ref_float16_to_float32(int16_t value) +float shl_ref_float16_to_float32(int16_t value) { float ret; if (value == 0 || value == 0x8000) { @@ -437,7 +425,7 @@ float csi_ref_float16_to_float32(int16_t value) return ret; } -int16_t csi_ref_float32_to_bfloat16(float value) +int16_t shl_ref_float32_to_bfloat16(float value) { int16_t ret; int32_t org_format = *(int32_t *)&value; @@ -445,7 +433,7 @@ int16_t csi_ref_float32_to_bfloat16(float value) return ret; } -float csi_ref_bfloat16_to_float32(int16_t value) +float shl_ref_bfloat16_to_float32(int16_t value) { float ret; int32_t ret_format = value << 16; @@ -454,38 +442,38 @@ float csi_ref_bfloat16_to_float32(int16_t value) return ret; } -struct csi_tensor *csi_ref_alloc_float_tensor(struct csi_tensor *src) +struct csinn_tensor *shl_ref_alloc_float_tensor(struct csinn_tensor *src) { - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, src); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, src); ret->dtype = CSINN_DTYPE_FLOAT32; - int size = csi_tensor_byte_size(ret); - float *data = csi_mem_alloc(size); + int size = csinn_tensor_byte_size(ret); + float *data = shl_mem_alloc(size); ret->data = data; return ret; } -void csi_ref_free_float_tensor(struct csi_tensor *src) +void shl_ref_free_float_tensor(struct csinn_tensor *src) { - csi_mem_free(src->data); - csi_free_tensor(src); + shl_mem_free(src->data); + csinn_free_tensor(src); } -struct csi_tensor *csi_ref_convert_float_tensor(struct csi_tensor *src) +struct csinn_tensor *shl_ref_convert_float_tensor(struct csinn_tensor *src) { - struct csi_tensor *ret = csi_ref_alloc_float_tensor(src); - int size = csi_tensor_size(src); + struct csinn_tensor *ret = shl_ref_alloc_float_tensor(src); + int size = csinn_tensor_size(src); 
float *float_data = ret->data; if (src->dtype == CSINN_DTYPE_UINT8) { uint8_t *input_data = src->data; for (int i = 0; i < size; i++) { - float_data[i] = csi_ref_uint8_to_float(input_data[i], src); + float_data[i] = shl_ref_uint8_to_float(input_data[i], src); } } else if (src->dtype == CSINN_DTYPE_INT8) { int8_t *input_data = src->data; for (int i = 0; i < size; i++) { - float_data[i] = csi_ref_int8_to_float(input_data[i], src); + float_data[i] = shl_ref_int8_to_float(input_data[i], src); } } else { return NULL; @@ -494,21 +482,21 @@ struct csi_tensor *csi_ref_convert_float_tensor(struct csi_tensor *src) return ret; } -void csi_ref_conv_free_float_tensor(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias) +void shl_ref_conv_free_float_tensor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias) { - csi_ref_free_float_tensor(input); - csi_ref_free_float_tensor(output); - csi_ref_free_float_tensor(kernel); - csi_ref_free_float_tensor(bias); + shl_ref_free_float_tensor(input); + shl_ref_free_float_tensor(output); + shl_ref_free_float_tensor(kernel); + shl_ref_free_float_tensor(bias); } -struct csi_tensor *csi_ref_tensor_transform_f32(struct csi_tensor *input) +struct csinn_tensor *shl_ref_tensor_transform_f32(struct csinn_tensor *input) { - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, input); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, input); if (ret->qinfo != NULL) { - csi_mem_free(ret->qinfo); + shl_mem_free(ret->qinfo); ret->qinfo = NULL; } ret->quant_channel = 0; @@ -516,97 +504,97 @@ struct csi_tensor *csi_ref_tensor_transform_f32(struct csi_tensor *input) if (ret->dim_count == 0) { return ret; } - ret->data = csi_mem_alloc(csi_tensor_size(input) * 4); - if (csi_tensor_data_convert(ret, input) == CSINN_TRUE) { + ret->data = shl_mem_alloc(csinn_tensor_size(input) * 4); + if 
(csinn_tensor_data_convert(ret, input) == CSINN_TRUE) { return ret; } return NULL; } -int csi_ref_tensor_transform_free_f32(struct csi_tensor *input) +int shl_ref_tensor_transform_free_f32(struct csinn_tensor *input) { - csi_mem_free(input->data); - csi_free_tensor(input); + shl_mem_free(input->data); + csinn_free_tensor(input); return CSINN_TRUE; } -int csi_ref_siso_callback_base(struct csi_tensor *input, struct csi_tensor *output, void *params, - void *cb) +int shl_ref_siso_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + void *params, void *cb) { int (*callback)() = cb; int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); ret = callback(finput, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_diso_callback_base(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, void *params, void *cb) +int shl_ref_diso_callback_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *params, void *cb) { int (*callback)() = cb; int ret; - struct csi_tensor *finput0 = csi_ref_tensor_transform_f32(input0); - struct csi_tensor *finput1 = csi_ref_tensor_transform_f32(input1); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *finput0 = shl_ref_tensor_transform_f32(input0); + struct csinn_tensor *finput1 = shl_ref_tensor_transform_f32(input1); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); ret = callback(finput0, finput1, foutput, 
params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput0); - csi_ref_tensor_transform_free_f32(finput1); - csi_ref_tensor_transform_free_f32(foutput); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput0); + shl_ref_tensor_transform_free_f32(finput1); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_conv_callback_base(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, void *params, +int shl_ref_conv_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, void *params, void *cb) { int (*callback)() = cb; - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_kernel = csi_ref_tensor_transform_f32(kernel); - struct csi_tensor *float_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_kernel = shl_ref_tensor_transform_f32(kernel); + struct csinn_tensor *float_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); int ret = callback(float_input, float_output, float_kernel, float_bias, params); - csi_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_kernel); - csi_ref_tensor_transform_free_f32(float_bias); + csinn_tensor_data_convert(output, float_output); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_kernel); + shl_ref_tensor_transform_free_f32(float_bias); return ret; } -uint8_t *csi_ref_f32_to_input_dtype(uint32_t index, float *data, struct 
csi_session *sess) +uint8_t *shl_ref_f32_to_input_dtype(uint32_t index, float *data, struct csinn_session *sess) { - struct csi_tensor *ftmp = csi_alloc_tensor(NULL); - csi_tensor_copy(ftmp, sess->input[index]); + struct csinn_tensor *ftmp = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ftmp, sess->input[index]); ftmp->data = data; ftmp->dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, sess->input[index]); - ret->data = csi_mem_alloc(csi_tensor_byte_size(ret)); - csi_tensor_data_convert(ret, ftmp); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, sess->input[index]); + ret->data = shl_mem_alloc(csinn_tensor_byte_size(ret)); + csinn_tensor_data_convert(ret, ftmp); uint8_t *ret_data = ret->data; - csi_free_tensor(ret); - csi_free_tensor(ftmp); + csinn_free_tensor(ret); + csinn_free_tensor(ftmp); return ret_data; } -int csi_ref_broadcast_to_shape(struct csi_tensor *input, struct csi_tensor *output, int32_t *shape, - int32_t shape_count) +int shl_ref_broadcast_to_shape(struct csinn_tensor *input, struct csinn_tensor *output, + int32_t *shape, int32_t shape_count) { int ret; if (input->dtype != CSINN_DTYPE_FLOAT32) { - ret = csi_ref_broadcast_to_shape_quant(input, output, shape, shape_count); + ret = shl_ref_broadcast_to_shape_quant(input, output, shape, shape_count); } else { - ret = csi_ref_broadcast_to_shape_f32(input, output, shape, shape_count); + ret = shl_ref_broadcast_to_shape_f32(input, output, shape, shape_count); } return ret; } -int csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor *output, +int shl_ref_broadcast_to_shape_f32(struct csinn_tensor *input, struct csinn_tensor *output, int32_t *shape, int32_t shape_count) { float *input_data = (float *)input->data; @@ -623,7 +611,7 @@ int csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor * for (int i = 0; i < in_shape_rank; i++) { if ((in_shape[in_shape_rank - i - 1] != 
target_shape[target_shape_rank - i - 1]) && (in_shape[in_shape_rank - i - 1] != 1)) { - csi_debug_error("The shapes of input and target do not meet the rules of broadcast!"); + shl_debug_error("The shapes of input and target do not meet the rules of broadcast!"); return CSINN_FALSE; } } @@ -642,9 +630,9 @@ int csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor * } in_shape = new_shape; - int data_size = csi_tensor_size(input); - int out_size = csi_tensor_size(output); - float *output_data_t = csi_mem_alloc(out_size * 4); + int data_size = csinn_tensor_size(input); + int out_size = csinn_tensor_size(output); + float *output_data_t = shl_mem_alloc(out_size * 4); memcpy(output_data_t, input_data, data_size * 4); memcpy(output_data, input_data, data_size * 4); @@ -684,18 +672,18 @@ int csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor * memcpy(output_data_t, output_data, out_size * 4); } } - csi_mem_free(output_data_t); + shl_mem_free(output_data_t); return CSINN_TRUE; } -int csi_ref_broadcast_to_shape_quant(struct csi_tensor *input, struct csi_tensor *output, +int shl_ref_broadcast_to_shape_quant(struct csinn_tensor *input, struct csinn_tensor *output, int32_t *shape, int32_t shape_count) { - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - int ret = csi_ref_broadcast_to_shape_f32(finput, foutput, shape, shape_count); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + int ret = shl_ref_broadcast_to_shape_f32(finput, foutput, shape, shape_count); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } \ No newline at end 
of file diff --git a/source/reference/xor.c b/source/reference/xor.c index 86e4a749..dba14ef2 100644 --- a/source/reference/xor.c +++ b/source/reference/xor.c @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_xor_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_xor_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint32_t *input0_data = input0->data; uint32_t *input1_data = input1->data; uint32_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] ^ input1_data[i]; @@ -34,13 +34,13 @@ int csi_ref_xor_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_xor_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_xor_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] ^ input1_data[i]; @@ -48,13 +48,13 @@ int csi_ref_xor_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_xor_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_xor_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = input0->data; int8_t *input1_data = 
input1->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] ^ input1_data[i]; diff --git a/source/reference/yuv_rgb_scale.c b/source/reference/yuv_rgb_scale.c index f19df80d..860e196f 100644 --- a/source/reference/yuv_rgb_scale.c +++ b/source/reference/yuv_rgb_scale.c @@ -16,16 +16,15 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" /* https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/image_ops_impl.py#L3279-L3325 * line 3279*/ -int csi_ref_yuv_rgb_scale_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_yuv_rgb_scale_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -53,8 +52,8 @@ int csi_ref_yuv_rgb_scale_f32(struct csi_tensor *input, struct csi_tensor *outpu return CSINN_TRUE; } -int csi_ref_yuv_rgb_scale_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_yuv_rgb_scale_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_yuv_rgb_scale_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_yuv_rgb_scale_f32); } diff --git a/source/thead_rvv/add.c b/source/thead_rvv/add.c index 0b10e9b0..9c89737e 100644 --- a/source/thead_rvv/add.c +++ b/source/thead_rvv/add.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -38,16 +38,16 @@ static void element_add_fp32(float *input0, float *input1, float *output, int si } } -int csi_nn_rvv_add_fp32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_rvv_add_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // example: [1, 3, 224, 224] + [1] = [1, 3, 224, 224] if (in_size1 == 1) { @@ -74,28 +74,28 @@ int csi_nn_rvv_add_fp32(struct csi_tensor *input0, struct csi_tensor *input1, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { - float *in0_data_b = csi_mem_alloc(out_size * sizeof(float)); - float *in1_data_b = csi_mem_alloc(out_size * sizeof(float)); + float *in0_data_b = shl_mem_alloc(out_size * sizeof(float)); + float *in1_data_b = shl_mem_alloc(out_size * sizeof(float)); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, 
output->dim, output->dim_count); - csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_add_fp32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or // [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] @@ -127,16 +127,16 @@ static void element_add_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int } } -int csi_nn_rvv_add_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_rvv_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // example: [1, 3, 224, 224] + [1] = [1, 3, 224, 224] if (in_size1 == 1) { @@ -163,28 +163,28 @@ int csi_nn_rvv_add_fp16(struct csi_tensor *input0, struct csi_tensor *input1, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { - __fp16 *in0_data_b = csi_mem_alloc(out_size * sizeof(__fp16)); - __fp16 *in1_data_b = csi_mem_alloc(out_size * sizeof(__fp16)); + __fp16 *in0_data_b = shl_mem_alloc(out_size * sizeof(__fp16)); + __fp16 
*in1_data_b = shl_mem_alloc(out_size * sizeof(__fp16)); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_add_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or // [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] @@ -253,22 +253,22 @@ static void element_add_int8(int8_t *input0, int8_t *input1, int8_t *output, int } } -int csi_nn_rvv_add_int8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_rvv_add_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = (int8_t *)input0->data; int8_t *input1_data = (int8_t *)input1->data; int8_t *output_data = (int8_t *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + 
int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // TODO: move to init api float real_scale0 = input0->qinfo->scale / output->qinfo->scale; float real_scale1 = input1->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale0, &input0->qinfo->multiplier, &input0->qinfo->shift); - csi_quantize_multiplier(real_scale1, &input1->qinfo->multiplier, &input1->qinfo->shift); + shl_quantize_multiplier(real_scale0, &input0->qinfo->multiplier, &input0->qinfo->shift); + shl_quantize_multiplier(real_scale1, &input1->qinfo->multiplier, &input1->qinfo->shift); if (in_size0 == in_size1) { element_add_int8(input0_data, input1_data, output_data, in_size0, input0->qinfo->multiplier, @@ -276,7 +276,7 @@ int csi_nn_rvv_add_int8(struct csi_tensor *input0, struct csi_tensor *input1, input0->qinfo->zero_point, input1->qinfo->zero_point, output->qinfo->zero_point); } else { - csi_debug_error("Only support elementwise add on RVV CPU\n"); + shl_debug_error("Only support elementwise add on RVV CPU\n"); } return CSINN_TRUE; diff --git a/source/thead_rvv/avgpool.c b/source/thead_rvv/avgpool.c index eeab563a..37350445 100644 --- a/source/thead_rvv/avgpool.c +++ b/source/thead_rvv/avgpool.c @@ -16,37 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - int32_t input_h = input->dim[2]; - int32_t input_w = input->dim[3]; - + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; int32_t kernel_h = params->filter_height; int32_t kernel_w = params->filter_width; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t pad_left = params->pad_left; int32_t pad_right = params->pad_right; int32_t pad_top = params->pad_top; int32_t pad_down = params->pad_down; - params->base.bc = NULL; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(float); // global avgpool2d - if (input_h == kernel_h && input_w == kernel_w) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_global_avgpool2d_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_global_avgpool2d_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_ref_avgpool2d_quant; - } + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp32 + : shl_rvv_global_avgpool2d_fp32; return CSINN_TRUE; } @@ -54,74 +51,194 @@ int csi_nn_rvv_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *outpu if (kernel_h == 2 && kernel_w == 2) { if (pad_left == 0 && pad_top == 0) { // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
- if (input_h % 2 == 1 && params->ceil_mode == 1) { + if (in_h % 2 == 1 && params->ceil_mode == 1) { if (params->pad_down) params->pad_down++; } - if (input_w % 2 == 1 && params->ceil_mode == 1) { + if (in_w % 2 == 1 && params->ceil_mode == 1) { if (params->pad_right) params->pad_right++; } // end consider ceil_mode 2x2s2p0 - - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool2x2s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool2x2s2_fp16; - } + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp32 + : shl_rvv_avgpool2x2s2_fp32; } else if (pad_left == 1 && pad_top == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool2x2s2_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool2x2s2_p1_fp16; - } + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp32 + : shl_rvv_avgpool2x2s2_p1_fp32; } } else if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 0 && pad_top == 0) { // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (input_h % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_down) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; // origin pad_down mast be equal to zero ? } - if (input_w % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_right) params->pad_right++; + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; } // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp32 + : shl_rvv_avgpool3x3s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_avgpool3x3s2_packn_fp32 + : shl_rvv_avgpool3x3s2_p1_fp32; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s1_packn_fp32 + : shl_rvv_avgpool3x3s1_p1_fp32; + } + } + } + + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on rvv, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_f32; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_rvv_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool3x3s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool3x3s2_fp16; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp16 + : shl_rvv_global_avgpool2d_fp16; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
+ if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right) params->pad_right++; } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp16 + : shl_rvv_avgpool2x2s2_fp16; } else if (pad_left == 1 && pad_top == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool3x3s2_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool3x3s2_p1_fp16; + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp16 + : shl_rvv_avgpool2x2s2_p1_fp16; + } + } else if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp16 + : shl_rvv_avgpool3x3s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp16 + : shl_rvv_avgpool3x3s2_p1_fp16; } } } else if (stride_h == 1 && stride_w == 1) { if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool3x3s1_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool3x3s1_p1_fp16; - } + cb->exec = (in_c % packn == 0) ? 
shl_rvv_avgpool3x3s1_packn_fp16 + : shl_rvv_avgpool3x3s1_p1_fp16; } } } - if (params->base.bc == NULL) { - csi_debug_warning( - "avgpool is not optimized to achieve under this condition on RVV, call reference func " + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on rvv, call reference func " "replaced.\n"); - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_avgpool2d_f32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_avgpool2d_quant; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_ref_avgpool2d_quant; - } + cb->exec = shl_ref_avgpool2d_quant; // fixme: consider ncxhwx } return CSINN_TRUE; } + +int shl_rvv_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_avgpool2d_packn_int8 + : shl_ref_global_avgpool2d_quant; + return CSINN_TRUE; + } + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on rvv, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // fixme: consider ncxhwx + } +} + +int shl_rvv_avgpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + return CSINN_FALSE; +} + +int shl_rvv_global_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + int packn = 0; + + if (input->dtype == CSINN_DTYPE_FLOAT32) { + packn = csrr_vlenb() / sizeof(float); + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp32 + : shl_rvv_global_avgpool2d_fp32; + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + packn = csrr_vlenb() / sizeof(__fp16); + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp16 + : shl_rvv_global_avgpool2d_fp16; + } else if (input->dtype == CSINN_DTYPE_INT8) { + packn = csrr_vlenb() / sizeof(int8_t) / 2; + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_int8 + : shl_ref_global_avgpool2d_quant; + } +} diff --git a/source/thead_rvv/avgpool_2x2_fp16.c b/source/thead_rvv/avgpool_2x2_fp16.c index f9d34264..1298050e 100644 --- a/source/thead_rvv/avgpool_2x2_fp16.c +++ b/source/thead_rvv/avgpool_2x2_fp16.c @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_avgpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -121,8 +121,8 @@ int csi_nn_rvv_avgpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_avgpool2x2s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/avgpool_2x2_fp16_packn.c b/source/thead_rvv/avgpool_2x2_fp16_packn.c new file mode 100644 index 00000000..5d15724c --- /dev/null +++ b/source/thead_rvv/avgpool_2x2_fp16_packn.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +// TODO: consider params->count_include_pad +int shl_rvv_avgpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _acc = vle16_v_f16m1(line0, vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn, vl), vl); + vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 0.25f, vl); + vse16_v_f16m1(out0, _avg, vl); 
+ + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/avgpool_2x2.c b/source/thead_rvv/avgpool_2x2_fp32.c similarity index 96% rename from source/thead_rvv/avgpool_2x2.c rename to source/thead_rvv/avgpool_2x2_fp32.c index 6919a5bc..2e581c98 100644 --- a/source/thead_rvv/avgpool_2x2.c +++ b/source/thead_rvv/avgpool_2x2_fp32.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -28,8 +28,8 @@ pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_avgpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -131,8 +131,8 @@ int csi_nn_rvv_avgpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *ou pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_avgpool2x2s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; diff --git a/source/thead_rvv/avgpool_2x2_fp32_packn.c b/source/thead_rvv/avgpool_2x2_fp32_packn.c new file mode 100644 index 00000000..5aaabfbc --- /dev/null +++ b/source/thead_rvv/avgpool_2x2_fp32_packn.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +// TODO: consider params->count_include_pad +int shl_rvv_avgpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += 
packn) { + float *out0 = output_data + c * out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _acc = vle32_v_f32m1(line0, vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn, vl), vl); + vfloat32m1_t _avg = vfmul_vf_f32m1(_acc, 0.25f, vl); + vse32_v_f32m1(out0, _avg, vl); + + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/avgpool_3x3_fp16.c b/source/thead_rvv/avgpool_3x3_fp16.c index bbe72fe0..2819a7cc 100644 --- a/source/thead_rvv/avgpool_3x3_fp16.c +++ b/source/thead_rvv/avgpool_3x3_fp16.c @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_avgpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -153,8 +153,8 @@ int csi_nn_rvv_avgpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_avgpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -350,8 +350,8 @@ int csi_nn_rvv_avgpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_nn_rvv_avgpool3x3s1_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/avgpool_3x3_fp16_packn.c b/source/thead_rvv/avgpool_3x3_fp16_packn.c new file mode 100644 index 00000000..72ce53c2 --- /dev/null +++ b/source/thead_rvv/avgpool_3x3_fp16_packn.c @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_avgpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data 
+ c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + const __fp16 *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _acc = vle16_v_f16m1(line0, vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn * 2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn * 2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2 + packn * 2, vl), vl); + vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 0.11111111f, vl); + vse16_v_f16m1(out0, _avg, vl); + + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + line2 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} + +int shl_rvv_avgpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = 
vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + const __fp16 *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _acc = vle16_v_f16m1(line0, vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn * 2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn * 2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2 + packn * 2, vl), vl); + vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 0.11111111f, vl); + vse16_v_f16m1(out0, _avg, vl); + + line0 += packn * 1; + line1 += packn * 1; + line2 += packn * 1; + out0 += packn; + } + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/avgpool_3x3.c b/source/thead_rvv/avgpool_3x3_fp32.c similarity index 97% rename from source/thead_rvv/avgpool_3x3.c rename to source/thead_rvv/avgpool_3x3_fp32.c index 0dbf61d3..044d4196 100644 --- a/source/thead_rvv/avgpool_3x3.c +++ b/source/thead_rvv/avgpool_3x3_fp32.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -28,8 +28,8 @@ pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_avgpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -163,8 +163,8 @@ int csi_nn_rvv_avgpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *ou pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_avgpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -364,8 +364,8 @@ int csi_nn_rvv_avgpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor pad_left = pad_right = pad_top = pad_down = 1 in_w = out_w in_h = out_h */ -int csi_nn_rvv_avgpool3x3s1_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; diff --git a/source/thead_rvv/avgpool_3x3_fp32_packn.c b/source/thead_rvv/avgpool_3x3_fp32_packn.c new file mode 100644 index 00000000..3d9ad58c --- /dev/null +++ b/source/thead_rvv/avgpool_3x3_fp32_packn.c @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_avgpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * 
out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + const float *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _acc = vle32_v_f32m1(line0, vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn * 2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn * 2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2 + packn * 2, vl), vl); + vfloat32m1_t _avg = vfmul_vf_f32m1(_acc, 0.11111111f, vl); + vse32_v_f32m1(out0, _avg, vl); + + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + line2 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} + +int shl_rvv_avgpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + 
float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + const float *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _acc = vle32_v_f32m1(line0, vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn * 2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn * 2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2 + packn * 2, vl), vl); + vfloat32m1_t _avg = vfmul_vf_f32m1(_acc, 0.11111111f, vl); + vse32_v_f32m1(out0, _avg, vl); + + line0 += packn * 1; + line1 += packn * 1; + line2 += packn * 1; + out0 += packn; + } + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/concat.c b/source/thead_rvv/concat.c index 5675d87d..80b4ab15 100644 --- a/source/thead_rvv/concat.c +++ b/source/thead_rvv/concat.c @@ -16,11 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" -int csi_nn_rvv_concat_fp32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params) +int shl_rvv_concat_fp32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -35,7 +35,7 @@ int csi_nn_rvv_concat_fp32(struct csi_tensor **input, struct csi_tensor *output, float *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; float *input_item_data = input_item->data; int copy_size = input_item->dim[params->axis] * base_inner_size; const float *input_ptr = input_item_data + k * copy_size; @@ -52,8 +52,8 @@ int csi_nn_rvv_concat_fp32(struct csi_tensor **input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_nn_rvv_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params) +int shl_rvv_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -68,7 +68,7 @@ int csi_nn_rvv_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, __fp16 *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; __fp16 *input_item_data = input_item->data; int copy_size = input_item->dim[params->axis] * base_inner_size; const __fp16 *input_ptr = input_item_data + k * copy_size; @@ -85,8 +85,8 @@ int csi_nn_rvv_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_nn_rvv_concat_int8(struct csi_tensor **input, struct csi_tensor 
*output, - struct concat_params *params) +int shl_rvv_concat_int8(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -100,7 +100,7 @@ int csi_nn_rvv_concat_int8(struct csi_tensor **input, struct csi_tensor *output, int8_t *output_ptr = (int8_t *)output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; int8_t *input_item_data = (int8_t *)input_item->data; int copy_size = input_item->dim[params->axis] * base_inner_size; const int8_t *input_ptr = input_item_data + k * copy_size; diff --git a/source/thead_rvv/convolution.c b/source/thead_rvv/convolution.c index 098f88ce..11be1b98 100644 --- a/source/thead_rvv/convolution.c +++ b/source/thead_rvv/convolution.c @@ -16,19 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/* - only support layout:NCHW - input layout: N C H W - kernel layout: O I h w - output layout: N O H W -*/ -int csi_nn_rvv_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t out_c = kernel->dim[0]; int32_t in_c = kernel->dim[1]; @@ -40,179 +34,385 @@ int csi_nn_rvv_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, int32_t stride_w = params->stride_width; int32_t dalition_h = params->dilation_height; int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(float); - // check - int out_height = (in_h + params->pad_top + 
params->pad_down - kernel_h) / stride_h + 1; - int out_width = (in_w + params->pad_left + params->pad_right - kernel_w) / stride_w + 1; - if (out_height != output->dim[2] || out_width != output->dim[3]) { - printf("output dim don't match.\n"); - return CSINN_FALSE; + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packn_fp32; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_fp32; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_rvv_wg_b4f3s1_packn_fp32; + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_rvv_wg_b6f3s1_packn_fp32; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_fp32; + } } - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32(kernel, params); - params->base.bc = csi_nn_rvv_conv1x1s1_gemm_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - 
csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_nn_rvv_conv1x1s1_gemm_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { -#ifdef __riscv_xtheadv - params->conv_extra.kernel_tm = csi_alloc_tensor(NULL); - csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int8(kernel, params); - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); - } - params->base.bc = csi_nn_rvv_conv1x1s1_gemm_int8; -#endif + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_fp32; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_fp32; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_fp32; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packnto1_fp32; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_fp32; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_fp32; } - // winograd convolution 
condition: - } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { + } + return CSINN_TRUE; +} + +int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packn_fp16; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp32; + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_fp16; return CSINN_TRUE; - } - - // pack4 for winograd convolution - if ((out_c % 4 == 0) && (in_c % 4 == 0)) { + } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp32(kernel, t_kernel); + struct csinn_tensor *t_kernel = 
csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_rvv_wg_b4f3s1_packn_fp16; + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_rvv_wg_b6f3s1_packn_fp16; + } params->conv_extra.kernel_tm = t_kernel; - params->base.bc = csi_nn_rvv_conv3x3s1_winograd64_packn_fp32; - } else { - params->conv_extra.conv_mode = CSINN_GEMM; - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp32; } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_fp16; + } + } + + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_fp16; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_fp16; + } + } - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_fp16; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packnto1_fp16; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + 
dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_fp16; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_fp16; + } + } + return CSINN_TRUE; +} + +int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef XTHEADV + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packn_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp16; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_int8; return CSINN_TRUE; - } - - // pack8 for winograd convolution - if ((out_c % 8 == 0) && (in_c % 8 == 0)) { + } else { 
params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(kernel, t_kernel); + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(kernel, t_kernel); + cb->exec = shl_rvv_wg_b4f3s1_packn_int8; params->conv_extra.kernel_tm = t_kernel; - params->base.bc = csi_nn_rvv_conv3x3s1_winograd64_packn_fp16; - } else { - params->conv_extra.conv_mode = CSINN_GEMM; - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp16; } - } else if (input->dtype == CSINN_DTYPE_INT8) { -#ifdef __riscv_xtheadv + } else { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csi_alloc_tensor(NULL); - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(kernel, params); - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); - } - params->base.bc = csi_nn_rvv_conv_im2col_gemm_int8; -#endif + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_int8; } + } - } else { + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { -#ifdef __riscv_xtheadv - params->conv_extra.conv_mode = CSINN_GEMM; 
- params->conv_extra.kernel_tm = csi_alloc_tensor(NULL); - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(kernel, params); - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_int8; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_int8; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_int8; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packnto1_int8; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_int8; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_int8; + } + } + + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + 
float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + // trick for winograd b4f3 + if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { + real_scale = real_scale / 576.0f; + } + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + + // enable fuse zeropoint to bias for gemm + if (params->conv_extra.conv_mode == CSINN_GEMM) { + if (!params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + bias->data = bias_data; + } + int kernel_inner = in_c * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner + j] * input_zp; + } + bias_data[oc] -= tmp; + } + } + } + + // recover fuse zeropoint to bias for winograd + if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { + if (params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + int kernel_inner = in_c * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner + j] * input_zp; + } + bias_data[oc] += tmp; } - params->base.bc = csi_nn_rvv_conv_im2col_gemm_int8; -#endif } } return CSINN_TRUE; +#else + shl_debug_error("unsupport conv2d for int8 without xtheadv extension\n"); + return CSINN_FALSE; +#endif } -int csi_nn_rvv_depthwise_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - int32_t batch = input->dim[0]; - int32_t in_ch = input->dim[1]; +#ifdef XTHEADV + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; int32_t in_h = input->dim[2]; int32_t in_w = input->dim[3]; - - int32_t out_ch = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - int32_t kernel_h = kernel->dim[2]; int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; - if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_dwconv3x3s1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_dwconv3x3s1_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); + // xxx: only int4 support nhwc layout now + if (input->layout == CSINN_LAYOUT_NHWC) { + out_c = kernel->dim[0]; + in_c = kernel->dim[3]; + in_h = input->dim[1]; + in_w = input->dim[2]; + kernel_h = kernel->dim[1]; + kernel_w = kernel->dim[2]; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (input->dtype == CSINN_DTYPE_INT4) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(kernel, params); + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = + input->qinfo->scale * 
kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + cb->exec = shl_rvv_conv1x1s1_gemm_int4; } - params->base.bc = csi_nn_rvv_dwconv3x3s1_int8; - } - } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_dwconv3x3s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_dwconv3x3s2_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + if (input->dtype == CSINN_DTYPE_INT4) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_int4(kernel, params); + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = + input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + cb->exec = shl_rvv_conv_im2col_gemm_int4; } - params->base.bc = csi_nn_rvv_dwconv3x3s2_int8; - } - } else { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_depthwise_conv2d_f32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_depthwise_conv2d_quant; } + return CSINN_TRUE; } - return CSINN_TRUE; + return CSINN_FALSE; +#else + shl_debug_error("unsupport conv2d for int4 without xtheadv extension\n"); + return CSINN_FALSE; +#endif } diff --git a/source/thead_rvv/convolution_1x1_fp16.c b/source/thead_rvv/convolution_1x1_fp16.c index aced0510..75db46ed 100644 --- a/source/thead_rvv/convolution_1x1_fp16.c +++ 
b/source/thead_rvv/convolution_1x1_fp16.c @@ -16,12 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { __fp16 *kernel_data = (__fp16 *)kernel->data; int group = params->group; @@ -29,17 +29,17 @@ void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16(struct csi_tensor *kernel, int m = kernel->dim[0] / group; // out_ch int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(group * m * k * sizeof(__fp16)); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv1x1s1_gemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -57,7 +57,7 @@ int csi_nn_rvv_conv1x1s1_gemm_fp16(struct csi_tensor *input, struct csi_tensor * int32_t k = in_ch / group; int32_t n = out_h * out_w; - __fp16 *pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 
0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -66,14 +66,14 @@ int csi_nn_rvv_conv1x1s1_gemm_fp16(struct csi_tensor *input, struct csi_tensor * __fp16 *pc = output_data; // pack - csi_nn_rvv_reorder_input_z16_fp16(input_data, pb, k, n, n); + shl_rvv_reorder_input_z16_fp16(input_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x16_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); return CSINN_TRUE; } diff --git a/source/thead_rvv/convolution_1x1_fp16_pack1ton.c b/source/thead_rvv/convolution_1x1_fp16_pack1ton.c new file mode 100644 index 00000000..3d2a8ca1 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp16_pack1ton.c @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + shl_rvv_reorder_input_pack1ton_fp16(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_fp16(input_ncxhwx, in_ptr, k, 1, n, n); + + // gemm + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_fp16_packn.c b/source/thead_rvv/convolution_1x1_fp16_packn.c new file mode 100644 index 00000000..4ce749ea --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp16_packn.c @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + // GEMM + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_fp16_packnto1.c b/source/thead_rvv/convolution_1x1_fp16_packnto1.c new file mode 100644 index 00000000..82fd22a7 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp16_packnto1.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + // GEMM + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, + n); + + shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1.c b/source/thead_rvv/convolution_1x1_fp32.c similarity index 67% rename from source/thead_rvv/convolution_1x1.c rename to source/thead_rvv/convolution_1x1_fp32.c index 53d7408c..52dfafc9 100644 --- a/source/thead_rvv/convolution_1x1.c +++ b/source/thead_rvv/convolution_1x1_fp32.c @@ -16,12 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { float *kernel_data = (float *)kernel->data; int group = params->group; @@ -29,17 +29,17 @@ void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32(struct csi_tensor *kernel, int m = kernel->dim[0] / group; // out_ch / group int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) - float *pa_reorder = (float *)csi_mem_alloc(group * m * k * sizeof(float)); + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_rvv_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int 
csi_nn_rvv_conv1x1s1_gemm_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -57,7 +57,7 @@ int csi_nn_rvv_conv1x1s1_gemm_fp32(struct csi_tensor *input, struct csi_tensor * int32_t k = in_ch / group; int32_t n = out_h * out_w; - float *pb_reorder = (float *)csi_mem_alloc(k * n * sizeof(float)); + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -65,13 +65,13 @@ int csi_nn_rvv_conv1x1s1_gemm_fp32(struct csi_tensor *input, struct csi_tensor * float *pb = pb_reorder; float *pc = output_data; // pack - csi_nn_rvv_reorder_input_z8_fp32(input_data, pb, k, n, n); + shl_rvv_reorder_input_z8_fp32(input_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x8_fp32(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x8_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); return CSINN_TRUE; } diff --git a/source/thead_rvv/convolution_1x1_fp32_pack1ton.c b/source/thead_rvv/convolution_1x1_fp32_pack1ton.c new file mode 100644 index 00000000..3fa58caa --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp32_pack1ton.c @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *input_ncxhwx = (float *)shl_mem_alloc(k * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = 
kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + + shl_rvv_reorder_input_pack1ton_fp32(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_fp32(input_ncxhwx, in_ptr, k, 1, n, n); + + // gemm + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_fp32_packn.c b/source/thead_rvv/convolution_1x1_fp32_packn.c new file mode 100644 index 00000000..4c3c39cc --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp32_packn.c @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + // GEMM + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_fp32_packnto1.c b/source/thead_rvv/convolution_1x1_fp32_packnto1.c new file mode 100644 index 00000000..21d0a7c4 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp32_packnto1.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + // GEMM + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, + n); + + shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_int4.c b/source/thead_rvv/convolution_1x1_int4.c index b989dfea..dab5c26b 100644 --- a/source/thead_rvv/convolution_1x1_int4.c +++ b/source/thead_rvv/convolution_1x1_int4.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#ifdef __riscv_xtheadv -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" +#ifdef XTHEADV // kernel_layout: [o, h, w, i] -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int4(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { int8_t *kernel_data = (int8_t *)kernel->data; int group = params->group; @@ -33,21 +32,21 @@ void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int4(struct csi_tensor *kernel, int k_2 = (((k - 1) & -2) + 2) >> 1; // pair of int4, col of kernel_matrix int k4 = ((k_2 - 1) & -4) + 4; // align of 4 for int8 - params->conv_extra.kernel_tm->data = (int8_t *)csi_mem_alloc(group * n * k4 * sizeof(int8_t)); + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * n * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_int8(kernel_data + g * n * k_2, pa_reorder + g * n * k4, n, - k_2, k_2); + shl_rvv_reorder_kernel_n8_int8(kernel_data + g * n * k_2, pa_reorder + g * n * k4, n, k_2, + k_2); } // FIXME: free params->conv_extra.kernel_tm->data // 
memcpy(kernel_data, pa_reorder, group * m * k * sizeof(int8_t)); - // csi_mem_free(pa_reorder); + // shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv1x1s1_gemm_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv1x1s1_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -68,9 +67,9 @@ int csi_nn_rvv_conv1x1s1_gemm_int4(struct csi_tensor *input, struct csi_tensor * int32_t k_2 = (((k - 1) & -2) + 2) >> 1; int32_t k4 = ((k_2 - 1) & -4) + 4; - int8_t *pa_reorder = (int8_t *)csi_mem_alloc(m * k4 * sizeof(int8_t)); - int32_t *multiplier = (int32_t *)csi_mem_alloc(n * sizeof(int32_t)); - int32_t *shift = (int32_t *)csi_mem_alloc(n * sizeof(int32_t)); + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(m * k4 * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(n * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(n * sizeof(int32_t)); int j = 0; for (int i = 0; i < batch; i++) { @@ -92,17 +91,17 @@ int csi_nn_rvv_conv1x1s1_gemm_int4(struct csi_tensor *input, struct csi_tensor * } // pack - csi_nn_rvv_reorder_input_n8_int4(input_data, pa, m, k_2, k_2); + shl_rvv_reorder_input_n8_int4(input_data, pa, m, k_2, k_2); // GEMM - csi_nn_rvv_gemm_8x8_int4(pc, pa, pb, m, k4, n, n / 2, bias_data + g * n, - output->qinfo->zero_point, multiplier, shift); + shl_rvv_gemm_8x8_int4(pc, pa, pb, m, k4, n, n / 2, bias_data + g * n, + output->qinfo->zero_point, multiplier, shift); input_data += m * k_2; output_data += m * n / 2; } } - csi_mem_free(pa_reorder); - csi_mem_free(multiplier); - csi_mem_free(shift); + shl_mem_free(pa_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); return CSINN_TRUE; } #endif diff --git 
a/source/thead_rvv/convolution_1x1_int4_packn.c b/source/thead_rvv/convolution_1x1_int4_packn.c new file mode 100644 index 00000000..1aaee0db --- /dev/null +++ b/source/thead_rvv/convolution_1x1_int4_packn.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k / 2 * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t 
*)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(k / 2 * n * sizeof(int8_t)); + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m / 2 * n * sizeof(int8_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + shl_rvv_reorder_input_pack1ton_int8(input_data, input_ncxhwx, k, out_h, out_w); + + shl_rvv_reorder_input_z12_packn_int8(input_ncxhwx, pb_reorder, k, n, n); + + shl_rvv_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, + n, output->qinfo->zero_point, multiplier, shift); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(input_ncxhwx); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_1x1_int8.c b/source/thead_rvv/convolution_1x1_int8.c index fb9a84ea..ca1a4329 100644 --- a/source/thead_rvv/convolution_1x1_int8.c +++ b/source/thead_rvv/convolution_1x1_int8.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#ifdef __riscv_xtheadv -#include "csi_thead_rvv.h" - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int8(struct csi_tensor *kernel, - struct conv2d_params *params) +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { int8_t *kernel_data = (int8_t *)kernel->data; int group = params->group; @@ -31,21 +30,20 @@ void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int8(struct csi_tensor *kernel, int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) int k4 = (k % 4 != 0) ? ((k / 4 + 1) * 4) : k; - params->conv_extra.kernel_tm->data = (int8_t *)csi_mem_alloc(group * m * k4 * sizeof(int8_t)); + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, - k); + shl_rvv_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, k); } // FIXME: free params->conv_extra.kernel_tm->data // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(int8_t)); - // csi_mem_free(pa_reorder); + // shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv1x1s1_gemm_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -64,9 +62,9 @@ int csi_nn_rvv_conv1x1s1_gemm_int8(struct csi_tensor *input, struct csi_tensor * int32_t n = out_h * out_w; int32_t k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; - int8_t *pb_reorder = (int8_t *)csi_mem_alloc(k4 * n * sizeof(int8_t)); - int32_t *multiplier = (int32_t *)csi_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)csi_mem_alloc(m * sizeof(int32_t)); + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); int j = 0; for (int i = 0; i < batch; i++) { @@ -88,18 +86,18 @@ int csi_nn_rvv_conv1x1s1_gemm_int8(struct csi_tensor *input, struct csi_tensor * } // pack - csi_nn_rvv_reorder_input_z8_int8(input_data, pb, k, n, n); + shl_rvv_reorder_input_z8_int8(input_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x8_int8(pc, pa, pb, m, k4, n, n, bias_data + g * m, - output->qinfo->zero_point, multiplier, shift); + shl_rvv_gemm_8x8_int8(pc, pa, pb, bias_data + g * m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(multiplier); - csi_mem_free(shift); + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); return CSINN_TRUE; } #endif diff --git a/source/thead_rvv/convolution_1x1_int8_pack1ton.c b/source/thead_rvv/convolution_1x1_int8_pack1ton.c new file mode 100644 index 00000000..71262773 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_int8_pack1ton.c @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); +} + +static void reorder_input_pack1ton_align4_int8(const int8_t *src, int8_t *dst, int inc, int inh, + int inw) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + const int in_size = inh * inw; // per-channel size + + while (inc > 0) { + vl = vsetvl_e8mf2(inc); + int vl4 = ((vl - 1) & -4) + 4; + int8_t *in_ptr = (int8_t *)src; + for (int i = 0; i < inh; i++) { + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(in_ptr, in_size * sizeof(int8_t), vl); + in_ptr++; + vse8_v_i8mf2(dst, _tmp, vl); + dst += vl4; + } + } + src += in_size * vl; + inc -= vl; + } +} + +int shl_rvv_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t 
*)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + int32_t k4 = ((k - 1) & -4) + 4; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + // no output scratch buffer is needed: the GEMM below writes through out_ptr into output_data + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k4; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + reorder_input_pack1ton_align4_int8(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_int8(input_ncxhwx, in_ptr, k4, 1, n, n); + + // gemm + shl_rvv_ncxhwx_gemm_12xpackn_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_1x1_int8_packn.c b/source/thead_rvv/convolution_1x1_int8_packn.c new file mode 100644 index 00000000..aef9a3c2 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_int8_packn.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + shl_rvv_reorder_input_z12_packn_int8(input_data, pb_reorder, k, n, n); + + shl_rvv_ncxhwx_gemm_12xpackn_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n, + output->qinfo->zero_point, 
multiplier, shift); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_1x1_int8_packnto1.c b/source/thead_rvv/convolution_1x1_int8_packnto1.c new file mode 100644 index 00000000..4856319d --- /dev/null +++ b/source/thead_rvv/convolution_1x1_int8_packnto1.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + shl_rvv_reorder_input_z12_packn_int8(input_data, pb_reorder, k, n, n); + + 
shl_rvv_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, + n, output->qinfo->zero_point, multiplier, shift); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_3x3.c b/source/thead_rvv/convolution_3x3.c deleted file mode 100644 index 466d7675..00000000 --- a/source/thead_rvv/convolution_3x3.c +++ /dev/null @@ -1,807 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_thead_rvv.h" - -/************************************************************* - note: VLEN = 128/256 ... 
-*************************************************************/ -/* - padding input for winograd input transform , and change memory layout to [n c/4 h w 4] - input layout: [n c h w] - input_padded layout: [n c/packn h w packn] - constrain: input channel % packn = 0 -*/ - -static void winograd_pad_input_pack1ton_fp32(const float *input, float *input_padded, int inc, - int inh, int inw, int padded_h, int padded_w, - int pad_top, int pad_left) -{ - const int packn = csrr_vlenb() / sizeof(float); - const int vl = vsetvl_e32m1(packn); - - int padded_hw = padded_h * padded_w; - const int in_size = inh * inw; // per-channel size - - float *pad_ptr = input_padded; - float *inp_ptr = (float *)input; - int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) - - vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); - - int c = 0; - for (; c + packn - 1 < inc; c += packn) { - inp_ptr = (float *)input + c * in_size; - // pad h_top - for (int i = 0; i < pad_top * padded_w; i++) { - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += packn; - } - // pad h_mid - for (int i = 0; i < inh; i++) { - // pad w_left - for (int j = 0; j < pad_left; j++) { - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += packn; - } - // pad w_mid - for (int j = 0; j < inw; j++) { - vfloat32m1_t _tmp = vlse32_v_f32m1(inp_ptr, in_size * sizeof(float), vl); - inp_ptr++; - vse32_v_f32m1(pad_ptr, _tmp, vl); - pad_ptr += packn; - } - // pad w_end - for (int j = 0; j < pad_right; j++) { - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += packn; - } - } - // pad h_bottom - for (int i = 0; i < pad_down * padded_w; i++) { - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += packn; - } - } -} - -static void winograd_crop_output_packnto1_fp32(const float *output_trans, float *output, int out_c, - int out_h, int out_w, int wino_h, int wino_w) -{ - const int packn = csrr_vlenb() / sizeof(float); - const int vl = vsetvl_e32m1(packn); - - 
const int out_size = out_h * out_w; // per-channel size - const int crop_size = wino_h * wino_w; - - float *out_tm_ptr = (float *)output_trans; - float *out_ptr = output; - - int c = 0; - for (; c + packn - 1 < out_c; c += packn) { - out_tm_ptr = (float *)output_trans + c * crop_size; - out_ptr = output + c * out_size; - - for (int h = 0; h < out_h; h++) { - float *crop_ptr = out_tm_ptr + h * wino_w * packn; - for (int w = 0; w < out_w; w++) { - vfloat32m1_t _tmp = vle32_v_f32m1(crop_ptr, vl); - crop_ptr += packn; - vsse32_v_f32m1(out_ptr, out_size * sizeof(float), _tmp, vl); - out_ptr++; - } - } - } -} - -/* - packn = VLEN / 32 (128/32=4 or 256/32=8) - constrain: output channel % packn = 0 - input channel % packn = 0 - kernel before: [O I 3*3] - kernel after : [O/packn 8*8 I packn] -*/ -void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp32(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) -{ - int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; - - float *kernel_data = (float *)o_kernel->data; - // for kernel transform buf, 3x3 --> 8x8 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); - // kernel transform matrix: G - const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f}}; - - // const float ktm[8][3] = { - // {1.0f, 0.0f, 0.0f}, - // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - // {1.0f / 90, 1.0f / 45, 2.0f / 45}, - // {1.0f / 90, -1.0f / 45, 2.0f / 45}, - // {32.0f / 45, 16.0f / 45, 8.0f / 45}, - // {32.0f / 45, -16.0f / 45, 8.0f / 45}, - // {0.0f, 0.0f, 1.0f} - // }; - - csi_tensor_copy(t_kernel, o_kernel); - - for (int p = 0; p < outch; p++) { - for (int q = 0; q < inch; q++) { - const float *kernel0 = kernel_data + p * inch * 
9 + q * 9; - float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; - - // transform kernel - const float *k0 = kernel0; - const float *k1 = kernel0 + 3; - const float *k2 = kernel0 + 6; - - // h : first compute the transport matrix tmp = (g * GT)T - float tmp[8][3]; - for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; - tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; - tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; - } - - // U - for (int j = 0; j < 8; j++) { - float *tmpp = &tmp[j][0]; - - for (int i = 0; i < 8; i++) { - kernel_tmp[j * 8 + i] = - tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; - } - } - } - } - // optimized layout for winograd64 - - const int packn = csrr_vlenb() / sizeof(float); - - float *kernel_tm_packn = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); - t_kernel->data = kernel_tm_packn; - - for (int oc = 0; oc < outch / packn; oc++) { - float *g0 = kernel_tm_packn + oc * 64 * inch * packn; - - for (int k = 0; k < 64; k++) { - float *g00 = g0 + k * inch * packn; - - for (int ic = 0; ic < inch / packn; ic++) { - for (int i = 0; i < packn; i++) { - for (int j = 0; j < packn; j++) { - const float *k00 = - kernel_tm + (oc * packn + j) * 64 * inch + (ic * packn + i) * 64; - *g00++ = k00[k]; - } - } - } - } - } - csi_mem_free(kernel_tm); -} - -/* - n = VLEN / 32 - constrain: output channel % n = 0 - input channel % n = 0 -*/ -int csi_nn_rvv_conv3x3s1_winograd64_packn_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)params->conv_extra.kernel_tm->data; - float *bias_data = (float *)bias->data; - - // param - int kernel_h = kernel->dim[2]; - int kernel_w = kernel->dim[3]; - int stride_h = 
params->stride_height; - int stride_w = params->stride_width; - int dilation_h = params->dilation_height; - int dilation_w = params->dilation_width; - int pad_left = params->pad_left; - int pad_top = params->pad_top; - - int batch = input->dim[0]; - int in_c = input->dim[1]; - int in_h = input->dim[2]; - int in_w = input->dim[3]; - int input_size = in_c * in_h * in_w; - int kernel_size = in_c * kernel_h * kernel_w; - - int out_c = kernel->dim[0]; - int out_h = output->dim[2]; - int out_w = output->dim[3]; - int output_size = out_c * out_h * out_w; - - // winograd param - int block_h = (out_h + 5) / 6; - int block_w = (out_w + 5) / 6; - - // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 - int padded_in_h = block_h * 6 + 2; - int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel - - /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias - if (bias_data == NULL) { - flag_bias = 0; - bias_data = (float *)csi_mem_alloc(out_c * sizeof(float)); - } - - const int packn = csrr_vlenb() / sizeof(float); - const int vl = vsetvl_e32m1(packn); - - for (int n = 0; n < batch; n++) { - // pad buffer: [in_c/8 h w 8] - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); - - // pad input - winograd_pad_input_pack1ton_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, - padded_in_w, pad_top, pad_left); - input_data += input_size; - - // input transform buffer1: [in_ch/8, 64, blocks, 8] - float *input_tm1_buf = - (float *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); - - /****************************** transform input *****************************/ - /* - BT = { - { 1 0 -5.25 0 5.25 0 -1 0 }; - { 0 1 1 -4.25 -4.25 1 1 0 }; - { 0 -1 1 4.25 -4.25 -1 1 0 }; - { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; - { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; - { 0 2 4 -2.5 -5 0.5 1 0 
}; - { 0 -2 4 2.5 -5 -0.5 1 0 }; - { 0 -1 0 5.25 0 -5.25 0 1 } - }; - */ - int tiles = block_h * block_w; - -#pragma omp parallel for num_threads(1) - for (int q = 0; q < in_c / packn; q++) { - float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * - packn; // feature map after padding - q channel - float *img0_tm = - input_tm1_buf + q * 64 * tiles * packn; // transform and interleave - q channel - - float tmp[8][8][packn]; - - for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { - float *r0 = img0 + (i * padded_in_w * 6 + j * 6) * - packn; // feature map after padding 8*8 start addr - float *r0_tm = - img0_tm + (i * block_w + j) * packn; // input_tm1 8*8 block start addr - - for (int m = 0; m < 8; m++) { - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); - vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); - vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); - vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); - - vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, - vfsub_vv_f32m1(_r04, _r02, vl), vl); - vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, - vfsub_vv_f32m1(_r03, _r05, vl), vl); - - vfloat32m1_t _tmp12a = - vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); - vfloat32m1_t _tmp12b = - vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); - vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); - vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); - - vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); - vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, - _r05, vl); - vfloat32m1_t _tmp3m = 
vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); - vfloat32m1_t _tmp4m = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); - - vfloat32m1_t _tmp56a = - vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); - vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, - _r05, vl); - vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); - vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); - - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[7][m], _tmp7m, vl); - vse32_v_f32m1(tmp[1][m], _tmp1m, vl); - vse32_v_f32m1(tmp[2][m], _tmp2m, vl); - vse32_v_f32m1(tmp[3][m], _tmp3m, vl); - vse32_v_f32m1(tmp[4][m], _tmp4m, vl); - vse32_v_f32m1(tmp[5][m], _tmp5m, vl); - vse32_v_f32m1(tmp[6][m], _tmp6m, vl); - - r0 += padded_in_w * packn; - } - - for (int m = 0; m < 8; m++) { - float *r0_tm0 = r0_tm; - float *r0_tm1 = r0_tm0 + tiles * packn; - float *r0_tm2 = r0_tm1 + tiles * packn; - float *r0_tm3 = r0_tm2 + tiles * packn; - float *r0_tm4 = r0_tm3 + tiles * packn; - float *r0_tm5 = r0_tm4 + tiles * packn; - float *r0_tm6 = r0_tm5 + tiles * packn; - float *r0_tm7 = r0_tm6 + tiles * packn; - - vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); - vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); - vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); - vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); - - vfloat32m1_t _r0tm0 = - vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, - vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); - vfloat32m1_t _r0tm7 = - vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, - vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); - - vfloat32m1_t _tmp12a = - vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); - vfloat32m1_t _tmp12b = - 
vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); - vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); - vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); - - vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); - vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), - 2.f, _tmp05, vl); - vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); - vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); - - vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( - _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); - vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), - 0.5f, _tmp05, vl); - vfloat32m1_t _r0tm5 = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); - vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); - - vse32_v_f32m1(r0_tm0, _r0tm0, vl); - vse32_v_f32m1(r0_tm7, _r0tm7, vl); - vse32_v_f32m1(r0_tm1, _r0tm1, vl); - vse32_v_f32m1(r0_tm2, _r0tm2, vl); - vse32_v_f32m1(r0_tm3, _r0tm3, vl); - vse32_v_f32m1(r0_tm4, _r0tm4, vl); - vse32_v_f32m1(r0_tm5, _r0tm5, vl); - vse32_v_f32m1(r0_tm6, _r0tm6, vl); - - r0_tm += tiles * packn * 8; - } - } - } - } - csi_mem_free(input_padd_buf); - - /*********************************** dot ***************************************/ - // reorder input_tm1_buf - int size_input_tm2 = 0; - if (tiles >= 8) { - size_input_tm2 = - 64 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8; - } else if (tiles >= 4) { - size_input_tm2 = 64 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4; - } else if (tiles >= 2) { - size_input_tm2 = 64 * (tiles / 2 + tiles % 2) * in_c * 2; - } else { - size_input_tm2 = 64 * tiles * in_c; - } - float *input_tm2_buf = (float *)csi_mem_alloc(size_input_tm2 * sizeof(float)); - -#pragma omp parallel for num_threads(1) - for (int r = 0; r < 64; r++) { - float *img_tm2 = 
input_tm2_buf + r * size_input_tm2 / 64; // input_tm2 r channel data - - int t = 0; - for (; t + 7 < tiles; t += 8) { - float *tm2 = img_tm2 + t * in_c; // img_tm2 row data - float *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); - vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); - vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); - vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); - vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); - vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); - vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); - vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); - - vsseg8e32_v_f32m1(tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, - vl); - tm1 += 64 * tiles * packn; - tm2 += 8 * packn; - } - } - for (; t + 3 < tiles; t += 4) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data - float *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); - vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); - vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); - vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); - - vsseg4e32_v_f32m1(tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); - tm1 += 64 * tiles * packn; - tm2 += 4 * packn; - } - } - for (; t + 1 < tiles; t += 2) { - float *tm2 = - img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data - float *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); - vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); - - vsseg2e32_v_f32m1(tm2, _tmp0, _tmp1, vl); - tm1 += 64 * tiles * packn; - tm2 += 2 * packn; - } - } - for (; t < tiles; t++) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * 
in_c * - 8; // img_tm2 row data - float *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); - - vse32_v_f32m1(tm2, _tmp0, vl); - tm1 += 64 * tiles * packn; - tm2 += 1 * packn; - } - } - } - csi_mem_free(input_tm1_buf); - - // output_dot_buf: [out_c/packn, 64, blocks, packn] - float *output_dot_buf = - (float *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); -#pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / packn; p++) { - float *output0_tm = output_dot_buf + p * 64 * tiles * packn; // 4 channel dot output - float *kernel0_tm = kernel_data + p * 64 * in_c * packn; // 4 channel kernel - - for (int r = 0; r < 64; r++) { - float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // img_tm2 第r个channel - - int t = 0; - for (; t + 7 < tiles; t += 8) { - float *r0 = img_tm2 + t * in_c; - float *k0 = kernel0_tm + r * in_c * packn; - - vfloat32m1_t _acc0 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc1 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc2 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc3 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc4 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc5 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc6 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc7 = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat32m1_t _kernel = vle32_v_f32m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f32m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f32m1(_acc1, r0[1], _kernel, vl); - _acc2 = vfmacc_vf_f32m1(_acc2, r0[2], _kernel, vl); - _acc3 = vfmacc_vf_f32m1(_acc3, r0[3], _kernel, vl); - _acc4 = vfmacc_vf_f32m1(_acc4, r0[4], _kernel, vl); - _acc5 = vfmacc_vf_f32m1(_acc5, r0[5], _kernel, vl); - _acc6 = vfmacc_vf_f32m1(_acc6, r0[6], _kernel, vl); - _acc7 = vfmacc_vf_f32m1(_acc7, r0[7], _kernel, vl); - r0 += 8; - } - - vse32_v_f32m1(output0_tm, _acc0, vl); - vse32_v_f32m1(output0_tm + packn * 1, _acc1, 
vl); - vse32_v_f32m1(output0_tm + packn * 2, _acc2, vl); - vse32_v_f32m1(output0_tm + packn * 3, _acc3, vl); - vse32_v_f32m1(output0_tm + packn * 4, _acc4, vl); - vse32_v_f32m1(output0_tm + packn * 5, _acc5, vl); - vse32_v_f32m1(output0_tm + packn * 6, _acc6, vl); - vse32_v_f32m1(output0_tm + packn * 7, _acc7, vl); - output0_tm += packn * 8; - } - - for (; t + 3 < tiles; t += 4) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; - float *k0 = kernel0_tm + r * in_c * packn; - - vfloat32m1_t _acc0 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc1 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc2 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc3 = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat32m1_t _kernel = vle32_v_f32m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f32m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f32m1(_acc1, r0[1], _kernel, vl); - _acc2 = vfmacc_vf_f32m1(_acc2, r0[2], _kernel, vl); - _acc3 = vfmacc_vf_f32m1(_acc3, r0[3], _kernel, vl); - r0 += 4; - } - - vse32_v_f32m1(output0_tm, _acc0, vl); - vse32_v_f32m1(output0_tm + packn * 1, _acc1, vl); - vse32_v_f32m1(output0_tm + packn * 2, _acc2, vl); - vse32_v_f32m1(output0_tm + packn * 3, _acc3, vl); - output0_tm += packn * 4; - } - for (; t + 1 < tiles; t += 2) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; - float *k0 = kernel0_tm + r * in_c * packn; - - vfloat32m1_t _acc0 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc1 = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat32m1_t _kernel = vle32_v_f32m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f32m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f32m1(_acc1, r0[1], _kernel, vl); - r0 += 2; - } - - vse32_v_f32m1(output0_tm, _acc0, vl); - vse32_v_f32m1(output0_tm + packn * 1, _acc1, vl); - output0_tm += packn * 2; - } - for (; t < tiles; t++) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; - float *k0 = kernel0_tm + r * in_c * packn; - - 
vfloat32m1_t _acc0 = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat32m1_t _kernel = vle32_v_f32m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f32m1(_acc0, r0[0], _kernel, vl); - r0 += 1; - } - - vse32_v_f32m1(output0_tm, _acc0, vl); - output0_tm += packn * 1; - } - } - } - - csi_mem_free(input_tm2_buf); - - /*************************** transform output ****************************/ - // output_tm1_buf: [out_c/packn, out_h6, out_w6, packn] - float *output_tm1_buf = - (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); - -/* -AT = { - { 1 1 1 1 1 1 1 0 }; - { 0 1 -1 2 -2 1/2 -1/2 0 }; - { 0 1 1 4 4 1/4 1/4 0 }; - { 0 1 -1 8 -8 1/8 -1/8 0 }; - { 0 1 1 16 16 1/16 1/16 0 }; - { 0 1 -1 32 -32 1/32 -1/32 1 } -}; -AT = { - { 1 1 1 1 1 32 32 0 }; - { 0 1 -1 2 -2 16 -16 0 }; - { 0 1 1 4 4 8 8 0 }; - { 0 1 -1 8 -8 4 -4 0 }; - { 0 1 1 16 16 2 2 0 }; - { 0 1 -1 32 -32 1 -1 1 } -}; -*/ -#pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / packn; p++) { - float *bias_tmp = bias_data + p * packn; - - float *out0_tm = output_dot_buf + - p * 64 * block_h * block_w * packn; // 输出转换前/dot后 第p个channel - float *out0 = - output_tm1_buf + p * 6 * block_h * 6 * block_w * packn; // 转换后输出 第p个channel - - float tmp[6][8][packn]; - - for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { - float *output0_tm_0 = out0_tm + (i * block_w + j) * packn; // 8*8 起始地址 - float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; - float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; - float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; - float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; - float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; - float *output0_tm_6 = output0_tm_0 + tiles * packn * 6; - float *output0_tm_7 = output0_tm_0 + tiles * packn * 7; - - float *output0 = - out0 + (i * block_w * 6 * 6 + j * 6) * packn; // 输出 6*6 的起始地址 - - for (int m = 0; m < 8; m++) { - vfloat32m1_t _r00 = 
vle32_v_f32m1(output0_tm_0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); - vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); - vfloat32m1_t _r06 = vle32_v_f32m1(output0_tm_6, vl); - vfloat32m1_t _r07 = vle32_v_f32m1(output0_tm_7, vl); - - vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_r01, _r02, vl); - vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_r01, _r02, vl); - - vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_r03, _r04, vl); - vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_r03, _r04, vl); - - vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_r05, _r06, vl); - vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_r05, _r06, vl); - - vfloat32m1_t _tmp0m = - vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp024a, vl), - vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); - vfloat32m1_t _tmp2m = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); - vfloat32m1_t _tmp4m = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); - - vfloat32m1_t _tmp1m = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); - vfloat32m1_t _tmp3m = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); - vfloat32m1_t _tmp5m = - vfadd_vv_f32m1(vfadd_vv_f32m1(_r07, _tmp135a, vl), - vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); - - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[2][m], _tmp2m, vl); - vse32_v_f32m1(tmp[4][m], _tmp4m, vl); - vse32_v_f32m1(tmp[1][m], _tmp1m, vl); - vse32_v_f32m1(tmp[3][m], _tmp3m, vl); - vse32_v_f32m1(tmp[5][m], _tmp5m, vl); - - output0_tm_0 += tiles * packn * 8; - output0_tm_1 += tiles * packn * 8; - output0_tm_2 += tiles * packn * 8; - output0_tm_3 += tiles * packn * 8; - output0_tm_4 += tiles * packn * 8; - output0_tm_5 += tiles * packn * 8; - output0_tm_6 += tiles * packn * 8; - output0_tm_7 += 
tiles * packn * 8; - } - - vfloat32m1_t _bias = vle32_v_f32m1(bias_tmp, vl); - for (int m = 0; m < 6; m++) { - vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); - vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); - vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); - vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); - - vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); - vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); - - vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); - vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); - - vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); - vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); - - vfloat32m1_t _output00 = vfadd_vv_f32m1( - _bias, - vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp024a, vl), - vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl), - vl); - vfloat32m1_t _output02 = vfadd_vv_f32m1( - _bias, - vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, - _tmp024c, vl), - vl); - vfloat32m1_t _output04 = vfadd_vv_f32m1( - _bias, - vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, - _tmp024c, vl), - vl); - - vfloat32m1_t _output01 = vfadd_vv_f32m1( - _bias, - vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, - _tmp135c, vl), - vl); - vfloat32m1_t _output03 = vfadd_vv_f32m1( - _bias, - vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, - _tmp135c, vl), - vl); - vfloat32m1_t _output05 = vfadd_vv_f32m1( - _bias, - vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp07, _tmp135a, vl), - vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl), - vl); - - vse32_v_f32m1(output0, _output00, vl); - vse32_v_f32m1(output0 + packn * 2, _output02, vl); - vse32_v_f32m1(output0 + packn * 4, _output04, vl); - 
vse32_v_f32m1(output0 + packn * 1, _output01, vl); - vse32_v_f32m1(output0 + packn * 3, _output03, vl); - vse32_v_f32m1(output0 + packn * 5, _output05, vl); - - output0 += block_w * 6 * packn; - } - } - } - } - - csi_mem_free(output_dot_buf); - - // crop the output after transform: cut extra part (right , bottom) - winograd_crop_output_packnto1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, - block_h * 6, block_w * 6); - output_data += output_size; - csi_mem_free(output_tm1_buf); - } - - if (!flag_bias) { - csi_mem_free(bias_data); - bias_data = NULL; - } - return CSINN_TRUE; -} diff --git a/source/thead_rvv/convolution_3x3_fp16.c b/source/thead_rvv/convolution_3x3_fp16.c index a886ce0f..758fb77b 100644 --- a/source/thead_rvv/convolution_3x3_fp16.c +++ b/source/thead_rvv/convolution_3x3_fp16.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 ... 
@@ -106,22 +106,1042 @@ static void winograd_crop_output_packnto1_fp16(const __fp16 *output_trans, __fp1 } } -/* - pack n = VLEN / 16 (128/16=8 or 256/16=16) - constrain: output channel % n = 0 - input channel % n = 0 - kernel before: [O I 3*3] - kernel after : [O/n 8*8 I n] -*/ -void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +static inline void wg_b4f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + __fp16 tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 6*6 start addr + const __fp16 *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + + vfloat16m1_t _tmp0m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r04, _r03, vl), -4.f, + vfadd_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r03, vl), 4.f, + 
vfsub_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), -2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), 2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp5m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _r0tm0 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat16m1_t _r0tm1 = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm2 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm3 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm4 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm5 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + 
vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) { - int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[4][6][packn]; + + vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t 
_tmp13a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _tmp3m = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _out00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _out01 = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _out02 = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _out03 = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f16m1(_bias, _out00, vl); + _out01 = vfadd_vv_f16m1(_bias, _out01, vl); + _out02 = vfadd_vv_f16m1(_bias, _out02, vl); + _out03 = vfadd_vv_f16m1(_bias, _out03, vl); + + 
vse16_v_f16m1(output0, _out00, vl); + vse16_v_f16m1(output0 + packn * 1, _out01, vl); + vse16_v_f16m1(output0 + packn * 2, _out02, vl); + vse16_v_f16m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} - __fp16 *kernel_data = (__fp16 *)o_kernel->data; +static inline void wg_bxf3s1_reorder_input_tile8_fp16(const __fp16 *src, __fp16 *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + __fp16 *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + 
vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + + vse16_v_f16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, int tiles, + int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + const int vl = vsetvl_e16m1(packn); + + int p = 0; + for (; p + pack2n - 1 < out_ch; p += pack2n) { + __fp16 *output0_tm = output + p * area * tiles; // 16 channel dot output + __fp16 *output1_tm = output0_tm + packn * area * tiles; + + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc02 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc03 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc04 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc05 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc06 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc07 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc10 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc11 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc12 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc13 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc14 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc15 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc16 
= vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc17 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f16m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f16m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f16m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f16m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + vse16_v_f16m1(output0_tm + packn * 2, _acc02, vl); + vse16_v_f16m1(output0_tm + packn * 3, _acc03, vl); + vse16_v_f16m1(output0_tm + packn * 4, _acc04, vl); + vse16_v_f16m1(output0_tm + packn * 5, _acc05, vl); + vse16_v_f16m1(output0_tm + packn * 6, _acc06, vl); + vse16_v_f16m1(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + + vse16_v_f16m1(output1_tm, _acc10, vl); + vse16_v_f16m1(output1_tm + packn * 1, _acc11, vl); + vse16_v_f16m1(output1_tm + packn * 2, _acc12, vl); + vse16_v_f16m1(output1_tm + packn * 3, _acc13, vl); + vse16_v_f16m1(output1_tm + packn * 4, _acc14, vl); + vse16_v_f16m1(output1_tm + packn * 5, _acc15, vl); + vse16_v_f16m1(output1_tm + packn * 6, _acc16, 
vl); + vse16_v_f16m1(output1_tm + packn * 7, _acc17, vl); + output1_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc02 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc03 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc10 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc11 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc12 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc13 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + vse16_v_f16m1(output0_tm + packn * 2, _acc02, vl); + vse16_v_f16m1(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + + vse16_v_f16m1(output1_tm, _acc10, vl); + vse16_v_f16m1(output1_tm + packn * 1, _acc11, vl); + vse16_v_f16m1(output1_tm + packn * 2, _acc12, vl); + vse16_v_f16m1(output1_tm + packn * 3, _acc13, vl); + output1_tm += packn * 4; + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc10 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc11 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; 
c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + + vse16_v_f16m1(output1_tm, _acc10, vl); + vse16_v_f16m1(output1_tm + packn * 1, _acc11, vl); + output1_tm += packn * 2; + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc10 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + output0_tm += packn * 1; + + vse16_v_f16m1(output1_tm, _acc10, vl); + output1_tm += packn * 1; + } + } + } + + for (; p + packn - 1 < out_ch; p += packn) { + __fp16 *output0_tm = output + p * area * tiles; // 8 channel dot output + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc02 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc03 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc04 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc05 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t 
_acc06 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc07 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + vse16_v_f16m1(output0_tm + packn * 2, _acc02, vl); + vse16_v_f16m1(output0_tm + packn * 3, _acc03, vl); + vse16_v_f16m1(output0_tm + packn * 4, _acc04, vl); + vse16_v_f16m1(output0_tm + packn * 5, _acc05, vl); + vse16_v_f16m1(output0_tm + packn * 6, _acc06, vl); + vse16_v_f16m1(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc02 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc03 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + vse16_v_f16m1(output0_tm + packn * 2, _acc02, vl); + vse16_v_f16m1(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + } + for 
(; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + output0_tm += packn * 1; + } + } + } +} + +static inline void wg_b6f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + __fp16 tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 8*8 start addr + const __fp16 *r0 = img0 + (i * w * 6 + j * 6) * packn; + // input_tm1 8*8 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for 
(int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); + + vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f16m1(_r04, _r02, vl), vl); + vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f16m1(_r03, _r05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = + vfmacc_vf_f16m1(_r06, 4.f, vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[7][m], _tmp7m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + 
vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + vse16_v_f16m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + __fp16 *r0_tm6 = r0_tm5 + tiles * packn; + __fp16 *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); + vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = vfmacc_vf_f16m1( + _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + 
vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm7, _r0tm7, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + vse16_v_f16m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[6][8][packn]; + + vfloat16m1_t _bias = bias ? 
vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + const __fp16 *output0_tm_6 = output0_tm_0 + tiles * packn * 6; + const __fp16 *output0_tm_7 = output0_tm_0 + tiles * packn * 7; + + __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 
2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _tmp5m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * packn * 8; + output0_tm_1 += tiles * packn * 8; + output0_tm_2 += tiles * packn * 8; + output0_tm_3 += tiles * packn * 8; + output0_tm_4 += tiles * packn * 8; + output0_tm_5 += tiles * packn * 8; + output0_tm_6 += tiles * packn * 8; + output0_tm_7 += tiles * packn * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); + + vfloat16m1_t _output00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _output02 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _output04 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, 
_tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _output01 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _output03 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _output05 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f16m1(_bias, _output00, vl); + _output01 = vfadd_vv_f16m1(_bias, _output01, vl); + _output02 = vfadd_vv_f16m1(_bias, _output02, vl); + _output03 = vfadd_vv_f16m1(_bias, _output03, vl); + _output04 = vfadd_vv_f16m1(_bias, _output04, vl); + _output05 = vfadd_vv_f16m1(_bias, _output05, vl); + + vse16_v_f16m1(output0, _output00, vl); + vse16_v_f16m1(output0 + packn * 2, _output02, vl); + vse16_v_f16m1(output0 + packn * 4, _output04, vl); + vse16_v_f16m1(output0 + packn * 1, _output01, vl); + vse16_v_f16m1(output0 + packn * 3, _output03, vl); + vse16_v_f16m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + + // kernel transform matrix: G + const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 
24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/pack2n, 6*6, I, pack2n] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + // [O/packn, 6*6, I, packn] + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 
36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_rvv_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_pack1ton_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 
*)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile8_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 36, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_m16n8_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packnto1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void 
shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - __fp16 *kernel_tm = (__fp16 *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); // kernel transform matrix: G const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, {-2.0f / 9, -2.0f / 9, -2.0f / 9}, @@ -143,7 +1163,7 @@ void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(struct csi_tens // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(dst_kernel, src_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -175,39 +1195,48 @@ void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(struct csi_tens } } // optimized layout for winograd64 - const int packn = csrr_vlenb() / sizeof(__fp16); + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; - __fp16 *kernel_tm_packn = (__fp16 *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); - t_kernel->data = kernel_tm_packn; + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; - for (int oc = 0; oc < outch / packn; oc++) { - __fp16 *g0 = kernel_tm_packn + oc * 64 * inch * packn; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; for (int k = 0; k < 64; k++) { __fp16 *g00 = g0 + k * 
inch * packn; - - for (int ic = 0; ic < inch / packn; ic++) { - for (int i = 0; i < packn; i++) { - for (int j = 0; j < packn; j++) { - const __fp16 *k00 = - kernel_tm + (oc * packn + j) * 64 * inch + (ic * packn + i) * 64; - *g00++ = k00[k]; - } + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; } } } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } -/* - n = VLEN / 16 - constrain: output channel % n = 0 - input channel % n = 0 -*/ -int csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_rvv_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -215,12 +1244,6 @@ int csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct __fp16 *bias_data = (__fp16 *)bias->data; // param - int kernel_h = kernel->dim[2]; - int kernel_w = kernel->dim[3]; - int stride_h = params->stride_height; - int stride_w = params->stride_width; - int dilation_h = params->dilation_height; - int dilation_w = params->dilation_width; int pad_left = params->pad_left; int pad_top = params->pad_top; @@ -229,7 +1252,6 @@ int csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct int in_h = input->dim[2]; int in_w = input->dim[3]; int input_size = in_c * in_h * in_w; - int kernel_size = in_c * kernel_h * kernel_w; int out_c = kernel->dim[0]; int out_h = 
output->dim[2]; @@ -240,563 +1262,57 @@ int csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = - block_h * 6 + - 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; int padded_in_w = block_w * 6 + 2; int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel - /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias - if (bias_data == NULL) { - flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(out_c * sizeof(__fp16)); - } - - const int packn = csrr_vlenb() / sizeof(__fp16); - const int vl = vsetvl_e16m1(packn); + int tiles = block_h * block_w; for (int n = 0; n < batch; n++) { - // pad buffer: [in_c/8 h w 8] - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); // pad input winograd_pad_input_pack1ton_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); - input_data += input_size; - // input transform buffer1: [in_ch/8, 64, blocks, 8] - __fp16 *input_tm1_buf = - (__fp16 *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); + input_data += input_size; /****************************** transform input *****************************/ - /* - BT = { - { 1 0 -5.25 0 5.25 0 -1 0 }; - { 0 1 1 -4.25 -4.25 1 1 0 }; - { 0 -1 1 4.25 -4.25 -1 1 0 }; - { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; - { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; - { 0 2 4 -2.5 -5 0.5 1 0 }; - { 0 -2 4 2.5 -5 -0.5 1 0 }; - { 0 -1 0 5.25 0 -5.25 0 1 } - }; - */ - - int tiles = block_h * block_w; - -#pragma omp parallel for num_threads(1) - for (int 
q = 0; q < in_c / packn; q++) { - __fp16 *img0 = input_padd_buf + q * padded_in_h * padded_in_w * - packn; // feature map after padding - q channel - __fp16 *img0_tm = - input_tm1_buf + q * 64 * tiles * packn; // transform and interleave - q channel - - __fp16 tmp[8][8][packn]; - - for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { - __fp16 *r0 = img0 + (i * padded_in_w * 6 + j * 6) * - packn; // feature map after padding 8*8 start addr - __fp16 *r0_tm = - img0_tm + (i * block_w + j) * packn; // input_tm1 8*8 block start addr - - for (int m = 0; m < 8; m++) { - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); - vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); - vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); - vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); - - vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, - vfsub_vv_f16m1(_r04, _r02, vl), vl); - vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, - vfsub_vv_f16m1(_r03, _r05, vl), vl); - - vfloat16m1_t _tmp12a = - vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); - vfloat16m1_t _tmp12b = - vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); - vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); - vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); - - vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); - vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, - _r05, vl); - vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); - vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); - - vfloat16m1_t _tmp56a = - vfmacc_vf_f16m1(_r06, 4.f, 
vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); - vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, - _r05, vl); - vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); - vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); - - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[7][m], _tmp7m, vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); - vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - vse16_v_f16m1(tmp[3][m], _tmp3m, vl); - vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - vse16_v_f16m1(tmp[5][m], _tmp5m, vl); - vse16_v_f16m1(tmp[6][m], _tmp6m, vl); - - r0 += padded_in_w * packn; - } - - for (int m = 0; m < 8; m++) { - __fp16 *r0_tm0 = r0_tm; - __fp16 *r0_tm1 = r0_tm0 + tiles * packn; - __fp16 *r0_tm2 = r0_tm1 + tiles * packn; - __fp16 *r0_tm3 = r0_tm2 + tiles * packn; - __fp16 *r0_tm4 = r0_tm3 + tiles * packn; - __fp16 *r0_tm5 = r0_tm4 + tiles * packn; - __fp16 *r0_tm6 = r0_tm5 + tiles * packn; - __fp16 *r0_tm7 = r0_tm6 + tiles * packn; - - vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); - vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); - vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); - vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); - - vfloat16m1_t _r0tm0 = - vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, - vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); - vfloat16m1_t _r0tm7 = - vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, - vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); - - vfloat16m1_t _tmp12a = - vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); - vfloat16m1_t _tmp12b = - vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); - vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); - vfloat16m1_t _r0tm2 = 
vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); - - vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); - vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), - 2.f, _tmp05, vl); - vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); - vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); - - vfloat16m1_t _tmp56a = vfmacc_vf_f16m1( - _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); - vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), - 0.5f, _tmp05, vl); - vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); - vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); - - vse16_v_f16m1(r0_tm0, _r0tm0, vl); - vse16_v_f16m1(r0_tm7, _r0tm7, vl); - vse16_v_f16m1(r0_tm1, _r0tm1, vl); - vse16_v_f16m1(r0_tm2, _r0tm2, vl); - vse16_v_f16m1(r0_tm3, _r0tm3, vl); - vse16_v_f16m1(r0_tm4, _r0tm4, vl); - vse16_v_f16m1(r0_tm5, _r0tm5, vl); - vse16_v_f16m1(r0_tm6, _r0tm6, vl); - - r0_tm += tiles * packn * 8; - } - } - } - } - csi_mem_free(input_padd_buf); - - /*********************************** dot ***************************************/ - // reorder input_tm1_buf - int size_input_tm2 = 0; - if (tiles >= 8) { - size_input_tm2 = - 64 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8; - } else if (tiles >= 4) { - size_input_tm2 = 64 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4; - } else if (tiles >= 2) { - size_input_tm2 = 64 * (tiles / 2 + tiles % 2) * in_c * 2; - } else { - size_input_tm2 = 64 * tiles * in_c; - } - __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(size_input_tm2 * sizeof(__fp16)); - -#pragma omp parallel for num_threads(1) - for (int r = 0; r < 64; r++) { - __fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // input_tm2 r channel data - - int t = 0; - for (; t + 7 < tiles; t += 8) { - __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 
row data - __fp16 *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); - vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); - vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); - vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); - vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); - vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); - vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); - vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); - - vsseg8e16_v_f16m1(tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, - vl); - tm1 += 64 * tiles * packn; - tm2 += 8 * packn; - } - } - for (; t + 3 < tiles; t += 4) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data - __fp16 *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); - vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); - vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); - vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); - - vsseg4e16_v_f16m1(tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); - tm1 += 64 * tiles * packn; - tm2 += 4 * packn; - } - } - for (; t + 1 < tiles; t += 2) { - __fp16 *tm2 = - img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data - __fp16 *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); - vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); - - vsseg2e16_v_f16m1(tm2, _tmp0, _tmp1, vl); - tm1 += 64 * tiles * packn; - tm2 += 2 * packn; - } - } - for (; t < tiles; t++) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * - 8; // img_tm2 row data - __fp16 *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat16m1_t 
_tmp0 = vle16_v_f16m1(tm1, vl); - - vse16_v_f16m1(tm2, _tmp0, vl); - tm1 += 64 * tiles * packn; - tm2 += 1 * packn; - } - } - } - - csi_mem_free(input_tm1_buf); - - // output_dot_buf: [out_c/8, 64, blocks, 8] + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile8_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 64, tiles, packn] __fp16 *output_dot_buf = - (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); - -#pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / packn; p++) { - __fp16 *output0_tm = output_dot_buf + p * 64 * tiles * packn; - __fp16 *kernel0_tm = kernel_data + p * 64 * in_c * packn; - - for (int r = 0; r < 64; r++) { - __fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // img_tm2 第r个channel - - int t = 0; - for (; t + 7 < tiles; t += 8) { - __fp16 *r0 = img_tm2 + t * in_c; - __fp16 *k0 = kernel0_tm + r * in_c * packn; - - vfloat16m1_t _acc0 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc1 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc2 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc3 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc4 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc5 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc6 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc7 = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat16m1_t 
_kernel = vle16_v_f16m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f16m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f16m1(_acc1, r0[1], _kernel, vl); - _acc2 = vfmacc_vf_f16m1(_acc2, r0[2], _kernel, vl); - _acc3 = vfmacc_vf_f16m1(_acc3, r0[3], _kernel, vl); - _acc4 = vfmacc_vf_f16m1(_acc4, r0[4], _kernel, vl); - _acc5 = vfmacc_vf_f16m1(_acc5, r0[5], _kernel, vl); - _acc6 = vfmacc_vf_f16m1(_acc6, r0[6], _kernel, vl); - _acc7 = vfmacc_vf_f16m1(_acc7, r0[7], _kernel, vl); - r0 += 8; - } - - vse16_v_f16m1(output0_tm, _acc0, vl); - vse16_v_f16m1(output0_tm + packn * 1, _acc1, vl); - vse16_v_f16m1(output0_tm + packn * 2, _acc2, vl); - vse16_v_f16m1(output0_tm + packn * 3, _acc3, vl); - vse16_v_f16m1(output0_tm + packn * 4, _acc4, vl); - vse16_v_f16m1(output0_tm + packn * 5, _acc5, vl); - vse16_v_f16m1(output0_tm + packn * 6, _acc6, vl); - vse16_v_f16m1(output0_tm + packn * 7, _acc7, vl); - output0_tm += packn * 8; - } - for (; t + 3 < tiles; t += 4) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; - __fp16 *k0 = kernel0_tm + r * in_c * packn; - - vfloat16m1_t _acc0 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc1 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc2 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc3 = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat16m1_t _kernel = vle16_v_f16m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f16m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f16m1(_acc1, r0[1], _kernel, vl); - _acc2 = vfmacc_vf_f16m1(_acc2, r0[2], _kernel, vl); - _acc3 = vfmacc_vf_f16m1(_acc3, r0[3], _kernel, vl); - r0 += 4; - } - - vse16_v_f16m1(output0_tm, _acc0, vl); - vse16_v_f16m1(output0_tm + packn * 1, _acc1, vl); - vse16_v_f16m1(output0_tm + packn * 2, _acc2, vl); - vse16_v_f16m1(output0_tm + packn * 3, _acc3, vl); - output0_tm += packn * 4; - } - for (; t + 1 < tiles; t += 2) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; - __fp16 *k0 = kernel0_tm + r * in_c * packn; - - vfloat16m1_t 
_acc0 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc1 = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat16m1_t _kernel = vle16_v_f16m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f16m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f16m1(_acc1, r0[1], _kernel, vl); - r0 += 2; - } - - vse16_v_f16m1(output0_tm, _acc0, vl); - vse16_v_f16m1(output0_tm + packn * 1, _acc1, vl); - output0_tm += packn * 2; - } - for (; t < tiles; t++) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; - __fp16 *k0 = kernel0_tm + r * in_c * packn; + (__fp16 *)shl_mem_alloc(out_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_m16n8_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); - vfloat16m1_t _acc0 = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat16m1_t _kernel = vle16_v_f16m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f16m1(_acc0, r0[0], _kernel, vl); - r0 += 1; - } - - vse16_v_f16m1(output0_tm, _acc0, vl); - output0_tm += packn * 1; - } - } - } - - csi_mem_free(input_tm2_buf); - /*************************** transform output ****************************/ - // output_tm1_buf: [out_c/8, out_h6, out_w6, 8] + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h6, out_w6, packn] __fp16 *output_tm1_buf = - (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); - - /* - AT = { - { 1 1 1 1 1 1 1 0 }; - { 0 1 -1 2 -2 1/2 -1/2 0 }; - { 0 1 1 4 4 1/4 1/4 0 }; - { 0 1 -1 8 -8 1/8 -1/8 0 }; - { 0 1 1 16 16 1/16 1/16 0 }; - { 0 1 -1 32 -32 1/32 -1/32 1 } - }; - AT = { - { 1 1 1 1 1 32 32 0 }; - { 0 1 -1 2 -2 16 -16 0 }; - { 0 1 1 4 4 8 8 0 }; - { 0 1 -1 8 -8 4 -4 0 }; - { 0 1 1 16 16 2 2 0 }; - { 0 1 -1 32 -32 1 -1 1 } - }; - */ - -#pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / packn; p++) { - __fp16 *bias_tmp = bias_data + p * packn; - -
__fp16 *out0_tm = output_dot_buf + - p * 64 * block_h * block_w * packn; // 输出转换前/dot后 第p个channel - __fp16 *out0 = - output_tm1_buf + p * 6 * block_h * 6 * block_w * packn; // 转换后输出 第p个channel - - __fp16 tmp[6][8][packn]; - - for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { - __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * packn; // 8*8 起始地址 - __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; - __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; - __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; - __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; - __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; - __fp16 *output0_tm_6 = output0_tm_0 + tiles * packn * 6; - __fp16 *output0_tm_7 = output0_tm_0 + tiles * packn * 7; - - __fp16 *output0 = - out0 + (i * block_w * 6 * 6 + j * 6) * packn; // 输出 6*6 的起始地址 - - for (int m = 0; m < 8; m++) { - vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); - vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); - vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl); - vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl); - - vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl); - vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl); - - vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl); - vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl); - - vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl); - vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl); - - vfloat16m1_t _tmp0m = - vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl), - vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); - vfloat16m1_t _tmp2m = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); - vfloat16m1_t _tmp4m = vfmacc_vf_f16m1( - 
vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); - - vfloat16m1_t _tmp1m = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); - vfloat16m1_t _tmp3m = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); - vfloat16m1_t _tmp5m = - vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl), - vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); - - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); - vse16_v_f16m1(tmp[3][m], _tmp3m, vl); - vse16_v_f16m1(tmp[5][m], _tmp5m, vl); - - output0_tm_0 += tiles * packn * 8; - output0_tm_1 += tiles * packn * 8; - output0_tm_2 += tiles * packn * 8; - output0_tm_3 += tiles * packn * 8; - output0_tm_4 += tiles * packn * 8; - output0_tm_5 += tiles * packn * 8; - output0_tm_6 += tiles * packn * 8; - output0_tm_7 += tiles * packn * 8; - } - - vfloat16m1_t _bias = vle16_v_f16m1(bias_tmp, vl); - for (int m = 0; m < 6; m++) { - vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); - vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); - vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); - vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); - - vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); - vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); - - vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); - vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); - - vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); - vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); - - vfloat16m1_t _output00 = vfadd_vv_f16m1( - _bias, - vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), - vfmacc_vf_f16m1(_tmp024b, 
32.f, _tmp024c, vl), vl), - vl); - vfloat16m1_t _output02 = vfadd_vv_f16m1( - _bias, - vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, - _tmp024c, vl), - vl); - vfloat16m1_t _output04 = vfadd_vv_f16m1( - _bias, - vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, - _tmp024c, vl), - vl); - - vfloat16m1_t _output01 = vfadd_vv_f16m1( - _bias, - vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, - _tmp135c, vl), - vl); - vfloat16m1_t _output03 = vfadd_vv_f16m1( - _bias, - vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, - _tmp135c, vl), - vl); - vfloat16m1_t _output05 = vfadd_vv_f16m1( - _bias, - vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), - vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl), - vl); - - vse16_v_f16m1(output0, _output00, vl); - vse16_v_f16m1(output0 + packn * 2, _output02, vl); - vse16_v_f16m1(output0 + packn * 4, _output04, vl); - vse16_v_f16m1(output0 + packn * 1, _output01, vl); - vse16_v_f16m1(output0 + packn * 3, _output03, vl); - vse16_v_f16m1(output0 + packn * 5, _output05, vl); - - output0 += block_w * 6 * packn; - } - } - } - } + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); - csi_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) winograd_crop_output_packnto1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); output_data += output_size; - csi_mem_free(output_tm1_buf); - } - - if (!flag_bias) { - csi_mem_free(bias_data); - bias_data = NULL; + shl_mem_free(output_tm1_buf); } return CSINN_TRUE; } diff --git a/source/thead_rvv/convolution_3x3_fp32.c b/source/thead_rvv/convolution_3x3_fp32.c new file mode 100644 index 00000000..0f6f6016 --- /dev/null +++ b/source/thead_rvv/convolution_3x3_fp32.c @@ -0,0 +1,1320 @@ +/* + * Copyright 
(C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + note: VLEN = 128/256 ... +*************************************************************/ +/* + padding input for winograd input transform , and change memory layout to [n c/packn h w packn] + input layout: [n c h w] + input_padded layout: [n c/packn h w packn] + constrain: input channel % packn = 0 +*/ + +static void winograd_pad_input_pack1ton_fp32(const float *input, float *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, + int pad_top, int pad_left) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + int padded_hw = padded_h * padded_w; + const int in_size = inh * inw; // per-channel size + + float *pad_ptr = input_padded; + float *inp_ptr = (float *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); + + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + inp_ptr = (float *)input + c * in_size; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh;
i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(inp_ptr, in_size * sizeof(float), vl); + inp_ptr++; + vse32_v_f32m1(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +} + +static void winograd_crop_output_packnto1_fp32(const float *output_trans, float *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + float *out_tm_ptr = (float *)output_trans; + float *out_ptr = output; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + out_tm_ptr = (float *)output_trans + c * crop_size; + out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + float *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _tmp = vle32_v_f32m1(crop_ptr, vl); + crop_ptr += packn; + vsse32_v_f32m1(out_ptr, out_size * sizeof(float), _tmp, vl); + out_ptr++; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // after padding - q channel + float *img0_tm = dst + 
q * 36 * tiles; // transform and interleave - q channel + + float tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // pad_buf 6*6 block start addr + const float *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + + vfloat32m1_t _tmp0m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r04, _r03, vl), -4.f, + vfadd_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r03, vl), 4.f, + vfsub_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), -2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), 2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp5m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = 
vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _r0tm0 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat32m1_t _r0tm1 = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm2 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm3 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm4 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm5 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * 36 * tiles; // before output transform / after dot: channel p + float *out0 = dst + p * 4 * blk_h * 4 * blk_w; // output after transform: channel p + + float tmp[4][6][packn]; + + vfloat32m1_t _bias = bias ?
vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 start address + const float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + float *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _tmp3m = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat32m1_t _tmp00 =
vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _out00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _out01 = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _out02 = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _out03 = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f32m1(_bias, _out00, vl); + _out01 = vfadd_vv_f32m1(_bias, _out01, vl); + _out02 = vfadd_vv_f32m1(_bias, _out02, vl); + _out03 = vfadd_vv_f32m1(_bias, _out03, vl); + + vse32_v_f32m1(output0, _out00, vl); + vse32_v_f32m1(output0 + packn * 1, _out01, vl); + vse32_v_f32m1(output0 + packn * 2, _out02, vl); + vse32_v_f32m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_fp32(const float *src, float *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + for (int r = 0; r < area; r++) { + float *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + 
vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + + vsseg8e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + + vsseg4e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + + vsseg2e32_v_f32m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + + vse32_v_f32m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m8n8_fp32(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, int tiles, + int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + const int vl = vsetvl_e32m1(packn); + + int p = 0; + for (; p + pack2n - 1 < out_ch; p += pack2n) { + float *output0_tm = output + p * area * tiles; // 8 channel dot output + float *output1_tm = output0_tm + packn * area * tiles; + + const 
float *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc02 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc03 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc04 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc05 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc06 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc07 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc10 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc11 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc12 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc13 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc14 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc15 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc16 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc17 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f32m1(_acc14, img0[4], 
_kernel1, vl); + _acc15 = vfmacc_vf_f32m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f32m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f32m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + vse32_v_f32m1(output0_tm + packn * 2, _acc02, vl); + vse32_v_f32m1(output0_tm + packn * 3, _acc03, vl); + vse32_v_f32m1(output0_tm + packn * 4, _acc04, vl); + vse32_v_f32m1(output0_tm + packn * 5, _acc05, vl); + vse32_v_f32m1(output0_tm + packn * 6, _acc06, vl); + vse32_v_f32m1(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + + vse32_v_f32m1(output1_tm, _acc10, vl); + vse32_v_f32m1(output1_tm + packn * 1, _acc11, vl); + vse32_v_f32m1(output1_tm + packn * 2, _acc12, vl); + vse32_v_f32m1(output1_tm + packn * 3, _acc13, vl); + vse32_v_f32m1(output1_tm + packn * 4, _acc14, vl); + vse32_v_f32m1(output1_tm + packn * 5, _acc15, vl); + vse32_v_f32m1(output1_tm + packn * 6, _acc16, vl); + vse32_v_f32m1(output1_tm + packn * 7, _acc17, vl); + output1_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc02 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc03 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc10 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc11 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc12 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc13 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = 
vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + vse32_v_f32m1(output0_tm + packn * 2, _acc02, vl); + vse32_v_f32m1(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + + vse32_v_f32m1(output1_tm, _acc10, vl); + vse32_v_f32m1(output1_tm + packn * 1, _acc11, vl); + vse32_v_f32m1(output1_tm + packn * 2, _acc12, vl); + vse32_v_f32m1(output1_tm + packn * 3, _acc13, vl); + output1_tm += packn * 4; + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc10 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc11 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + + vse32_v_f32m1(output1_tm, _acc10, vl); + vse32_v_f32m1(output1_tm + packn * 1, _acc11, vl); + output1_tm += packn * 2; + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc10 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + 
_acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + output0_tm += packn * 1; + + vse32_v_f32m1(output1_tm, _acc10, vl); + output1_tm += packn * 1; + } + } + } + + for (; p + packn - 1 < out_ch; p += packn) { + float *output0_tm = output + p * area * tiles; // 4 channel dot output + const float *kernel0_tm = kernel + p * area * in_ch; // 4 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc02 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc03 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc04 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc05 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc06 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc07 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + vse32_v_f32m1(output0_tm + packn * 2, _acc02, vl); + vse32_v_f32m1(output0_tm + packn * 3, _acc03, vl); + vse32_v_f32m1(output0_tm + packn * 4, _acc04, vl); + vse32_v_f32m1(output0_tm + packn * 5, _acc05, vl); + 
vse32_v_f32m1(output0_tm + packn * 6, _acc06, vl); + vse32_v_f32m1(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc02 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc03 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + vse32_v_f32m1(output0_tm + packn * 2, _acc02, vl); + vse32_v_f32m1(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + output0_tm += packn * 1; + } + } + } +} + +static inline void 
wg_b6f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // feature map after padding - q channel + float *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + float tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *r0 = + img0 + (i * w * 6 + j * 6) * packn; // feature map after padding 8*8 start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; // input_tm1 8*8 block start addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); + + vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f32m1(_r04, _r02, vl), vl); + vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f32m1(_r03, _r05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, 
vl); + vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat32m1_t _tmp3m = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _tmp4m = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = + vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[7][m], _tmp7m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vse32_v_f32m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + float *r0_tm6 = r0_tm5 + tiles * packn; + float *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, + 
vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); + vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( + _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat32m1_t _r0tm5 = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm7, _r0tm7, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + vse32_v_f32m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + 
{ 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + float tmp[6][8][packn]; + + vfloat32m1_t _bias = bias ? vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + const float *output0_tm_6 = output0_tm_0 + tiles * packn * 6; + const float *output0_tm_7 = output0_tm_0 + tiles * packn * 7; + + float *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(output0_tm_6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(output0_tm_7, vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_r05, _r06, vl); + vfloat32m1_t 
_tmp135c = vfsub_vv_f32m1(_r05, _r06, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _tmp5m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * packn * 8; + output0_tm_1 += tiles * packn * 8; + output0_tm_2 += tiles * packn * 8; + output0_tm_3 += tiles * packn * 8; + output0_tm_4 += tiles * packn * 8; + output0_tm_5 += tiles * packn * 8; + output0_tm_6 += tiles * packn * 8; + output0_tm_7 += tiles * packn * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, 
_tmp04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); + + vfloat32m1_t _output00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _output02 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _output04 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _output01 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _output03 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _output05 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f32m1(_bias, _output00, vl); + _output01 = vfadd_vv_f32m1(_bias, _output01, vl); + _output02 = vfadd_vv_f32m1(_bias, _output02, vl); + _output03 = vfadd_vv_f32m1(_bias, _output03, vl); + _output04 = vfadd_vv_f32m1(_bias, _output04, vl); + _output05 = vfadd_vv_f32m1(_bias, _output05, vl); + + vse32_v_f32m1(output0, _output00, vl); + vse32_v_f32m1(output0 + packn * 2, _output02, vl); + vse32_v_f32m1(output0 + packn * 4, _output04, vl); + vse32_v_f32m1(output0 + packn * 1, _output01, vl); + vse32_v_f32m1(output0 + packn * 3, _output03, vl); + vse32_v_f32m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor 
*src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + + // kernel transform matrix: G + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/pack2n, 6*6, I, pack2n] + float *kernel_tm_packn = (float *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + 
for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + // [O/packn, 6*6, I, packn] + for (; oc + packn - 1 < outch; oc += packn) { + float *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_rvv_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + 
float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_pack1ton_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 36 * tiles * 4 * sizeof(float)); + wg_b4f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile8_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 36, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(out_c / 4 * 36 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_m8n8_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 4 * 4 * 4 * sizeof(float)); + wg_b4f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packnto1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return 
CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + // kernel transform matrix: G + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const float ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * 
ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + float *kernel_tm_packn = (float *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + + for (; oc + packn - 1 < outch; oc += packn) { + float *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_rvv_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = 
params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_pack1ton_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 64 * tiles * 4 * sizeof(float)); + wg_b6f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile8_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 64, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(out_c / 4 * 64 * tiles * 4 * 
sizeof(float)); + wg_bxf3s1_batch_gemm_m8n8_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 6 * 6 * 4 * sizeof(float)); + wg_b6f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packnto1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_3x3_int8.c b/source/thead_rvv/convolution_3x3_int8.c new file mode 100644 index 00000000..438c29ca --- /dev/null +++ b/source/thead_rvv/convolution_3x3_int8.c @@ -0,0 +1,682 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* CSI-NN2 version 2.0.x */ +#include "shl_c908.h" +/************************************************************* + note: VLEN = 128 +*************************************************************/ + +#ifdef RVV_1_0_0 +/****************************************************************************************** + * padding input for winograd input transform , and change memory layout + * input layout: [n c h w] + * input_padded layout: [n, c/8, h, w, 8] + * constrain: input channel % 8 = 0 + ******************************************************************************************/ +static void winograd_pad_input_pack1ton_int8(const int8_t *input, int8_t *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, + int pad_top, int pad_left, int8_t pad_value) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int padded_hw = padded_h * padded_w; + const int in_size = inh * inw; // per-channel size + int8_t *pad_ptr = input_padded; + int8_t *inp_ptr = (int8_t *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + vint8mf2_t _zero = vmv_v_x_i8mf2(pad_value, vl); + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + inp_ptr = (int8_t *)input + c * in_size; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(inp_ptr, in_size * sizeof(int8_t), vl); + inp_ptr++; + vse8_v_i8mf2(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < 
pad_down * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +} + +/****************************************************************************************** + * cut winograd output transform for output, and change memory layout + * winograd output transform layout: [n, c/8, h, w, 8] + * output layout: [n, c, h, w] + * constrain: output channel % 8 = 0 + ******************************************************************************************/ +static void winograd_crop_output_packnto1_int8(const int8_t *output_trans, int8_t *output, + int out_c, int out_h, int out_w, int wino_h, + int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + int8_t *out_tm_ptr = (int8_t *)output_trans; + int8_t *out_ptr = output; + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + out_tm_ptr = (int8_t *)output_trans + c * crop_size; + out_ptr = output + c * out_size; + for (int h = 0; h < out_h; h++) { + int8_t *crop_ptr = out_tm_ptr + h * wino_w * vl; + for (int w = 0; w < out_w; w++) { + vint8mf2_t _tmp = vle8_v_i8mf2(crop_ptr, vl); + crop_ptr += vl; + vsse8_v_i8mf2(out_ptr, out_size * sizeof(int8_t), _tmp, vl); + out_ptr++; + } + } + } +} + +/****************************************************************************************** + * winograd int8 postprocess int32 --> int8 + * _src: 8 channels int32 macc + * multiplier: multi for scale, support channel quantization + * shift: shift for scale, support channel quantization + * out_zp: output zero_point + ******************************************************************************************/ +static vint8mf2_t requantize_m2_s(vint32m2_t _src, int32_t *multiplier, int32_t *shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mult = vle32_v_i32m2(multiplier, vl); + vint32m2_t _shift = vle32_v_i32m2(shift, vl); + vint32m2_t _mulh = 
vmulh_vv_i32m2(_src, _mult, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +} + +static inline void wg_b4f3s1_trans_input_packn_int8(const int8_t *src, int16_t *dst, int ch, int h, + int w, int blk_h, int blk_w, int8_t input_zp) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + [0] = 4 * r00 - 5 * r02 + r04 + [1] = -4 * (r01 + r02) + r04 + r03 + [2] = 4 * (r01 - r02) + r04 - r03 + [3] = -2 * (r01 - r03) + r04 - r02 + [4] = 2 * (r01 - r03) + r04 - r02 + [5] = 4 * r01 - 5 * r03 + r05 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const int8_t *img0 = src + q * h * w; // feature map after padding - q channel + int16_t *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + int16_t tmp[6][6][packn]; + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // feature map after padding 6*6 start addr + const int8_t *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + int16_t *r0_tm = img0_tm + (i * blk_w + j) * packn; + for (int m = 0; m < 6; m++) { + vint8mf2_t _t00 = vle8_v_i8mf2(r0, vl); + vint8mf2_t _t01 = vle8_v_i8mf2(r0 + packn * 1, vl); + vint8mf2_t _t02 = vle8_v_i8mf2(r0 + packn * 2, vl); + vint8mf2_t _t03 = vle8_v_i8mf2(r0 + packn * 3, vl); + vint8mf2_t _t04 = vle8_v_i8mf2(r0 + packn * 4, vl); + vint8mf2_t _t05 = vle8_v_i8mf2(r0 + packn * 5, vl); + // (q - z) + vint16m1_t _r00 = vwsub_vx_i16m1(_t00, input_zp, vl); + vint16m1_t _r01 = vwsub_vx_i16m1(_t01, input_zp, vl); + vint16m1_t _r02 = vwsub_vx_i16m1(_t02, 
input_zp, vl); + vint16m1_t _r03 = vwsub_vx_i16m1(_t03, input_zp, vl); + vint16m1_t _r04 = vwsub_vx_i16m1(_t04, input_zp, vl); + vint16m1_t _r05 = vwsub_vx_i16m1(_t05, input_zp, vl); + vint16m1_t _tmp0m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r00, 4, vl), vmul_vx_i16m1(_r02, -5, vl), vl), + _r04, vl); + vint16m1_t _tmp1m = vmacc_vx_i16m1(vadd_vv_i16m1(_r04, _r03, vl), -4, + vadd_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp2m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r03, vl), 4, + vsub_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp3m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), -2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp4m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), 2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp5m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r01, 4, vl), vmul_vx_i16m1(_r03, -5, vl), vl), + _r05, vl); + // vint16m1_t _tmp0m = vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r00, 4, vl), + // vwmul_vx_i16m1(_r02, -5, vl), vl), _r04, vl); vint16m1_t _tmp1m = + // vmacc_vx_i16m1(vwadd_vv_i16m1(_r04, _r03, vl), -4, vwadd_vv_i16m1(_r01, _r02, + // vl), vl); vint16m1_t _tmp2m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r03, vl), + // 4, vwsub_vv_i16m1(_r01, _r02, vl), vl); vint16m1_t _tmp3m = + // vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), -2, vwsub_vv_i16m1(_r01, _r03, + // vl), vl); vint16m1_t _tmp4m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), + // 2, vwsub_vv_i16m1(_r01, _r03, vl), vl); vint16m1_t _tmp5m = + // vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r01, 4, vl), + // vwmul_vx_i16m1(_r03, -5, vl), vl), _r05, vl); + vse16_v_i16m1(tmp[0][m], _tmp0m, vl); + vse16_v_i16m1(tmp[1][m], _tmp1m, vl); + vse16_v_i16m1(tmp[2][m], _tmp2m, vl); + vse16_v_i16m1(tmp[3][m], _tmp3m, vl); + vse16_v_i16m1(tmp[4][m], _tmp4m, vl); + vse16_v_i16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + for (int m = 0; m < 6; m++) { + int16_t *r0_tm0 = r0_tm; + int16_t *r0_tm1 = r0_tm0 + tiles * packn; + int16_t *r0_tm2 = r0_tm1 + 
tiles * packn; + int16_t *r0_tm3 = r0_tm2 + tiles * packn; + int16_t *r0_tm4 = r0_tm3 + tiles * packn; + int16_t *r0_tm5 = r0_tm4 + tiles * packn; + vint16m1_t _tmp00 = vle16_v_i16m1(tmp[m][0], vl); + vint16m1_t _tmp01 = vle16_v_i16m1(tmp[m][1], vl); + vint16m1_t _tmp02 = vle16_v_i16m1(tmp[m][2], vl); + vint16m1_t _tmp03 = vle16_v_i16m1(tmp[m][3], vl); + vint16m1_t _tmp04 = vle16_v_i16m1(tmp[m][4], vl); + vint16m1_t _tmp05 = vle16_v_i16m1(tmp[m][5], vl); + vint16m1_t _r0tm0 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp04, 4, _tmp00, vl), -5, _tmp02, vl); + vint16m1_t _r0tm1 = vmacc_vx_i16m1(vadd_vv_i16m1(_tmp04, _tmp03, vl), -4, + vadd_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm2 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp03, vl), 4, + vsub_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm3 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), -2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm4 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), 2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm5 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp05, 4, _tmp01, vl), -5, _tmp03, vl); + vse16_v_i16m1(r0_tm0, _r0tm0, vl); + vse16_v_i16m1(r0_tm1, _r0tm1, vl); + vse16_v_i16m1(r0_tm2, _r0tm2, vl); + vse16_v_i16m1(r0_tm3, _r0tm3, vl); + vse16_v_i16m1(r0_tm4, _r0tm4, vl); + vse16_v_i16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_int8(const int32_t *src, const int32_t *bias, + int8_t *dst, int ch, int blk_h, int blk_w, + int32_t *multi, int32_t *shift, int32_t out_zp) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 4 } // 和 G 变换矩阵一起将累加和扩大了 24 * 24 倍 + }; + [0] = r00 + (r01 + r02) + (r03 + r04) + [1] = (r01 - r02) + (r03 - r04) * 2 + [2] = (r01 + r02) + (r03 + r04) * 4 + [3] = 4 * r05 + (r01 - r02) + (r03 - r04) * 8 + */ 
+ const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const int32_t *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + int8_t *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + int32_t tmp[4][6][packn]; + + vint32m2_t _bias = bias ? vle32_v_i32m2(bias + p, vl) : vmv_v_x_i32m2(0, vl); + _bias = vmul_vx_i32m2(_bias, 576, vl); + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const int32_t *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const int32_t *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const int32_t *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const int32_t *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const int32_t *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const int32_t *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + int8_t *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + for (int m = 0; m < 6; m++) { + vint32m2_t _r00 = vle32_v_i32m2(output0_tm_0, vl); + vint32m2_t _r01 = vle32_v_i32m2(output0_tm_1, vl); + vint32m2_t _r02 = vle32_v_i32m2(output0_tm_2, vl); + vint32m2_t _r03 = vle32_v_i32m2(output0_tm_3, vl); + vint32m2_t _r04 = vle32_v_i32m2(output0_tm_4, vl); + vint32m2_t _r05 = vle32_v_i32m2(output0_tm_5, vl); + vint32m2_t _tmp02a = vadd_vv_i32m2(_r01, _r02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_r01, _r02, vl); + vint32m2_t _tmp02b = vadd_vv_i32m2(_r03, _r04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_r03, _r04, vl); + vint32m2_t _tmp0m = + vadd_vv_i32m2(vadd_vv_i32m2(_r00, _tmp02a, vl), _tmp02b, vl); + vint32m2_t _tmp1m = vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl); + vint32m2_t _tmp2m = vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl); + vint32m2_t _tmp3m = + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _r05, vl), 8, _tmp13b, vl); + vse32_v_i32m2(tmp[0][m], _tmp0m, vl); + vse32_v_i32m2(tmp[1][m], _tmp1m, vl); + 
vse32_v_i32m2(tmp[2][m], _tmp2m, vl); + vse32_v_i32m2(tmp[3][m], _tmp3m, vl); + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + for (int m = 0; m < 4; m++) { + vint32m2_t _tmp00 = vle32_v_i32m2(tmp[m][0], vl); + vint32m2_t _tmp01 = vle32_v_i32m2(tmp[m][1], vl); + vint32m2_t _tmp02 = vle32_v_i32m2(tmp[m][2], vl); + vint32m2_t _tmp03 = vle32_v_i32m2(tmp[m][3], vl); + vint32m2_t _tmp04 = vle32_v_i32m2(tmp[m][4], vl); + vint32m2_t _tmp05 = vle32_v_i32m2(tmp[m][5], vl); + vint32m2_t _tmp02a = vadd_vv_i32m2(_tmp01, _tmp02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_tmp01, _tmp02, vl); + vint32m2_t _tmp02b = vadd_vv_i32m2(_tmp03, _tmp04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_tmp03, _tmp04, vl); + vint32m2_t _out00 = vadd_vv_i32m2( + _bias, vadd_vv_i32m2(vadd_vv_i32m2(_tmp00, _tmp02a, vl), _tmp02b, vl), vl); + vint32m2_t _out01 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl), vl); + vint32m2_t _out02 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl), vl); + vint32m2_t _out03 = vadd_vv_i32m2( + _bias, + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _tmp05, vl), 8, _tmp13b, vl), vl); + vint8mf2_t _res0 = requantize_m2_s(_out00, multi + p, shift + p, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_out01, multi + p, shift + p, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_out02, multi + p, shift + p, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_out03, multi + p, shift + p, out_zp, vl); + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_int8(const int16_t *src, int16_t *dst, int ch, + int tiles, int area) +{ + const int packn = 
csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + int16_t *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl); + vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vse16_v_i16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + 
img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m8n8_int8(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, int tiles, + int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + + for (int p = 0; p + packn - 1 < out_ch; p += packn) { + int32_t *output0_tm = output + p * area * tiles; // 8 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + vint32m2_t _acc00 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc01 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc02 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc03 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc04 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc05 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc06 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc07 = vmv_v_x_i32m2(0, vl); + + for (int c = 0; c < in_ch; c++) { + vint16m1_t _kernel0 = vle16_v_i16m1(k0, vl); + k0 += packn; + _acc00 = vwmacc_vx_i32m2(_acc00, img0[0], _kernel0, vl); + _acc01 = vwmacc_vx_i32m2(_acc01, img0[1], _kernel0, vl); + _acc02 = vwmacc_vx_i32m2(_acc02, img0[2], _kernel0, vl); + _acc03 = vwmacc_vx_i32m2(_acc03, img0[3], _kernel0, vl); + _acc04 = vwmacc_vx_i32m2(_acc04, img0[4], _kernel0, vl); + _acc05 = vwmacc_vx_i32m2(_acc05, img0[5], _kernel0, vl); + _acc06 = vwmacc_vx_i32m2(_acc06, img0[6], _kernel0, vl); + _acc07 = vwmacc_vx_i32m2(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_i32m2(output0_tm, _acc00, vl); + vse32_v_i32m2(output0_tm + packn * 1, _acc01, vl); + vse32_v_i32m2(output0_tm + packn * 2, _acc02, vl); + vse32_v_i32m2(output0_tm + packn * 3, _acc03, vl); + vse32_v_i32m2(output0_tm + packn * 4, _acc04, vl); + vse32_v_i32m2(output0_tm + packn * 5, _acc05, vl); + vse32_v_i32m2(output0_tm + packn * 
6, _acc06, vl); + vse32_v_i32m2(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + vint32m2_t _acc00 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc01 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc02 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc03 = vmv_v_x_i32m2(0, vl); + + for (int c = 0; c < in_ch; c++) { + vint16m1_t _kernel0 = vle16_v_i16m1(k0, vl); + k0 += packn; + _acc00 = vwmacc_vx_i32m2(_acc00, img0[0], _kernel0, vl); + _acc01 = vwmacc_vx_i32m2(_acc01, img0[1], _kernel0, vl); + _acc02 = vwmacc_vx_i32m2(_acc02, img0[2], _kernel0, vl); + _acc03 = vwmacc_vx_i32m2(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_i32m2(output0_tm, _acc00, vl); + vse32_v_i32m2(output0_tm + packn * 1, _acc01, vl); + vse32_v_i32m2(output0_tm + packn * 2, _acc02, vl); + vse32_v_i32m2(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + vint32m2_t _acc00 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc01 = vmv_v_x_i32m2(0, vl); + + for (int c = 0; c < in_ch; c++) { + vint16m1_t _kernel0 = vle16_v_i16m1(k0, vl); + k0 += packn; + _acc00 = vwmacc_vx_i32m2(_acc00, img0[0], _kernel0, vl); + _acc01 = vwmacc_vx_i32m2(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_i32m2(output0_tm, _acc00, vl); + vse32_v_i32m2(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + vint32m2_t _acc00 = vmv_v_x_i32m2(0, vl); + + for (int c = 0; c < in_ch; c++) { + vint16m1_t _kernel0 = vle16_v_i16m1(k0, vl); + k0 += packn; + _acc00 = vwmacc_vx_i32m2(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_i32m2(output0_tm, _acc00, vl); + output0_tm += packn * 1; + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, 
I, 3, 3] + * kernel layout after : [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + int8_t *kernel_data = (int8_t *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + int16_t *kernel_tm = (int16_t *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(int16_t)); + // kernel transform matrix: G + const int16_t ktm[6][3] = {{6, 0, 0}, {-4, -4, -4}, {-4, 4, -4}, + {1, 2, 4}, {1, -2, 4}, {0, 0, 6}}; + csinn_tensor_copy(dst_kernel, src_kernel); // tensor->dtype ?? + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const int8_t *kernel0 = kernel_data + p * inch * 9 + q * 9; + int16_t *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + // transform kernel + const int8_t *k0 = kernel0; + const int8_t *k1 = kernel0 + 3; + const int8_t *k2 = kernel0 + 6; + // h : first compute the transport matrix tmp = (g * GT)T + int16_t tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + // U + for (int j = 0; j < 6; j++) { + int16_t *tmpp = &tmp[j][0]; + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/packn, 6*6, I, packn] + int16_t *kernel_tm_packn = + (int16_t *)shl_mem_alloc(outch / 8 * 36 * inch * 8 * sizeof(int16_t)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(int16_t); + for (int oc = 0; oc + packn - 1 < outch; oc += packn) 
{ + int16_t *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + int16_t *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + int16_t *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 8 = 0 + * input channel % 8 = 0 + ******************************************************************************************/ +int shl_rvv_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int16_t *kernel_data = (int16_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int tiles = block_h * block_w; + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + // pad input + winograd_pad_input_pack1ton_int8(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, 
pad_top, pad_left, input->qinfo->zero_point); + input_data += input_size; + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 36, tiles, 8] + int16_t *input_tm1_buf = + (int16_t *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(int16_t)); + wg_b4f3s1_trans_input_packn_int8(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w, input->qinfo->zero_point); + shl_mem_free(input_padd_buf); + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + int16_t *input_tm2_buf = (int16_t *)shl_mem_alloc(36 * tiles * in_c * sizeof(int16_t)); + wg_bxf3s1_reorder_input_tile8_int8(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 36, tiles, 8] + const int vlen = csrr_vlenb() * 8; + int32_t *output_dot_buf = + (int32_t *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(int32_t)); + + wg_bxf3s1_batch_gemm_m8n8_int8(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + + shl_mem_free(input_tm2_buf); + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/8, out_h4, out_w4, 8] + int8_t *output_tm1_buf = + (int8_t *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + wg_b4f3s1_trans_output_packn_int8(output_dot_buf,
bias_data, output_tm1_buf, out_c, block_h, + block_w, multiplier, shift, output->qinfo->zero_point); + shl_mem_free(output_dot_buf); + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packnto1_int8(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + } + return CSINN_TRUE; +} + +#elif define RVV_0_7_1 +// TODO: winograd int8 opt for vector 0.7.1 + +#endif \ No newline at end of file diff --git a/source/thead_rvv/convolution_gemm_fp16.c b/source/thead_rvv/convolution_gemm_fp16.c index 954d136a..62f1c0ad 100644 --- a/source/thead_rvv/convolution_gemm_fp16.c +++ b/source/thead_rvv/convolution_gemm_fp16.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/* - pack kernel_data inplace, means the origin kernel_data be destoried. - The reason to do this is that the packaging process must not consume more memory. -*/ -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params) +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory.
+ **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { __fp16 *kernel_data = (__fp16 *)kernel->data; int group = params->group; @@ -33,17 +33,17 @@ void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kerne int m = kernel->dim[0] / group; // m = out_ch / group int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(group * m * k * sizeof(__fp16)); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv_im2col_gemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -73,8 +73,8 @@ int csi_nn_rvv_conv_im2col_gemm_fp16(struct csi_tensor *input, struct csi_tensor int32_t k = channel_col; int32_t n = out_height * out_width; - __fp16 *im2col_data = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); - __fp16 *pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -107,14 +107,14 @@ 
int csi_nn_rvv_conv_im2col_gemm_fp16(struct csi_tensor *input, struct csi_tensor __fp16 *pc = output_data; // pack - csi_nn_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); + shl_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x16_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/thead_rvv/convolution_gemm_fp16_pack1ton.c b/source/thead_rvv/convolution_gemm_fp16_pack1ton.c new file mode 100644 index 00000000..afc549c6 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp16_pack1ton.c @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn can != 0 + * layout: [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + * [out_c/packna, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_pack1ton_per_group_fp16(__fp16 *src, __fp16 *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m2_t _tmp = vlse16_v_f16m2(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m2(g0, _tmp, vl); + g0 += vl; + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vfloat16m2_t _tmp = vlse16_v_f16m2(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e16m1(packn); + // [out_c/packn, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += 
vl; + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(out_c * in_c * maxk * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + __fp16 *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + __fp16 *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_pack1ton_per_group_fp16(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t
in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_pack1ton_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const __fp16 *img0 = input_pad_buf; + __fp16 *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e16m1(loop_c); + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * vl; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += vl; + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 
*reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + shl_rvv_reorder_input_z12_pack1ton_fp16(im2col_buf, reorder_buf, in_cp, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_fp16_packn.c b/source/thead_rvv/convolution_gemm_fp16_packn.c new file mode 100644 index 00000000..e3c98f2e --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp16_packn.c @@ -0,0 +1,187 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_fp16(__fp16 *src, __fp16 *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m2_t _tmp = vlse16_v_f16m2(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e16m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory.
+ **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(out_c * in_c * maxk * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + __fp16 *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + __fp16 *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packn_per_group_fp16(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + 
(in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const __fp16 *img0 = input_pad_buf + c * padded_in_hw; + __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * packn; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_fp16_packnto1.c b/source/thead_rvv/convolution_gemm_fp16_packnto1.c new file mode 100644 index 00000000..19506858 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp16_packnto1.c @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn != 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + * [out_c/tail, in_c/packnb, maxk, packnb, tail] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packnto1_per_group_fp16(__fp16 *src, __fp16 *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m2_t _tmp = vlse16_v_f16m2(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e16m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } + // [out_c/tail, in_c/packnb, maxk, packnb, tail] + if (oc < out_c) { + vl = vsetvl_e16m1(out_c - oc); + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + 
for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(out_c * in_c * maxk * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + __fp16 *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + __fp16 *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packnto1_per_group_fp16(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h =
output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const __fp16 *img0 = input_pad_buf + c * padded_in_hw; + __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * packn; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + 
shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); + + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm.c b/source/thead_rvv/convolution_gemm_fp32.c similarity index 73% rename from source/thead_rvv/convolution_gemm.c rename to source/thead_rvv/convolution_gemm_fp32.c index 52f0ef45..b84e66a3 100644 --- a/source/thead_rvv/convolution_gemm.c +++ b/source/thead_rvv/convolution_gemm_fp32.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/* - pack kernel_data inplace, means the origin kernel_data be destoried. - The reason to do this is that the packaging process must not consume more memory. -*/ -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(struct csi_tensor *kernel, - struct conv2d_params *params) +/************************************************************************************* + * reorder kernel_data in place, which means the original kernel_data is destroyed. + * The reason to do this is that the packaging process must not consume more memory.
+ **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { float *kernel_data = (float *)kernel->data; int group = params->group; @@ -33,17 +33,17 @@ void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(struct csi_tensor *kerne int m = kernel->dim[0] / group; // m = out_ch / group int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - float *pa_reorder = (float *)csi_mem_alloc(group * m * k * sizeof(float)); + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_rvv_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv_im2col_gemm_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -73,8 +73,8 @@ int csi_nn_rvv_conv_im2col_gemm_fp32(struct csi_tensor *input, struct csi_tensor int32_t k = channel_col; int32_t n = out_height * out_width; - float *im2col_data = (float *)csi_mem_alloc(k * n * sizeof(float)); - float *pb_reorder = (float *)csi_mem_alloc(k * n * sizeof(float)); + float *im2col_data = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -107,14 +107,14 @@ int 
csi_nn_rvv_conv_im2col_gemm_fp32(struct csi_tensor *input, struct csi_tensor float *pc = output_data; // pack - csi_nn_rvv_reorder_input_z8_fp32(im2col_data, pb, k, n, n); + shl_rvv_reorder_input_z8_fp32(im2col_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x8_fp32(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x8_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); return CSINN_TRUE; } diff --git a/source/thead_rvv/convolution_gemm_fp32_pack1ton.c b/source/thead_rvv/convolution_gemm_fp32_pack1ton.c new file mode 100644 index 00000000..8c433500 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp32_pack1ton.c @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(float) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn can != 0 + * layout: [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + * [out_c/packna, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_pack1ton_per_group_fp32(float *src, float *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int vl = vsetvl_e32m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m2_t _tmp = + vlse32_v_f32m2(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m2(g0, _tmp, vl); + g0 += vl; + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vfloat32m2_t _tmp = + vlse32_v_f32m2(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e32m1(packn); + // [out_c/packn, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } 
+ } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + float *pa_reorder = (float *)shl_mem_alloc(out_c * in_c * maxk * sizeof(float)); + for (int g = 0; g < group; g++) { + float *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + float *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_pack1ton_per_group_fp32(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t 
out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); + shl_rvv_pad_input_pack1ton_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] + float *im2col_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const float *img0 = input_pad_buf; + float *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e32m1(loop_c); + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const float *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); + img1 += stride_w * vl; + vse32_v_f32m1(dst_ptr, _tmp, vl); + dst_ptr += vl; + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + float *reorder_buf = (float 
*)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + shl_rvv_reorder_input_z12_pack1ton_fp32(im2col_buf, reorder_buf, in_cp, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + float *ker_ptr = kernel_data + g * m * maxk * in_cp; + float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_fp32_packn.c b/source/thead_rvv/convolution_gemm_fp32_packn.c new file mode 100644 index 00000000..40847539 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp32_packn.c @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(float) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_fp32(float *src, float *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int vl = vsetvl_e32m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m2_t _tmp = + vlse32_v_f32m2(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e32m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + float *pa_reorder = (float *)shl_mem_alloc(out_c * in_c * maxk * sizeof(float)); + for (int g = 0; g < group; g++) { + float *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + float *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packn_per_group_fp32(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + 
params->pad_left + params->pad_right); + float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); + shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(float)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const float *img0 = input_pad_buf + c * padded_in_hw; + float *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const float *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); + img1 += stride_w * packn; + vse32_v_f32m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + float *ker_ptr = kernel_data + g * m * maxk * in_cp; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_fp32_packnto1.c b/source/thead_rvv/convolution_gemm_fp32_packnto1.c new file mode 100644 index 00000000..8e933a0c --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp32_packnto1.c @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(float) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn != 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + * [out_c/tail, in_c/packnb, maxk, packnb, tail] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packnto1_per_group_fp32(float *src, float *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int vl = vsetvl_e32m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m2_t _tmp = + vlse32_v_f32m2(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e32m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } + // [out_c/tail, in_c/packnb, maxk, packnb, tail] + if (oc < out_c) { + vl = vsetvl_e32m1(out_c - oc); + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 
0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + float *pa_reorder = (float *)shl_mem_alloc(out_c * in_c * maxk * sizeof(float)); + for (int g = 0; g < group; g++) { + float *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + float *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packnto1_per_group_fp32(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w 
= output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); + shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(float)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const float *img0 = input_pad_buf + c * padded_in_hw; + float *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const float *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); + img1 += stride_w * packn; + vse32_v_f32m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + 
shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + float *ker_ptr = kernel_data + g * m * maxk * in_cp; + float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); + + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_int4.c b/source/thead_rvv/convolution_gemm_int4.c index be2f8565..2a4746d5 100644 --- a/source/thead_rvv/convolution_gemm_int4.c +++ b/source/thead_rvv/convolution_gemm_int4.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#ifdef __riscv_xtheadv -#include "csi_thead_rvv.h" - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int4(struct csi_tensor *kernel, - struct conv2d_params *params) +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv_im2col_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { int8_t *kernel_data = (int8_t *)kernel->data; int group = params->group; @@ -33,19 +32,19 @@ void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int4(struct csi_tensor *kerne int k_2 = (((k - 1) & -2) + 2) >> 1; int k4 = ((k_2 - 1) & -4) + 4; // align of 4 for int8 - params->conv_extra.kernel_tm->data = (int8_t *)csi_mem_alloc(group * n * k4 * sizeof(int8_t)); + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * n * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_int8(kernel_data + g * n * k_2, pa_reorder + g * n * k4, n, - k_2, k_2); + shl_rvv_reorder_kernel_n8_int8(kernel_data + g * n * k_2, pa_reorder + g 
* n * k4, n, k_2, + k_2); } // FIXME: free params->conv_extra.kernel_tm->data } -int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv_im2col_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -76,11 +75,11 @@ int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor int32_t n = out_ch / group; int32_t k4 = ((k_2 - 1) & -4) + 4; - int32_t *multiplier = (int32_t *)csi_mem_alloc(n * sizeof(int32_t)); - int32_t *shift = (int32_t *)csi_mem_alloc(n * sizeof(int32_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(n * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(n * sizeof(int32_t)); - int8_t *im2col_data = (int8_t *)csi_mem_alloc(m * k_2 * sizeof(int8_t)); - int8_t *pa_reorder = (int8_t *)csi_mem_alloc(m * k4 * sizeof(int8_t)); + int8_t *im2col_data = (int8_t *)shl_mem_alloc(m * k_2 * sizeof(int8_t)); + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(m * k4 * sizeof(int8_t)); int8_t *im2col_shadow = NULL; int8_t pad_value = 0; @@ -91,10 +90,9 @@ int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor // im2col if (in_ch & 1) { int8_t *buffer_int4_to_int8 = - (int8_t *)csi_mem_alloc(in_height * in_width * in_ch * sizeof(int8_t)); - csi_nn_rvv_int4_to_int8(input_data, buffer_int4_to_int8, - in_height * in_width * in_ch); - int8_t *buffer_im2col = (int8_t *)csi_mem_alloc(m * channel_col * sizeof(int8_t)); + (int8_t *)shl_mem_alloc(in_height * in_width * in_ch * sizeof(int8_t)); + shl_rvv_int4_to_int8(input_data, buffer_int4_to_int8, in_height * in_width * in_ch); + int8_t *buffer_im2col = (int8_t *)shl_mem_alloc(m * channel_col * sizeof(int8_t)); im2col_shadow = 
buffer_im2col; pad_value = input->qinfo->zero_point & 0x0f; @@ -121,11 +119,11 @@ int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor } } for (int k = 0; k < m; k++) { - csi_nn_rvv_int8_to_int4(buffer_im2col + k * channel_col, im2col_data + k * k_2, - channel_col); + shl_rvv_int8_to_int4(buffer_im2col + k * channel_col, im2col_data + k * k_2, + channel_col); } - csi_mem_free(buffer_int4_to_int8); - csi_mem_free(buffer_im2col); + shl_mem_free(buffer_int4_to_int8); + shl_mem_free(buffer_im2col); } else { im2col_shadow = im2col_data; @@ -171,19 +169,19 @@ int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor } // pack - csi_nn_rvv_reorder_input_n8_int4(im2col_data, pa, m, k_2, k_2); + shl_rvv_reorder_input_n8_int4(im2col_data, pa, m, k_2, k_2); // GEMM - csi_nn_rvv_gemm_8x8_int4(pc, pa, pb, m, k4, n, n / 2, bias_data + g * n, - output->qinfo->zero_point, multiplier, shift); + shl_rvv_gemm_8x8_int4(pc, pa, pb, m, k4, n, n / 2, bias_data + g * n, + output->qinfo->zero_point, multiplier, shift); input_data += in_ch / group * in_height * in_width / 2; output_data += m * n / 2; } } - csi_mem_free(pa_reorder); - csi_mem_free(im2col_data); - csi_mem_free(multiplier); - csi_mem_free(shift); + shl_mem_free(pa_reorder); + shl_mem_free(im2col_data); + shl_mem_free(multiplier); + shl_mem_free(shift); return CSINN_TRUE; } #endif diff --git a/source/thead_rvv/convolution_gemm_int4_packn.c b/source/thead_rvv/convolution_gemm_int4_packn.c new file mode 100644 index 00000000..34bba877 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_int4_packn.c @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_int4(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ +} + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ +} + +int shl_rvv_conv_im2col_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_gemm_int8.c b/source/thead_rvv/convolution_gemm_int8.c index c2ddae4a..dde50d48 100644 --- a/source/thead_rvv/convolution_gemm_int8.c +++ b/source/thead_rvv/convolution_gemm_int8.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#ifdef __riscv_xtheadv -#include "csi_thead_rvv.h" - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(struct csi_tensor *kernel, - struct conv2d_params *params) +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { int8_t *kernel_data = (int8_t *)kernel->data; int group = params->group; @@ -31,21 +30,20 @@ void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(struct csi_tensor *kerne int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; int k4 = (k % 4 != 0) ? ((k / 4 + 1) * 4) : k; - params->conv_extra.kernel_tm->data = (int8_t *)csi_mem_alloc(group * m * k4 * sizeof(int8_t)); + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, - k); + shl_rvv_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, k); } // FIXME: free params->conv_extra.kernel_tm->data // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - // csi_mem_free(pa_reorder); + // shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv_im2col_gemm_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -76,11 +74,11 @@ int csi_nn_rvv_conv_im2col_gemm_int8(struct csi_tensor *input, struct csi_tensor int32_t n = out_height * out_width; int32_t k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; - int8_t *im2col_data = (int8_t *)csi_mem_alloc(k * n * sizeof(int8_t)); - int8_t *pb_reorder = (int8_t *)csi_mem_alloc(k4 * n * sizeof(int8_t)); + int8_t *im2col_data = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); - int32_t *multiplier = (int32_t *)csi_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)csi_mem_alloc(m * sizeof(int32_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); int j = 0; for (int i = 0; i < batch; i++) { @@ -126,19 +124,19 @@ int csi_nn_rvv_conv_im2col_gemm_int8(struct csi_tensor *input, struct csi_tensor } // pack - csi_nn_rvv_reorder_input_z8_int8(im2col_data, pb, k, n, n); + shl_rvv_reorder_input_z8_int8(im2col_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x8_int8(pc, pa, pb, m, k4, n, n, bias_data + g * m, - output->qinfo->zero_point, multiplier, shift); + shl_rvv_gemm_8x8_int8(pc, pa, pb, bias_data + g * m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); - csi_mem_free(multiplier); - csi_mem_free(shift); + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); + shl_mem_free(multiplier); + shl_mem_free(shift); return CSINN_TRUE; } -#endif \ No newline at end of file +#endif diff --git a/source/thead_rvv/convolution_gemm_int8_pack1ton.c b/source/thead_rvv/convolution_gemm_int8_pack1ton.c new file mode 100644 index 00000000..4b5b8aa4 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_int8_pack1ton.c @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn can != 0 + * layout: [out_c/packna, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_pack1ton_per_group_int8(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int in_c4 = ((in_c - 1) & -4) + 4; + for (int oc = 0; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c4 * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * packn; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + int tail_c4 = in_c & 3; + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * (in_c4 - ic); + + int p = 0; + for (; p + 3 < tail_c; p += 4) { + int8_t *g2 = g1 + p * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = 
vlse8_v_i8mf2(k0 + (ic + p + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + if (p < tail_c) { + int8_t *g2 = g1 + p * packn; + for (int i = 0; i < tail_c4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } +} + +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + int in_c4 = ((in_c - 1) & -4) + 4; // align 4 for input_channel + + params->conv_extra.kernel_tm->data = + (int8_t *)shl_mem_alloc(out_c * in_c4 * maxk * sizeof(int8_t)); + int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; + + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = pa_reorder + g * out_cp * in_c4 * maxk; + im2col_gemm_reorder_kernel_pack1ton_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } +} + +int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = 
kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); + shl_rvv_pad_input_pack1ton_int8(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left, + input->qinfo->zero_point); + + // im2col + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + int in_cp4 = ((in_cp - 1) & -4) + 4; + + // [in_cp4/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_cp4%packn] + int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const int8_t *img0 = input_pad_buf; + int8_t *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e8mf2(loop_c); + int vl4 = ((vl - 1) & -4) + 4; + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * vl; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += vl4; // XXX: dst align 4 + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * 
vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + // reorder(pack) + int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t)); + shl_rvv_reorder_input_z12_pack1ton_int8(im2col_buf, reorder_buf, in_cp4, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp4; + int32_t *bias_ptr = bias_data + g * m; + shl_rvv_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp4 * maxk, n, n, output->qinfo->zero_point, + multiplier, shift); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_gemm_int8_packn.c b/source/thead_rvv/convolution_gemm_int8_packn.c new file mode 100644 index 00000000..c3cdd72e --- /dev/null +++ b/source/thead_rvv/convolution_gemm_int8_packn.c @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constraint: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4] + * The dot-product version is assumed by default; without dot support the data layout is different + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_int8(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + // [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4b] + for (int oc = 0; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * packn; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } +} + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + params->conv_extra.kernel_tm->data = + (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); + + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk;
+ int8_t *ker_tm_ptr = (int8_t *)params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packn_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + + // FIXME: free params->conv_extra.kernel_tm->data + // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + // shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); + shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left, + input->qinfo->zero_point); + + // im2col +
const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(int8_t)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const int8_t *img0 = input_pad_buf + c * padded_in_hw; + int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * packn; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + // reorder(pack) + int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); + shl_rvv_reorder_input_z12_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + shl_rvv_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n, output->qinfo->zero_point, + multiplier, shift); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); 
+ return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_gemm_int8_packnto1.c b/source/thead_rvv/convolution_gemm_int8_packnto1.c new file mode 100644 index 00000000..feb969d4 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_int8_packnto1.c @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constraint: out_c % packn != 0 and in_ch % packn = 0 + * layout: [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4] + * [out_c/tail, in_c/packnb, maxk, packnb/4, tail, 4] + * The dot-product version is assumed by default; without dot support the data layout is different + ************************************************************/ +static void im2col_gemm_reorder_kernel_packnto1_per_group_int8(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + + // [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4b] + int oc = 0; + for (; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 =
g0 + (ic * maxk) * packn + k * packn * packn; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } + // [out_c/tail, in_c/packnb, maxk, packnb/4, tail, 4] + if (oc < out_c) { + vl = vsetvl_e8mf2(out_c - oc); + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * vl + k * packn * vl; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * vl; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } +} + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + params->conv_extra.kernel_tm->data = + (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); + + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = (int8_t *)params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packnto1_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + + // FIXME: free params->conv_extra.kernel_tm->data + // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + // shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct
csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + // paddding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); + shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left, + input->qinfo->zero_point); + + // im2col + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(int8_t)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * 
stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const int8_t *img0 = input_pad_buf + c * padded_in_hw; + int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * packn; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + // reorder(pack) + int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); + shl_rvv_reorder_input_z12_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + shl_rvv_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n, output->qinfo->zero_point, + multiplier, shift); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/data_convert.c b/source/thead_rvv/data_convert.c new file mode 100644 index 00000000..2103302b --- /dev/null +++ b/source/thead_rvv/data_convert.c @@ -0,0 +1,83 @@ 
+/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +int shl_rvv_data_convert_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + struct csinn_callback *cb = params->base.cb; + // TODO: corrected output quantization parameters ??? + if (input->dtype == CSINN_DTYPE_INT8 && output->dtype == CSINN_DTYPE_INT4) { + cb->exec = shl_rvv_data_convert_int8_to_int4; + } else if (input->dtype == CSINN_DTYPE_INT4 && output->dtype == CSINN_DTYPE_INT8) { + cb->exec = shl_rvv_data_convert_int4_to_int8; + } + return CSINN_TRUE; +} + +int shl_rvv_data_convert_int8_to_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int size = csinn_tensor_size(input); + int size2 = size / 2 * 2; + while (size2 > 0) { + int vl = vsetvl_e8m2(size2); + vint8m2_t _input = vle8_v_i8m2(input_data, vl); + vint8m2_t _tmp = vssra_vx_i8m2(_input, 4, vl); + vint8m1_t _res = vpnclip_wx_i8m1(vreinterpret_v_i8m2_i16m2(_tmp), 0, vl / 2); + vse8_v_i8m1(output_data, _res, vl / 2); + input_data += vl; + output_data += vl / 2; + size2 -= vl; + } + if (size & 1) { + *output_data = (*input_data + 8) >> 4; // round 
arithmetic shift right + } + return CSINN_TRUE; +} + +int shl_rvv_data_convert_int4_to_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int size = csinn_tensor_size(input); + int size_2 = size / 2; + while (size_2 > 0) { + int vl = vsetvl_e8m1(size_2); + vint8m1_t _input = vle8_v_i8m1(input_data, vl); + vint16m2_t _tmp = vpwadd_vx_i16m2(_input, 0, vl); + vint8m2_t _res = vsll_vx_i8m2(vreinterpret_v_i16m2_i8m2(_tmp), 4, vl * 2); + vse8_v_i8m2(output_data, _res, vl * 2); + input_data += vl; // advance past the vl bytes just loaded + output_data += vl * 2; + size_2 -= vl; + } + if (size & 1) { + *output_data = (*input_data) << 4; + } + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/depthwise_convolution.c b/source/thead_rvv/depthwise_convolution.c new file mode 100644 index 00000000..e8e9987a --- /dev/null +++ b/source/thead_rvv/depthwise_convolution.c @@ -0,0 +1,208 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +int shl_rvv_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t out_c = output->dim[1]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(float); + + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + shl_rvv_dwconv_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_dwconv3x3s1_packn_fp32; + + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + shl_rvv_dwconv_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_dwconv3x3s2_packn_fp32; + } else { + cb->exec = shl_ref_depthwise_conv2d_f32; + } + } + + if (in_c % packn != 0 && out_c % packn != 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_fp32; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_fp32; + } else { + cb->exec = shl_ref_depthwise_conv2d_f32; + } + } + return CSINN_TRUE; +} + +int shl_rvv_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t out_c = output->dim[1]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = 
csrr_vlenb() / sizeof(__fp16); + + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_dwconv3x3s1_packn_fp16; + + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_dwconv3x3s2_packn_fp16; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + + if (in_c % packn != 0 && out_c % packn != 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_fp16; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_fp16; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + return CSINN_TRUE; +} + +int shl_rvv_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t out_c = output->dim[1]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // enable fuse zeropoint to bias + if (!params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + bias->data = bias_data; + } + int kernel_inner = 1 * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner 
+ j] * input_zp; + } + bias_data[oc] -= tmp; + } + } + + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_dwconv3x3s1_packn_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_dwconv3x3s2_packn_int8; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + + if (in_c % packn != 0 && out_c % packn != 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_int8; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + return CSINN_TRUE; +} + +int shl_rvv_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_ch = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + // xxx: only int4 support nhwc layout now + if (input->layout == CSINN_LAYOUT_NHWC) { + out_ch = output->dim[3]; + in_ch = input->dim[3]; + in_h = input->dim[1]; + in_w = 
input->dim[2]; + kernel_h = kernel->dim[1]; + kernel_w = kernel->dim[2]; + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_int4; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_int4; + } + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + return CSINN_TRUE; + } + return CSINN_FALSE; +} diff --git a/source/thead_rvv/depthwise_convolution_3x3_fp16.c b/source/thead_rvv/depthwise_convolution_3x3_fp16.c index 6af0b363..8c9531d8 100644 --- a/source/thead_rvv/depthwise_convolution_3x3_fp16.c +++ b/source/thead_rvv/depthwise_convolution_3x3_fp16.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -42,10 +42,10 @@ int csi_nn_rvv_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[3]; __fp16 *input_padd_buf = - (__fp16 *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + 
params->pad_left + params->pad_right) * sizeof(float)); - csi_nn_rvv_pad_input_fp16( + shl_rvv_pad_input_fp16( input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); @@ -338,13 +338,13 @@ int csi_nn_rvv_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *out } } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } -int csi_nn_rvv_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -361,10 +361,10 @@ int csi_nn_rvv_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[3]; __fp16 *input_padd_buf = - (__fp16 *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_nn_rvv_pad_input_fp16( + shl_rvv_pad_input_fp16( input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); @@ -508,6 +508,6 @@ int csi_nn_rvv_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *out } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/thead_rvv/depthwise_convolution_3x3_fp16_packn.c b/source/thead_rvv/depthwise_convolution_3x3_fp16_packn.c new file mode 100644 index 00000000..f23b6265 --- /dev/null +++ b/source/thead_rvv/depthwise_convolution_3x3_fp16_packn.c @@ -0,0 +1,798 @@ +/* + * 
Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + note: VLEN = 128/256 ... flexible vlen. 3x3 stride-1 depthwise convolution on fp16 data: the input is first copied into a padded scratch buffer, then channels are processed packn at a time (one fp16 vector per pixel), two output rows per pass with a one-row remainder loop. bias->data may be NULL (zero bias is used). Returns CSINN_TRUE. NOTE(review): batch is read but no batch loop is visible - presumably callers pass batch == 1, confirm. +*************************************************************/ +int shl_rvv_dwconv3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + /* packn = csrr_vlenb() / sizeof(__fp16), presumably the number of fp16 lanes in one m1 vector register; vl is the vector length granted for that many e16 elements */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + /* scratch copy of the input with the padding border added; it stores __fp16 elements, so it is sized with sizeof(__fp16) (sizeof(float) allocated twice the needed bytes); released with shl_mem_free() before returning */ + __fp16 *input_padd_buf = + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(__fp16)); + + shl_rvv_pad_input_packn_fp16( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right,
params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + __fp16 *out1 = out0 + out_w * packn; + + const __fp16 *r0 = input_padd_buf + c * in_h * in_w; + const __fp16 *r1 = r0 + in_w * packn; + const __fp16 *r2 = r1 + in_w * packn; + const __fp16 *r3 = r2 + in_w * packn; + + const __fp16 *kernel0 = kernel_data + c * 9; + + vfloat16m1_t _k00 = vle16_v_f16m1(kernel0, vl); + vfloat16m1_t _k01 = vle16_v_f16m1(kernel0 + 1 * packn, vl); + vfloat16m1_t _k02 = vle16_v_f16m1(kernel0 + 2 * packn, vl); + vfloat16m1_t _k10 = vle16_v_f16m1(kernel0 + 3 * packn, vl); + vfloat16m1_t _k11 = vle16_v_f16m1(kernel0 + 4 * packn, vl); + vfloat16m1_t _k12 = vle16_v_f16m1(kernel0 + 5 * packn, vl); + vfloat16m1_t _k20 = vle16_v_f16m1(kernel0 + 6 * packn, vl); + vfloat16m1_t _k21 = vle16_v_f16m1(kernel0 + 7 * packn, vl); + vfloat16m1_t _k22 = vle16_v_f16m1(kernel0 + 8 * packn, vl); + + vfloat16m1_t _bias0; + _bias0 = bias_data ? 
vle16_v_f16m1(bias_data + c, vl) : vfmv_v_f_f16m1(0.0f, vl); + + int h = 0; + // h2 loop + for (; h + 1 < out_h; h += 2) { + int w = 0; + // h2w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + vfloat16m1_t _acc02 = _bias0; + vfloat16m1_t _acc03 = _bias0; + vfloat16m1_t _acc10 = _bias0; + vfloat16m1_t _acc11 = _bias0; + vfloat16m1_t _acc12 = _bias0; + vfloat16m1_t _acc13 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + 4 * packn, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r03, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k00, _r02, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k01, _r03, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k02, _r04, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k00, _r03, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k01, _r04, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k02, _r05, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + vfloat16m1_t _r14 = vle16_v_f16m1(r1 + 4 * packn, vl); + vfloat16m1_t _r15 = vle16_v_f16m1(r1 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r12, vl); + 
_acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r13, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k10, _r12, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k11, _r13, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k12, _r14, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k10, _r13, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k11, _r14, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k12, _r15, vl); // + _acc10 = vfmacc_vv_f16m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k02, _r12, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k00, _r11, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k01, _r12, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k02, _r13, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k00, _r12, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k01, _r13, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k02, _r14, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k00, _r13, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k01, _r14, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k02, _r15, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + vfloat16m1_t _r24 = vle16_v_f16m1(r2 + 4 * packn, vl); + vfloat16m1_t _r25 = vle16_v_f16m1(r2 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r23, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k20, _r22, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k21, _r23, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k22, _r24, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k20, _r23, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k21, _r24, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k22, _r25, vl); // + _acc10 = vfmacc_vv_f16m1(_acc10, _k10, _r20, 
vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k12, _r22, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k10, _r21, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k11, _r22, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k12, _r23, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k10, _r22, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k11, _r23, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k12, _r24, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k10, _r23, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k11, _r24, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k12, _r25, vl); + + vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = vle16_v_f16m1(r3 + 1 * packn, vl); + vfloat16m1_t _r32 = vle16_v_f16m1(r3 + 2 * packn, vl); + vfloat16m1_t _r33 = vle16_v_f16m1(r3 + 3 * packn, vl); + vfloat16m1_t _r34 = vle16_v_f16m1(r3 + 4 * packn, vl); + vfloat16m1_t _r35 = vle16_v_f16m1(r3 + 5 * packn, vl); + + _acc10 = vfmacc_vv_f16m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k22, _r32, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k20, _r31, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k21, _r32, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k22, _r33, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k20, _r32, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k21, _r33, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k22, _r34, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k20, _r33, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k21, _r34, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k22, _r35, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + vse16_v_f16m1(out0 + 2 * packn, _acc02, vl); + vse16_v_f16m1(out0 + 3 * packn, _acc03, vl); + vse16_v_f16m1(out1, _acc10, vl); + vse16_v_f16m1(out1 + 1 * packn, _acc11, vl); + vse16_v_f16m1(out1 + 2 * packn, _acc12, vl); + vse16_v_f16m1(out1 + 3 * packn, _acc13, vl); + + out0 += packn * 4; + out1 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + r3 += 
packn * 4; + } + // h2w2 + for (; w + 1 < out_w; w += 2) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + vfloat16m1_t _acc10 = _bias0; + vfloat16m1_t _acc11 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r03, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r13, vl); // 0 + _acc10 = vfmacc_vv_f16m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k02, _r12, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k00, _r11, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k01, _r12, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k02, _r13, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r21, vl); + 
_acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r23, vl); // + _acc10 = vfmacc_vv_f16m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k12, _r22, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k10, _r21, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k11, _r22, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k12, _r23, vl); + + vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = vle16_v_f16m1(r3 + 1 * packn, vl); + vfloat16m1_t _r32 = vle16_v_f16m1(r3 + 2 * packn, vl); + vfloat16m1_t _r33 = vle16_v_f16m1(r3 + 3 * packn, vl); + + _acc10 = vfmacc_vv_f16m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k22, _r32, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k20, _r31, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k21, _r32, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k22, _r33, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + vse16_v_f16m1(out1, _acc10, vl); + vse16_v_f16m1(out1 + 1 * packn, _acc11, vl); + + out0 += packn * 2; + out1 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + r3 += packn * 2; + } + // h2w1 + for (; w < out_w; w++) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc10 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); // 0 + _acc10 = 
vfmacc_vv_f16m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k02, _r12, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); // + _acc10 = vfmacc_vv_f16m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k12, _r22, vl); + + vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = vle16_v_f16m1(r3 + 1 * packn, vl); + vfloat16m1_t _r32 = vle16_v_f16m1(r3 + 2 * packn, vl); + + _acc10 = vfmacc_vv_f16m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k22, _r32, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out1, _acc10, vl); + + out0 += packn * 1; + out1 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + r3 += packn * 1; + } + r0 += (2 + in_w) * packn; + r1 += (2 + in_w) * packn; + r2 += (2 + in_w) * packn; + r3 += (2 + in_w) * packn; + + out0 += out_w * packn; + out1 += out_w * packn; + } + + // h1 + for (; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + vfloat16m1_t _acc02 = _bias0; + vfloat16m1_t _acc03 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + 4 * packn, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = 
vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r03, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k00, _r02, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k01, _r03, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k02, _r04, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k00, _r03, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k01, _r04, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k02, _r05, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + vfloat16m1_t _r14 = vle16_v_f16m1(r1 + 4 * packn, vl); + vfloat16m1_t _r15 = vle16_v_f16m1(r1 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r13, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k10, _r12, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k11, _r13, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k12, _r14, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k10, _r13, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k11, _r14, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k12, _r15, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + vfloat16m1_t _r24 = vle16_v_f16m1(r2 + 4 * packn, vl); + vfloat16m1_t _r25 = vle16_v_f16m1(r2 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, 
_r21, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r23, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k20, _r22, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k21, _r23, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k22, _r24, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k20, _r23, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k21, _r24, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k22, _r25, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + vse16_v_f16m1(out0 + 2 * packn, _acc02, vl); + vse16_v_f16m1(out0 + 3 * packn, _acc03, vl); + + out0 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } + // h1w2 + for (; w + 1 < out_w; w += 2) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r03, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r13, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = 
vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r23, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + + out0 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + // h1w1 + for (; w < out_w; w++) { + vfloat16m1_t _acc00 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + + vse16_v_f16m1(out0, _acc00, vl); + + out0 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + } + } + } + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} + +int shl_rvv_dwconv3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 
*input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + /* packn = csrr_vlenb() / sizeof(__fp16), presumably the number of fp16 lanes in one m1 vector register; vl is the vector length granted for that many e16 elements */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + /* scratch copy of the input with the padding border added; it stores __fp16 elements, so it is sized with sizeof(__fp16) (sizeof(float) allocated twice the needed bytes); released with shl_mem_free() before returning */ + __fp16 *input_padd_buf = + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(__fp16)); + + shl_rvv_pad_input_packn_fp16( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + /* the column loops advance each row pointer by 2 * out_w pixels per output row (stride 2); tailstep skips the rest of that padded row plus one further full row so r0/r1/r2 land two input rows lower */ + int tailstep = (in_w - 2 * out_w + in_w) * packn; + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + + const __fp16 *r0 = input_padd_buf + c * in_h * in_w; + const __fp16 *r1 = r0 + in_w * packn; + const __fp16 *r2 = r1 + in_w * packn; + /* nine taps for this channel group, each read as packn consecutive values (tap-major layout, presumably produced by shl_rvv_dwconv_reorder_kernel_packn_fp16) */ + const __fp16 *kernel0 = kernel_data + c * 9; + + vfloat16m1_t _k00 = vle16_v_f16m1(kernel0, vl); + vfloat16m1_t _k01 = vle16_v_f16m1(kernel0 + 1 * packn, vl); + vfloat16m1_t _k02 = vle16_v_f16m1(kernel0 + 2 * packn, vl); + vfloat16m1_t _k10 = vle16_v_f16m1(kernel0 + 3 * packn, vl); + vfloat16m1_t _k11 = vle16_v_f16m1(kernel0 + 4 * packn, vl); + vfloat16m1_t _k12 = vle16_v_f16m1(kernel0 + 5 * packn, vl); + vfloat16m1_t _k20 = vle16_v_f16m1(kernel0 + 6 * packn, vl); + vfloat16m1_t _k21 = vle16_v_f16m1(kernel0 + 7 * packn, vl); + vfloat16m1_t _k22 = vle16_v_f16m1(kernel0 + 8 * packn, vl); + + vfloat16m1_t _bias0; + _bias0 = bias_data ?
vle16_v_f16m1(bias_data + c, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int h = 0; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + vfloat16m1_t _acc02 = _bias0; + vfloat16m1_t _acc03 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + 4 * packn, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + 5 * packn, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(r0 + 6 * packn, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(r0 + 7 * packn, vl); + vfloat16m1_t _r08 = vle16_v_f16m1(r0 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r03, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r04, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k00, _r04, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k01, _r05, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k02, _r06, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k00, _r06, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k01, _r07, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k02, _r08, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + vfloat16m1_t _r14 = vle16_v_f16m1(r1 + 4 * packn, vl); + vfloat16m1_t _r15 = vle16_v_f16m1(r1 + 5 * packn, vl); + vfloat16m1_t _r16 = vle16_v_f16m1(r1 + 6 * packn, vl); + vfloat16m1_t _r17 = vle16_v_f16m1(r1 + 7 * packn, vl); + vfloat16m1_t _r18 = vle16_v_f16m1(r1 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = 
vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r13, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r14, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k10, _r14, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k11, _r15, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k12, _r16, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k10, _r16, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k11, _r17, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k12, _r18, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + vfloat16m1_t _r24 = vle16_v_f16m1(r2 + 4 * packn, vl); + vfloat16m1_t _r25 = vle16_v_f16m1(r2 + 5 * packn, vl); + vfloat16m1_t _r26 = vle16_v_f16m1(r2 + 6 * packn, vl); + vfloat16m1_t _r27 = vle16_v_f16m1(r2 + 7 * packn, vl); + vfloat16m1_t _r28 = vle16_v_f16m1(r2 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r23, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r24, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k20, _r24, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k21, _r25, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k22, _r26, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k20, _r26, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k21, _r27, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k22, _r28, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + vse16_v_f16m1(out0 + 2 * packn, _acc02, vl); + vse16_v_f16m1(out0 + 3 * packn, _acc03, vl); + + out0 += packn * 4; + + r0 += packn * 8; + r1 += packn * 8; + r2 += packn * 8; + } + for (; w + 1 < out_w; w += 2) { + 
vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r03, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r04, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + vfloat16m1_t _r14 = vle16_v_f16m1(r1 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r13, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r14, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + vfloat16m1_t _r24 = vle16_v_f16m1(r2 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r23, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r24, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + + out0 += packn * 2; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } 
+ for (; w < out_w; w++) { /* h1w1: leftover output columns, one at a time */ + vfloat16m1_t _acc00 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + + vse16_v_f16m1(out0, _acc00, vl); + out0 += packn * 1; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} +/* Repack depthwise fp16 weights in place for the packn kernels: for every full group of packn channels, element k of each channel is gathered and stored contiguously, turning [channel][k] into [group][k][packn]. The result is built in a scratch buffer and copied back over kernel->data. params is not used. NOTE(review): channels past the last full group are never written to the scratch buffer yet the whole buffer is copied back - presumably callers guarantee out_ch % packn == 0, confirm. */ +void shl_rvv_dwconv_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + const int out_ch = kernel->dim[0]; + const int maxk = kernel->dim[2] * kernel->dim[3]; + __fp16 *kernel_trans = (__fp16 *)shl_mem_alloc(out_ch * maxk * sizeof(__fp16)); + /* packn = csrr_vlenb() / sizeof(__fp16), presumably the fp16 lane count of one m1 vector register */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + /* strided load with a byte stride of maxk * sizeof(__fp16) picks element ic from vl consecutive channels; the unit-stride store lays them side by side */ + for (int oc = 0; oc + packn - 1 < out_ch; oc += packn) { + __fp16 *ksrc = kernel_data + oc * maxk; + __fp16 *kdst = kernel_trans + oc * maxk; + for (int ic = 0; ic < maxk; ic++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(ksrc + ic, maxk * sizeof(__fp16), vl); + vse16_v_f16m1(kdst, _tmp, vl); + kdst += vl; + } + }
+ memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(__fp16)); + shl_mem_free(kernel_trans); +} diff --git a/source/thead_rvv/depthwise_convolution_3x3.c b/source/thead_rvv/depthwise_convolution_3x3_fp32.c similarity index 95% rename from source/thead_rvv/depthwise_convolution_3x3.c rename to source/thead_rvv/depthwise_convolution_3x3_fp32.c index 95d7e760..c9244456 100644 --- a/source/thead_rvv/depthwise_convolution_3x3.c +++ b/source/thead_rvv/depthwise_convolution_3x3_fp32.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_dwconv3x3s1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -42,10 +42,10 @@ int csi_nn_rvv_dwconv3x3s1_fp32(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[3]; float *input_padd_buf = - (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_nn_rvv_pad_input_fp32( + shl_rvv_pad_input_fp32( input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); @@ -341,13 +341,13 @@ int csi_nn_rvv_dwconv3x3s1_fp32(struct csi_tensor *input, struct csi_tensor *out } } - csi_mem_free(input_padd_buf); + 
shl_mem_free(input_padd_buf); return CSINN_TRUE; } -int csi_nn_rvv_dwconv3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -364,10 +364,10 @@ int csi_nn_rvv_dwconv3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[3]; float *input_padd_buf = - (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_nn_rvv_pad_input_fp32( + shl_rvv_pad_input_fp32( input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); @@ -474,6 +474,6 @@ int csi_nn_rvv_dwconv3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *out } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/thead_rvv/depthwise_convolution_3x3_fp32_packn.c b/source/thead_rvv/depthwise_convolution_3x3_fp32_packn.c new file mode 100644 index 00000000..b0c53397 --- /dev/null +++ b/source/thead_rvv/depthwise_convolution_3x3_fp32_packn.c @@ -0,0 +1,802 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: VLEN = 128/256 ... flexible vlen + *************************************************************/ +int shl_rvv_dwconv3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); + + float *output_ncxhwx = (float *)shl_mem_alloc(out_c * out_h * out_w * sizeof(float)); + + shl_rvv_pad_input_packn_fp32( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + +#pragma omp parallel for num_threads(1) + for 
(int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + float *out1 = out0 + out_w * packn; + + const float *r0 = input_padd_buf + c * in_h * in_w; + const float *r1 = r0 + in_w * packn; + const float *r2 = r1 + in_w * packn; + const float *r3 = r2 + in_w * packn; + + const float *kernel0 = kernel_data + c * 9; + + vfloat32m1_t _k00 = vle32_v_f32m1(kernel0, vl); + vfloat32m1_t _k01 = vle32_v_f32m1(kernel0 + 1 * packn, vl); + vfloat32m1_t _k02 = vle32_v_f32m1(kernel0 + 2 * packn, vl); + vfloat32m1_t _k10 = vle32_v_f32m1(kernel0 + 3 * packn, vl); + vfloat32m1_t _k11 = vle32_v_f32m1(kernel0 + 4 * packn, vl); + vfloat32m1_t _k12 = vle32_v_f32m1(kernel0 + 5 * packn, vl); + vfloat32m1_t _k20 = vle32_v_f32m1(kernel0 + 6 * packn, vl); + vfloat32m1_t _k21 = vle32_v_f32m1(kernel0 + 7 * packn, vl); + vfloat32m1_t _k22 = vle32_v_f32m1(kernel0 + 8 * packn, vl); + + vfloat32m1_t _bias0; + _bias0 = bias_data ? vle32_v_f32m1(bias_data + c, vl) : vfmv_v_f_f32m1(0.0f, vl); + + int h = 0; + // h2 loop + for (; h + 1 < out_h; h += 2) { + int w = 0; + // h2w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + vfloat32m1_t _acc02 = _bias0; + vfloat32m1_t _acc03 = _bias0; + vfloat32m1_t _acc10 = _bias0; + vfloat32m1_t _acc11 = _bias0; + vfloat32m1_t _acc12 = _bias0; + vfloat32m1_t _acc13 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + 4 * packn, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r02, vl); + _acc01 = 
vfmacc_vv_f32m1(_acc01, _k02, _r03, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k00, _r02, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k01, _r03, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k02, _r04, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k00, _r03, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k01, _r04, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k02, _r05, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + vfloat32m1_t _r14 = vle32_v_f32m1(r1 + 4 * packn, vl); + vfloat32m1_t _r15 = vle32_v_f32m1(r1 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r13, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k10, _r12, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k11, _r13, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k12, _r14, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k10, _r13, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k11, _r14, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k12, _r15, vl); // + _acc10 = vfmacc_vv_f32m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k02, _r12, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k00, _r11, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k01, _r12, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k02, _r13, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k00, _r12, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k01, _r13, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k02, _r14, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k00, _r13, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k01, _r14, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k02, _r15, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t 
_r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + vfloat32m1_t _r24 = vle32_v_f32m1(r2 + 4 * packn, vl); + vfloat32m1_t _r25 = vle32_v_f32m1(r2 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r23, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k20, _r22, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k21, _r23, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k22, _r24, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k20, _r23, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k21, _r24, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k22, _r25, vl); // + _acc10 = vfmacc_vv_f32m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k12, _r22, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k10, _r21, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k11, _r22, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k12, _r23, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k10, _r22, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k11, _r23, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k12, _r24, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k10, _r23, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k11, _r24, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k12, _r25, vl); + + vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = vle32_v_f32m1(r3 + 1 * packn, vl); + vfloat32m1_t _r32 = vle32_v_f32m1(r3 + 2 * packn, vl); + vfloat32m1_t _r33 = vle32_v_f32m1(r3 + 3 * packn, vl); + vfloat32m1_t _r34 = vle32_v_f32m1(r3 + 4 * packn, vl); + vfloat32m1_t _r35 = vle32_v_f32m1(r3 + 5 * packn, vl); + + _acc10 = vfmacc_vv_f32m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, 
_k22, _r32, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k20, _r31, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k21, _r32, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k22, _r33, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k20, _r32, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k21, _r33, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k22, _r34, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k20, _r33, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k21, _r34, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k22, _r35, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + vse32_v_f32m1(out0 + 2 * packn, _acc02, vl); + vse32_v_f32m1(out0 + 3 * packn, _acc03, vl); + vse32_v_f32m1(out1, _acc10, vl); + vse32_v_f32m1(out1 + 1 * packn, _acc11, vl); + vse32_v_f32m1(out1 + 2 * packn, _acc12, vl); + vse32_v_f32m1(out1 + 3 * packn, _acc13, vl); + + out0 += packn * 4; + out1 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + r3 += packn * 4; + } + // h2w2 + for (; w + 1 < out_w; w += 2) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + vfloat32m1_t _acc10 = _bias0; + vfloat32m1_t _acc11 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r03, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + 
_acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r13, vl); // 0 + _acc10 = vfmacc_vv_f32m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k02, _r12, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k00, _r11, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k01, _r12, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k02, _r13, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r23, vl); // + _acc10 = vfmacc_vv_f32m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k12, _r22, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k10, _r21, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k11, _r22, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k12, _r23, vl); + + vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = vle32_v_f32m1(r3 + 1 * packn, vl); + vfloat32m1_t _r32 = vle32_v_f32m1(r3 + 2 * packn, vl); + vfloat32m1_t _r33 = vle32_v_f32m1(r3 + 3 * packn, vl); + + _acc10 = vfmacc_vv_f32m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k22, _r32, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k20, _r31, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k21, _r32, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k22, _r33, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + 
vse32_v_f32m1(out1, _acc10, vl); + vse32_v_f32m1(out1 + 1 * packn, _acc11, vl); + + out0 += packn * 2; + out1 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + r3 += packn * 2; + } + // h2w1 + for (; w < out_w; w++) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc10 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); // 0 + _acc10 = vfmacc_vv_f32m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k02, _r12, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); // + _acc10 = vfmacc_vv_f32m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k12, _r22, vl); + + vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = vle32_v_f32m1(r3 + 1 * packn, vl); + vfloat32m1_t _r32 = vle32_v_f32m1(r3 + 2 * packn, vl); + + _acc10 = vfmacc_vv_f32m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k22, _r32, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out1, _acc10, vl); + + out0 += packn * 1; + 
out1 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + r3 += packn * 1; + } + r0 += (2 + in_w) * packn; + r1 += (2 + in_w) * packn; + r2 += (2 + in_w) * packn; + r3 += (2 + in_w) * packn; + + out0 += out_w * packn; + out1 += out_w * packn; + } + + // h1 + for (; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + vfloat32m1_t _acc02 = _bias0; + vfloat32m1_t _acc03 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + 4 * packn, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r03, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k00, _r02, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k01, _r03, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k02, _r04, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k00, _r03, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k01, _r04, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k02, _r05, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + vfloat32m1_t _r14 = vle32_v_f32m1(r1 + 4 * packn, vl); + vfloat32m1_t _r15 = vle32_v_f32m1(r1 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r11, vl); + 
_acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r13, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k10, _r12, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k11, _r13, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k12, _r14, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k10, _r13, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k11, _r14, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k12, _r15, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + vfloat32m1_t _r24 = vle32_v_f32m1(r2 + 4 * packn, vl); + vfloat32m1_t _r25 = vle32_v_f32m1(r2 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r23, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k20, _r22, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k21, _r23, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k22, _r24, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k20, _r23, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k21, _r24, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k22, _r25, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + vse32_v_f32m1(out0 + 2 * packn, _acc02, vl); + vse32_v_f32m1(out0 + 3 * packn, _acc03, vl); + + out0 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } + // h1w2 + for (; w + 1 < out_w; w += 2) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, 
_k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r03, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r13, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r23, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + + out0 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + // h1w1 + for (; w < out_w; w++) { + vfloat32m1_t _acc00 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, 
vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + + vse32_v_f32m1(out0, _acc00, vl); + + out0 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + } + } + } + /* Release both scratch buffers; output_ncxhwx is allocated at function entry and must not outlive the call. */ + shl_mem_free(input_padd_buf); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} + +/* + * 3x3 depthwise convolution with stride 2 on fp32 data, handling packn channels per outer-loop iteration (packn = vlenb / sizeof(float)). + * The input is first copied into a padded scratch buffer by shl_rvv_pad_input_packn_fp32; that buffer is freed before returning. + * Only full groups of packn channels are visited (loop condition c + packn - 1 < in_c); a NULL bias pointer is treated as an all-zero bias. + * Always returns CSINN_TRUE. + * NOTE(review): `batch` is read from input->dim[0] but not used in the visible code - presumably only a single batch is processed; confirm against callers. + */ +int shl_rvv_dwconv3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); + + shl_rvv_pad_input_packn_fp32( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w +
params->pad_left + params->pad_right; + + int tailstep = (in_w - 2 * out_w + in_w) * packn; + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + + const float *r0 = input_padd_buf + c * in_h * in_w; + const float *r1 = r0 + in_w * packn; + const float *r2 = r1 + in_w * packn; + + const float *kernel0 = kernel_data + c * 9; + + vfloat32m1_t _k00 = vle32_v_f32m1(kernel0, vl); + vfloat32m1_t _k01 = vle32_v_f32m1(kernel0 + 1 * packn, vl); + vfloat32m1_t _k02 = vle32_v_f32m1(kernel0 + 2 * packn, vl); + vfloat32m1_t _k10 = vle32_v_f32m1(kernel0 + 3 * packn, vl); + vfloat32m1_t _k11 = vle32_v_f32m1(kernel0 + 4 * packn, vl); + vfloat32m1_t _k12 = vle32_v_f32m1(kernel0 + 5 * packn, vl); + vfloat32m1_t _k20 = vle32_v_f32m1(kernel0 + 6 * packn, vl); + vfloat32m1_t _k21 = vle32_v_f32m1(kernel0 + 7 * packn, vl); + vfloat32m1_t _k22 = vle32_v_f32m1(kernel0 + 8 * packn, vl); + + vfloat32m1_t _bias0; + _bias0 = bias_data ? 
vle32_v_f32m1(bias_data + c, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int h = 0; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + vfloat32m1_t _acc02 = _bias0; + vfloat32m1_t _acc03 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + 4 * packn, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + 5 * packn, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + 6 * packn, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + 7 * packn, vl); + vfloat32m1_t _r08 = vle32_v_f32m1(r0 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r03, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r04, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k00, _r04, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k01, _r05, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k02, _r06, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k00, _r06, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k01, _r07, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k02, _r08, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + vfloat32m1_t _r14 = vle32_v_f32m1(r1 + 4 * packn, vl); + vfloat32m1_t _r15 = vle32_v_f32m1(r1 + 5 * packn, vl); + vfloat32m1_t _r16 = vle32_v_f32m1(r1 + 6 * packn, vl); + vfloat32m1_t _r17 = vle32_v_f32m1(r1 + 7 * packn, vl); + vfloat32m1_t _r18 = vle32_v_f32m1(r1 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = 
vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r13, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r14, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k10, _r14, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k11, _r15, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k12, _r16, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k10, _r16, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k11, _r17, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k12, _r18, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + vfloat32m1_t _r24 = vle32_v_f32m1(r2 + 4 * packn, vl); + vfloat32m1_t _r25 = vle32_v_f32m1(r2 + 5 * packn, vl); + vfloat32m1_t _r26 = vle32_v_f32m1(r2 + 6 * packn, vl); + vfloat32m1_t _r27 = vle32_v_f32m1(r2 + 7 * packn, vl); + vfloat32m1_t _r28 = vle32_v_f32m1(r2 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r23, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r24, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k20, _r24, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k21, _r25, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k22, _r26, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k20, _r26, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k21, _r27, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k22, _r28, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + vse32_v_f32m1(out0 + 2 * packn, _acc02, vl); + vse32_v_f32m1(out0 + 3 * packn, _acc03, vl); + + out0 += packn * 4; + + r0 += packn * 8; + r1 += packn * 8; + r2 += packn * 8; + } + for (; w + 1 < out_w; w += 2) { + 
vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r03, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r04, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + vfloat32m1_t _r14 = vle32_v_f32m1(r1 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r13, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r14, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + vfloat32m1_t _r24 = vle32_v_f32m1(r2 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r23, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r24, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + + out0 += packn * 2; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } 
+ for (; w < out_w; w++) { + vfloat32m1_t _acc00 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + + vse32_v_f32m1(out0, _acc00, vl); + out0 += packn * 1; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} + +void shl_rvv_dwconv_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ /* Repacks the fp32 kernel data in place through a scratch buffer: source is read as out_ch runs of maxk floats, destination stores, per group of vl channels, each of the maxk taps as vl contiguous channel values. params is not read. */ + float *kernel_data = (float *)kernel->data; + const int out_ch = kernel->dim[0]; + const int maxk = kernel->dim[2] * kernel->dim[3]; + float *kernel_trans = (float *)shl_mem_alloc(out_ch * maxk * sizeof(float)); /* NOTE(review): result is used without a NULL check */ + + const int packn = csrr_vlenb() / sizeof(float); /* presumably fp32 elements per vector register (vlenb = register size in bytes) - confirm csrr_vlenb() */ + const int vl = vsetvl_e32m1(packn); + + for (int oc = 0; oc + packn - 1 < out_ch; oc += packn) { /* visits full groups of packn channels only */ + float *ksrc = kernel_data + oc * maxk; + float *kdst = kernel_trans + oc * maxk; + for (int ic = 0; ic < maxk; ic++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(ksrc + ic, maxk * sizeof(float), vl); /* strided load: tap ic of vl consecutive channels, byte stride = one channel (maxk floats) */ + vse32_v_f32m1(kdst, _tmp, vl); /* unit-stride store of those vl values */ + kdst += vl; + } + } +
memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(float)); /* NOTE(review): copies all out_ch * maxk floats back; if out_ch % packn != 0 the tail channels were never written into kernel_trans by the loop - confirm callers guarantee divisibility */ + shl_mem_free(kernel_trans); +} diff --git a/source/thead_rvv/depthwise_convolution_3x3_int4.c b/source/thead_rvv/depthwise_convolution_3x3_int4.c index fda312fb..c2083499 100644 --- a/source/thead_rvv/depthwise_convolution_3x3_int4.c +++ b/source/thead_rvv/depthwise_convolution_3x3_int4.c @@ -16,9 +16,8 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) @@ -31,9 +30,9 @@ static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shif return _tmp2; } -int csi_nn_rvv_dwconv3x3s1_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s1_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -49,20 +48,20 @@ int csi_nn_rvv_dwconv3x3s1_int4(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[2]; int32_t out_c = output->dim[3]; - int8_t *input_padd_buf = (int8_t *)csi_mem_alloc((in_h + params->pad_top + params->pad_down) * + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * in_c * sizeof(int8_t)); int8_t pad_value = input->qinfo->zero_point; - csi_nn_rvv_pad_input_int4_trans_int8( - input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, - in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left, - input->qinfo->zero_point); + shl_rvv_pad_input_int4_trans_int8(input_data, input_padd_buf, in_c, in_h, in_w, +
in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); - int8_t *kernel_tran_buf = (int8_t *)csi_mem_alloc(9 * in_c * sizeof(int8_t)); - int8_t *output_tran_buf = (int8_t *)csi_mem_alloc(out_h * out_w * out_c * sizeof(int8_t)); + int8_t *kernel_tran_buf = (int8_t *)shl_mem_alloc(9 * in_c * sizeof(int8_t)); + int8_t *output_tran_buf = (int8_t *)shl_mem_alloc(out_h * out_w * out_c * sizeof(int8_t)); - csi_nn_rvv_int4_trans_int8(kernel_data, kernel_tran_buf, 9 * in_c); + shl_rvv_int4_trans_int8(kernel_data, kernel_tran_buf, 9 * in_c); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -247,16 +246,16 @@ int csi_nn_rvv_dwconv3x3s1_int4(struct csi_tensor *input, struct csi_tensor *out } } } - csi_nn_rvv_int8_to_int4(output_tran_buf, output_data, out_h * out_w * in_c); - csi_mem_free(input_padd_buf); - csi_mem_free(kernel_tran_buf); - csi_mem_free(output_tran_buf); + shl_rvv_int8_to_int4(output_tran_buf, output_data, out_h * out_w * in_c); + shl_mem_free(input_padd_buf); + shl_mem_free(kernel_tran_buf); + shl_mem_free(output_tran_buf); return CSINN_TRUE; } -int csi_nn_rvv_dwconv3x3s2_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s2_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -272,19 +271,19 @@ int csi_nn_rvv_dwconv3x3s2_int4(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[2]; int32_t out_c = output->dim[3]; - int8_t *input_padd_buf = (int8_t *)csi_mem_alloc((in_h + params->pad_top + params->pad_down) * + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + 
params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * in_c * sizeof(int8_t)); - csi_nn_rvv_pad_input_int4_trans_int8( - input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, - in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left, - input->qinfo->zero_point); + shl_rvv_pad_input_int4_trans_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); - int8_t *kernel_tran_buf = (int8_t *)csi_mem_alloc(9 * in_c * sizeof(int8_t)); - int8_t *output_tran_buf = (int8_t *)csi_mem_alloc(out_h * out_w * out_c * sizeof(int8_t)); + int8_t *kernel_tran_buf = (int8_t *)shl_mem_alloc(9 * in_c * sizeof(int8_t)); + int8_t *output_tran_buf = (int8_t *)shl_mem_alloc(out_h * out_w * out_c * sizeof(int8_t)); - csi_nn_rvv_int4_trans_int8(kernel_data, kernel_tran_buf, 9 * in_c); + shl_rvv_int4_trans_int8(kernel_data, kernel_tran_buf, 9 * in_c); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -383,9 +382,9 @@ int csi_nn_rvv_dwconv3x3s2_int4(struct csi_tensor *input, struct csi_tensor *out r2 += tailstep; } } - csi_nn_rvv_int8_to_int4(output_tran_buf, output_data, out_h * out_w * in_c); - csi_mem_free(input_padd_buf); - csi_mem_free(kernel_tran_buf); - csi_mem_free(output_tran_buf); + shl_rvv_int8_to_int4(output_tran_buf, output_data, out_h * out_w * in_c); + shl_mem_free(input_padd_buf); + shl_mem_free(kernel_tran_buf); + shl_mem_free(output_tran_buf); return CSINN_TRUE; -} \ No newline at end of file +} diff --git a/source/thead_rvv/depthwise_convolution_3x3_int8.c b/source/thead_rvv/depthwise_convolution_3x3_int8.c index e6084026..6b47f108 100644 --- a/source/thead_rvv/depthwise_convolution_3x3_int8.c +++ b/source/thead_rvv/depthwise_convolution_3x3_int8.c @@ -16,9 +16,8 @@ * limitations under 
the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) @@ -31,9 +30,9 @@ static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shif return _tmp2; } -int csi_nn_rvv_dwconv3x3s1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -49,14 +48,14 @@ int csi_nn_rvv_dwconv3x3s1_int8(struct csi_tensor *input, struct csi_tensor *out int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - int8_t *input_padd_buf = (int8_t *)csi_mem_alloc((in_h + params->pad_top + params->pad_down) * + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * in_c * sizeof(int8_t)); - csi_nn_rvv_pad_input_int8(input_data, input_padd_buf, in_c, in_h, in_w, - in_h + params->pad_top + params->pad_down, - in_w + params->pad_left + params->pad_right, params->pad_top, - params->pad_left, input->qinfo->zero_point); + shl_rvv_pad_input_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -288,13 +287,13 @@ int csi_nn_rvv_dwconv3x3s1_int8(struct csi_tensor *input, struct csi_tensor *out } output_data += out_h * out_w; } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return 
CSINN_TRUE; } -int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -310,14 +309,14 @@ int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *out int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - int8_t *input_padd_buf = (int8_t *)csi_mem_alloc((in_h + params->pad_top + params->pad_down) * + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * in_c * sizeof(int8_t)); - csi_nn_rvv_pad_input_int8(input_data, input_padd_buf, in_c, in_h, in_w, - in_h + params->pad_top + params->pad_down, - in_w + params->pad_left + params->pad_right, params->pad_top, - params->pad_left, input->qinfo->zero_point); + shl_rvv_pad_input_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -420,10 +419,10 @@ int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *out vint8m1_t _res0; if (kernel->quant_channel > 1) { _res0 = requantize_m4(_acc0, kernel->qinfo[c].multiplier, - kernel->qinfo[c].shift, output->qinfo->zero_point, 16); + kernel->qinfo[c].shift, output->qinfo->zero_point, vl); } else if (kernel->quant_channel == 1) { _res0 = requantize_m4(_acc0, kernel->qinfo[0].multiplier, - kernel->qinfo[0].shift, output->qinfo->zero_point, 16); + kernel->qinfo[0].shift, output->qinfo->zero_point, vl); } 
vse8_v_i8m1(outptr0, _res0, vl); outptr0 += vl; @@ -435,6 +434,6 @@ int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *out } output_data += out_h * out_w; } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/thead_rvv/depthwise_convolution_3x3_int8_dot_packn.c b/source/thead_rvv/depthwise_convolution_3x3_int8_dot_packn.c new file mode 100644 index 00000000..4e320f78 --- /dev/null +++ b/source/thead_rvv/depthwise_convolution_3x3_int8_dot_packn.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint32m2_t _shift, + int32_t out_zp, int vl) +{ /* NOTE(review): empty body in a non-void function; none of the definitions in this new file call it - confirm it is an intentional placeholder */ +} + +int shl_rvv_dwconv3x3s1_packn_int8_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ /* placeholder: does no work on its arguments and always returns CSINN_FALSE */ + return CSINN_FALSE; +} + +int shl_rvv_dwconv3x3s2_packn_int8_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ /* placeholder: does no work on its arguments and always returns CSINN_FALSE */ + return CSINN_FALSE; +} + +/**************************************************************************** + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constraint: out_c % packn = 0 and in_ch = 1 + * layout: [out_c, 1, ksize_h, ksize_w] ==> [out_c/packn, 1, maxk, packn] + ***************************************************************************/ +void shl_rvv_dwconv_reorder_kernel_packn_int8_dot(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ /* Rewrites kernel->data in place through a scratch buffer of the same byte size; params is not read. */ + int8_t *kernel_data = (int8_t *)kernel->data; + const int out_ch = kernel->dim[0]; + const int maxk = kernel->dim[2] * kernel->dim[3]; + int8_t *kernel_trans = (int8_t *)shl_mem_alloc(out_ch * maxk * sizeof(int8_t)); /* NOTE(review): result is used without a NULL check */ + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + for (int oc = 0; oc + packn - 1 < out_ch; oc += packn) { /* visits full groups of packn channels only */ + int8_t *ksrc = kernel_data + oc * maxk; + int8_t *kdst = kernel_trans + oc * maxk; + for (int ic = 0; ic < maxk; ic++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(ksrc + ic, maxk * sizeof(int8_t), vl); /* strided load: tap ic of vl consecutive channels, byte stride = one channel (maxk bytes) */ + vse8_v_i8mf2(kdst, _tmp, vl); /* unit-stride store of those vl values */ + kdst += vl; + } + } + memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(int8_t)); /* NOTE(review): copies all out_ch * maxk bytes back; if out_ch % packn != 0 the tail channels were never written into kernel_trans by the loop */ + shl_mem_free(kernel_trans); +} +#endif diff --git a/source/thead_rvv/depthwise_convolution_3x3_int8_packn.c b/source/thead_rvv/depthwise_convolution_3x3_int8_packn.c new file
mode 100644 index 00000000..6500a288 --- /dev/null +++ b/source/thead_rvv/depthwise_convolution_3x3_int8_packn.c @@ -0,0 +1,905 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint32m2_t _shift, + int32_t out_zp, int vl) +{ /* Converts vl int32 lanes to int8 using a per-lane multiplier and shift plus a scalar zero point. NOTE(review): the whole body is under RVV_1_0_0; without that macro the function ends without returning a value. */ +#ifdef RVV_1_0_0 + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _multiplier, vl); /* high 32 bits of the signed 64-bit product */ + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); /* per-lane scaling arithmetic right shift; rounding follows the current vxrm setting */ + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); /* add the output zero point */ + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); /* saturating narrow 32 -> 16 bits, shift amount 0 */ + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); /* saturating narrow 16 -> 8 bits, shift amount 0 */ + return _tmp2; +#endif +} + +int shl_rvv_dwconv3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef RVV_1_0_0 + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + +
int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8m1(packn); + + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * + in_c * sizeof(int8_t)); + + shl_rvv_pad_input_packn_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *out0 = output_data + c * out_h * out_w; + int8_t *out1 = out0 + out_w * packn; + + const int8_t *r0 = input_padd_buf + c * in_h * in_w; + const int8_t *r1 = r0 + in_w * packn; + const int8_t *r2 = r1 + in_w * packn; + const int8_t *r3 = r2 + in_w * packn; + + const int8_t *kernel0 = kernel_data + c * 9; + + vint16m1_t _k00 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0, vl), 0, vl); + vint16m1_t _k01 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 1 * packn, vl), 0, vl); + vint16m1_t _k02 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 2 * packn, vl), 0, vl); + vint16m1_t _k10 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 3 * packn, vl), 0, vl); + vint16m1_t _k11 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 4 * packn, vl), 0, vl); + vint16m1_t _k12 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 5 * packn, vl), 0, vl); + vint16m1_t _k20 
= vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 6 * packn, vl), 0, vl); + vint16m1_t _k21 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 7 * packn, vl), 0, vl); + vint16m1_t _k22 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 8 * packn, vl), 0, vl); + + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + vint32m2_t _bias0 = vle32_v_i32m2(bias_data + c, vl); + + vint32m2_t _mult = vle32_v_i32m2(multiplier + c, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + c, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + int32_t out_zp = output->qinfo->zero_point; + + int h = 0; + // h2 loop + for (; h + 1 < out_h; h += 2) { + int w = 0; + // h2w4 loop + for (; w + 3 < out_w; w += 4) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + vint32m2_t _acc02 = _bias0; + vint32m2_t _acc03 = _bias0; + vint32m2_t _acc10 = _bias0; + vint32m2_t _acc11 = _bias0; + vint32m2_t _acc12 = _bias0; + vint32m2_t _acc13 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + vint16m1_t _r04 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 4 * packn, vl), 0, vl); + vint16m1_t _r05 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r01, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r03, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k00, _r02, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k01, _r03, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k02, _r04, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k00, _r03, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k01, _r04, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k02, _r05, vl); 
+ + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + vint16m1_t _r14 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 4 * packn, vl), 0, vl); + vint16m1_t _r15 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r11, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r13, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k10, _r12, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k11, _r13, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k12, _r14, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k10, _r13, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k11, _r14, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k12, _r15, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k00, _r10, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k01, _r11, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k02, _r12, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k00, _r11, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k01, _r12, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k02, _r13, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k00, _r12, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k01, _r13, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k02, _r14, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k00, _r13, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k01, _r14, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k02, _r15, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, 
vl); + vint16m1_t _r24 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 4 * packn, vl), 0, vl); + vint16m1_t _r25 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r21, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r23, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k20, _r22, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k21, _r23, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k22, _r24, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k20, _r23, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k21, _r24, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k22, _r25, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k10, _r20, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k11, _r21, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k12, _r22, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k10, _r21, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k11, _r22, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k12, _r23, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k10, _r22, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k11, _r23, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k12, _r24, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k10, _r23, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k11, _r24, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k12, _r25, vl); + + vint16m1_t _r30 = vwadd_vx_i16m1(vle8_v_i8mf2(r3, vl), 0, vl); + vint16m1_t _r31 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 1 * packn, vl), 0, vl); + vint16m1_t _r32 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 2 * packn, vl), 0, vl); + vint16m1_t _r33 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 3 * packn, vl), 0, vl); + vint16m1_t _r34 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 4 * packn, vl), 0, vl); + vint16m1_t _r35 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 5 * packn, vl), 0, vl); + + _acc10 = vwmacc_vv_i32m2(_acc10, _k20, _r30, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k21, _r31, vl); + _acc10 = 
vwmacc_vv_i32m2(_acc10, _k22, _r32, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k20, _r31, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k21, _r32, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k22, _r33, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k20, _r32, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k21, _r33, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k22, _r34, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k20, _r33, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k21, _r34, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k22, _r35, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + vint8mf2_t _res02 = requantize_m2_s(_acc02, _mult, _shift, out_zp, vl); + vint8mf2_t _res03 = requantize_m2_s(_acc03, _mult, _shift, out_zp, vl); + vint8mf2_t _res10 = requantize_m2_s(_acc10, _mult, _shift, out_zp, vl); + vint8mf2_t _res11 = requantize_m2_s(_acc11, _mult, _shift, out_zp, vl); + vint8mf2_t _res12 = requantize_m2_s(_acc12, _mult, _shift, out_zp, vl); + vint8mf2_t _res13 = requantize_m2_s(_acc13, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + vse8_v_i8mf2(out0 + packn * 2, _res02, vl); + vse8_v_i8mf2(out0 + packn * 3, _res03, vl); + vse8_v_i8mf2(out1, _res10, vl); + vse8_v_i8mf2(out1 + packn * 1, _res11, vl); + vse8_v_i8mf2(out1 + packn * 2, _res12, vl); + vse8_v_i8mf2(out1 + packn * 3, _res13, vl); + + out0 += packn * 4; + out1 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + r3 += packn * 4; + } + for (; w + 1 < out_w; w += 2) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + vint32m2_t _acc10 = _bias0; + vint32m2_t _acc11 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = 
vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r01, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r03, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r11, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r13, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k00, _r10, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k01, _r11, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k02, _r12, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k00, _r11, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k01, _r12, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k02, _r13, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r21, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r23, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k10, _r20, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, 
_k11, _r21, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k12, _r22, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k10, _r21, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k11, _r22, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k12, _r23, vl); + + vint16m1_t _r30 = vwadd_vx_i16m1(vle8_v_i8mf2(r3, vl), 0, vl); + vint16m1_t _r31 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 1 * packn, vl), 0, vl); + vint16m1_t _r32 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 2 * packn, vl), 0, vl); + vint16m1_t _r33 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 3 * packn, vl), 0, vl); + + _acc10 = vwmacc_vv_i32m2(_acc10, _k20, _r30, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k21, _r31, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k22, _r32, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k20, _r31, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k21, _r32, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k22, _r33, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + vint8mf2_t _res10 = requantize_m2_s(_acc10, _mult, _shift, out_zp, vl); + vint8mf2_t _res11 = requantize_m2_s(_acc11, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + vse8_v_i8mf2(out1, _res10, vl); + vse8_v_i8mf2(out1 + packn * 1, _res11, vl); + + out0 += packn * 2; + out1 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + r3 += packn * 2; + } + for (; w < out_w; w++) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc10 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = 
vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k00, _r10, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k01, _r11, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k02, _r12, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k10, _r20, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k11, _r21, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k12, _r22, vl); + + vint16m1_t _r30 = vwadd_vx_i16m1(vle8_v_i8mf2(r3, vl), 0, vl); + vint16m1_t _r31 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 1 * packn, vl), 0, vl); + vint16m1_t _r32 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 2 * packn, vl), 0, vl); + + _acc10 = vwmacc_vv_i32m2(_acc10, _k20, _r30, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k21, _r31, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k22, _r32, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res10 = requantize_m2_s(_acc10, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out1, _res10, vl); + + out0 += packn * 1; + out1 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + r3 += packn * 1; + } + r0 += (2 + in_w) * packn; + r1 += (2 + in_w) * packn; + r2 += (2 + in_w) * packn; + r3 += (2 + in_w) * packn; + + out0 += out_w * packn; + out1 += out_w * packn; + } + for (; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vint32m2_t _acc00 = 
_bias0; + vint32m2_t _acc01 = _bias0; + vint32m2_t _acc02 = _bias0; + vint32m2_t _acc03 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + vint16m1_t _r04 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 4 * packn, vl), 0, vl); + vint16m1_t _r05 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r01, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r03, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k00, _r02, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k01, _r03, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k02, _r04, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k00, _r03, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k01, _r04, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k02, _r05, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + vint16m1_t _r14 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 4 * packn, vl), 0, vl); + vint16m1_t _r15 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r11, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r13, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k10, _r12, vl); + 
_acc02 = vwmacc_vv_i32m2(_acc02, _k11, _r13, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k12, _r14, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k10, _r13, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k11, _r14, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k12, _r15, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + vint16m1_t _r24 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 4 * packn, vl), 0, vl); + vint16m1_t _r25 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r21, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r23, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k20, _r22, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k21, _r23, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k22, _r24, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k20, _r23, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k21, _r24, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k22, _r25, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + vint8mf2_t _res02 = requantize_m2_s(_acc02, _mult, _shift, out_zp, vl); + vint8mf2_t _res03 = requantize_m2_s(_acc03, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + vse8_v_i8mf2(out0 + packn * 2, _res02, vl); + vse8_v_i8mf2(out0 + packn * 3, _res03, vl); + + out0 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } + for (; w + 1 < out_w; w += 2) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + + 
vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r01, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r03, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r11, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r13, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r21, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r23, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + + 
vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + + out0 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + for (; w < out_w; w++) { + vint32m2_t _acc00 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + + out0 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + } + } + } + shl_mem_free(input_padd_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +#elif define RVV_0_7_1 + shl_debug_error("unsupport dwconv3x3s1 packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} + +int shl_rvv_dwconv3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ 
+#ifdef RVV_1_0_0 + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8m1(packn); + + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * + in_c * sizeof(int8_t)); + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(out_c * out_h * out_w * sizeof(int8_t)); + + shl_rvv_pad_input_packn_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + + int tailstep = (in_w - 2 * out_w + in_w) * packn; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *out0 = output_data + c * out_h * out_w; + + int8_t *r0 = input_padd_buf + c * in_h * in_w; + int8_t *r1 = r0 + in_w * packn; + int8_t *r2 = r1 + in_w * packn; + + const int8_t *kernel0 = 
kernel_data + c * 9; + + vint16m1_t _k00 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0, vl), 0, vl); + vint16m1_t _k01 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 1 * packn, vl), 0, vl); + vint16m1_t _k02 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 2 * packn, vl), 0, vl); + vint16m1_t _k10 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 3 * packn, vl), 0, vl); + vint16m1_t _k11 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 4 * packn, vl), 0, vl); + vint16m1_t _k12 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 5 * packn, vl), 0, vl); + vint16m1_t _k20 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 6 * packn, vl), 0, vl); + vint16m1_t _k21 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 7 * packn, vl), 0, vl); + vint16m1_t _k22 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 8 * packn, vl), 0, vl); + + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + vint32m2_t _bias0 = vle32_v_i32m2(bias_data + c, vl); + + vint32m2_t _mult = vle32_v_i32m2(multiplier + c, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + c, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + int32_t out_zp = output->qinfo->zero_point; + + for (int h = 0; h < out_h; h++) { + int w = 0; + for (; w + 3 < out_w; w += 4) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + vint32m2_t _acc02 = _bias0; + vint32m2_t _acc03 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + vint16m1_t _r04 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 4 * packn, vl), 0, vl); + vint16m1_t _r05 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 5 * packn, vl), 0, vl); + vint16m1_t _r06 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 6 * packn, vl), 0, vl); + vint16m1_t _r07 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 7 * packn, vl), 0, vl); + vint16m1_t _r08 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 8 * packn, vl), 0, vl); + + _acc00 = 
vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r03, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r04, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k00, _r04, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k01, _r05, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k02, _r06, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k00, _r06, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k01, _r07, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k02, _r08, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + vint16m1_t _r14 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 4 * packn, vl), 0, vl); + vint16m1_t _r15 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 5 * packn, vl), 0, vl); + vint16m1_t _r16 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 6 * packn, vl), 0, vl); + vint16m1_t _r17 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 7 * packn, vl), 0, vl); + vint16m1_t _r18 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 8 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r13, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r14, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k10, _r14, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k11, _r15, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k12, _r16, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k10, _r16, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k11, _r17, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k12, _r18, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + 
vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + vint16m1_t _r24 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 4 * packn, vl), 0, vl); + vint16m1_t _r25 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 5 * packn, vl), 0, vl); + vint16m1_t _r26 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 6 * packn, vl), 0, vl); + vint16m1_t _r27 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 7 * packn, vl), 0, vl); + vint16m1_t _r28 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 8 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r23, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r24, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k20, _r24, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k21, _r25, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k22, _r26, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k20, _r26, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k21, _r27, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k22, _r28, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + vint8mf2_t _res02 = requantize_m2_s(_acc02, _mult, _shift, out_zp, vl); + vint8mf2_t _res03 = requantize_m2_s(_acc03, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + vse8_v_i8mf2(out0 + packn * 2, _res02, vl); + vse8_v_i8mf2(out0 + packn * 3, _res03, vl); + + out0 += packn * 4; + + r0 += packn * 8; + r1 += packn * 8; + r2 += packn * 8; + } + for (; w + 1 < out_w; w += 2) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = 
vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + vint16m1_t _r04 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 4 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r03, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r04, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + vint16m1_t _r14 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 4 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r13, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r14, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + vint16m1_t _r24 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 4 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r23, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r24, vl); + + vint8mf2_t 
_res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + + out0 += packn * 2; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } + for (; w < out_w; w++) { + vint32m2_t _acc00 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + + out0 += packn * 1; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } + shl_mem_free(input_padd_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +#elif define RVV_0_7_1 + shl_debug_error("unsupport dwconv3x3s2 packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} + 
+/**************************************************************************** + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch = 1 + * layout: [out_c, 1, ksize_h, ksize_w] ==> [out_c/packn, 1, maxk, packn] + ***************************************************************************/ +void shl_rvv_dwconv_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ +#ifdef RVV_1_0_0 + int8_t *kernel_data = (int8_t *)kernel->data; + const int out_ch = kernel->dim[0]; + const int maxk = kernel->dim[2] * kernel->dim[3]; + int8_t *kernel_trans = (int8_t *)shl_mem_alloc(out_ch * maxk * sizeof(int8_t)); + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + for (int oc = 0; oc + packn - 1 < out_ch; oc += packn) { + int8_t *ksrc = kernel_data + oc * maxk; + int8_t *kdst = kernel_trans + oc * maxk; + for (int ic = 0; ic < maxk; ic++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(ksrc + ic, maxk * sizeof(int8_t), vl); + vse8_v_i8mf2(kdst, _tmp, vl); + kdst += vl; + } + } + memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(int8_t)); + shl_mem_free(kernel_trans); +#endif +} diff --git a/source/thead_rvv/fullyconnected.c b/source/thead_rvv/fullyconnected.c index 78728d82..b5b37153 100644 --- a/source/thead_rvv/fullyconnected.c +++ b/source/thead_rvv/fullyconnected.c @@ -16,133 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/************************************************************* - note: VLEN = 128/256 -*************************************************************/ -static void csi_nn_rvv_reorder_weight_npackn_fp32(float *src, float *dst, int m, int k, int ldx) +int shl_rvv_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - int packn = csrr_vlenb() / sizeof(float); // VLEN128=4 VLEN256=8 - int vl = vsetvl_e32m1(packn); - int i = 0; - for (; i + packn - 1 < m; i += packn) { - float *in_ptr = src + i * k; - for (int j = 0; j < k; j++) { - vfloat32m1_t _input = vlse32_v_f32m1(in_ptr, k * sizeof(float), vl); - in_ptr++; - vse32_v_f32m1(dst, _input, vl); - dst += packn; - } - } - src += i * k; - for (; i < m; i++) { - memcpy(dst, src, sizeof(float) * k); - dst += k; - src += k; - } -} - -void csi_nn_rvv_fc_gemv_transform_weight_fp32(struct csi_tensor *weights) -{ - float *weight_data = (float *)weights->data; - - int n = weights->dim[0]; // out_nodes - int k = weights->dim[1]; // in_nodes - - float *pa_reorder = (float *)csi_mem_alloc(n * k * sizeof(float)); - csi_nn_rvv_reorder_weight_npackn_fp32(weight_data, pa_reorder, n, k, k); - memcpy(weight_data, pa_reorder, n * k * sizeof(float)); - csi_mem_free(pa_reorder); -} - -int csi_nn_rvv_fullyconnected_packn_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *weights_data = (float *)weights->data; - float *bias_data = (float *)bias->data; - const int output_dims_count = output->dim_count; const int weights_dims_count = weights->dim_count; - const int bias_dims_count = bias->dim_count; - int batches = 1; - /* 
compute the outer size */ - for (int i = 0; i < output_dims_count - 1; i++) { - batches *= output->dim[i]; - } - int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes - int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes - - bool flag_bias = 1; // default: fc layer include bias - if (bias_data == NULL) { - flag_bias = 0; - bias_data = (float *)csi_mem_alloc(output_depth * 2); - } - int packn = csrr_vlenb() / sizeof(float); // VLEN128=4 VLEN256=8 - int vl; - - for (int b = 0; b < batches; b++) { - float *init_output = output_data + b * output_depth; - float *init_input = input_data + b * accum_depth; - float *init_weight = weights_data; - float *init_bias = bias_data; - - vl = vsetvl_e32m1(packn); - int n = 0; - for (; n + packn - 1 < output_depth; n += packn) { - float *in_ptr = init_input; - vfloat32m1_t _acc = vle32_v_f32m1(init_bias, vl); - init_bias += vl; + const int out_nodes = weights->dim[weights_dims_count - 2]; + const int in_nodes = weights->dim[weights_dims_count - 1]; + struct csinn_callback *cb = params->base.cb; + if (input->dtype == CSINN_DTYPE_FLOAT32) { + shl_rvv_fc_gemv_transform_weight_fp32(weights); + cb->exec = shl_rvv_fullyconnected_packn_fp32; + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_fc_gemv_transform_weight_fp16(weights); + cb->exec = shl_rvv_fullyconnected_packn_fp16; + } else if (input->dtype == CSINN_DTYPE_INT8) { + // enable fuse zeropoint to bias + if (!params->fc_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *weights_data = (int8_t *)weights->data; + int32_t input_zp = input->qinfo->zero_point; - for (int k = 0; k < accum_depth; k++) { - vfloat32m1_t _weight = vle32_v_f32m1(init_weight, vl); - _acc = vfmacc_vf_f32m1(_acc, in_ptr[k], _weight, vl); - init_weight += vl; + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_nodes * sizeof(int32_t)); + bias->data = bias_data; } - vse32_v_f32m1(init_output, _acc, 
vl); - init_output += vl; - } - for (; n < output_depth; n++) { - float *in_ptr = init_input; - float acc = init_bias[0]; - for (int k = 0; k < accum_depth; k++) { - acc += in_ptr[k] * init_weight[k]; + for (int oc = 0; oc < out_nodes; oc++) { + int32_t tmp = 0; + for (int j = 0; j < in_nodes; j++) { + tmp += weights_data[oc * in_nodes + j] * input_zp; + } + bias_data[oc] -= tmp; } - *init_output++ = acc; - init_bias++; - init_weight += accum_depth; } - } - if (!flag_bias) { - csi_mem_free(bias_data); - bias_data = NULL; - } - return CSINN_TRUE; -} -int csi_nn_rvv_fullyconnected_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) -{ - if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_nn_rvv_fc_gemv_transform_weight_fp32(weights); - params->base.bc = csi_nn_rvv_fullyconnected_packn_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_nn_rvv_fc_gemv_transform_weight_fp16(weights); - params->base.bc = csi_nn_rvv_fullyconnected_packn_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - csi_nn_rvv_fc_gemv_transform_weight_int8(weights); + shl_rvv_fc_gemv_transform_weight_int8(weights); // support channel quantization for (int i = 0; i < weights->quant_channel; i++) { float real_scale = input->qinfo->scale * weights->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), + shl_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), &(weights->qinfo[i].shift)); } - params->base.bc = csi_nn_rvv_fullyconnected_packn_int8; + cb->exec = shl_rvv_fullyconnected_packn_int8; } return CSINN_TRUE; } diff --git a/source/thead_rvv/fullyconnected_fp16.c b/source/thead_rvv/fullyconnected_fp16.c index 802e6a5c..af24cedc 100644 --- a/source/thead_rvv/fullyconnected_fp16.c +++ b/source/thead_rvv/fullyconnected_fp16.c @@ -16,51 +16,48 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -static void csi_nn_rvv_reorder_weight_npackn_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) +static void shl_rvv_reorder_weight_npackn_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) { - int packn = csrr_vlenb() / sizeof(__fp16); // VLEN128=8 VLEN256=16 + const int packn = csrr_vlenb() / sizeof(__fp16); // VLEN128=8 VLEN256=16 int vl = vsetvl_e16m1(packn); - int i = 0; - for (; i + packn - 1 < m; i += packn) { - __fp16 *in_ptr = src + i * k; + + while (m > 0) { + vl = vsetvl_e16m1(m); + __fp16 *in_ptr = src; for (int j = 0; j < k; j++) { vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), vl); in_ptr++; vse16_v_f16m1(dst, _input, vl); - dst += packn; + dst += vl; } - } - src += i * k; - for (; i < m; i++) { - memcpy(dst, src, sizeof(__fp16) * k); - dst += k; - src += k; + src += vl * k; + m -= vl; } } -void csi_nn_rvv_fc_gemv_transform_weight_fp16(struct csi_tensor *weights) +void shl_rvv_fc_gemv_transform_weight_fp16(struct csinn_tensor *weights) { __fp16 *weight_data = (__fp16 *)weights->data; int n = weights->dim[0]; // out_nodes int k = weights->dim[1]; // in_nodes - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(n * k * sizeof(__fp16)); - csi_nn_rvv_reorder_weight_npackn_fp16(weight_data, pa_reorder, n, k, k); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(n * k * sizeof(__fp16)); + shl_rvv_reorder_weight_npackn_fp16(weight_data, pa_reorder, n, k, k); memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_nn_rvv_fullyconnected_packn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) +int 
shl_rvv_fullyconnected_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -80,11 +77,11 @@ int csi_nn_rvv_fullyconnected_packn_fp16(struct csi_tensor *input, struct csi_te bool flag_bias = 1; // default: fc layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(output_depth * 2); + bias_data = (__fp16 *)shl_mem_alloc(output_depth * 2); } - int packn = csrr_vlenb() / sizeof(__fp16); // VLEN128=8 VLEN256=16 - int vl; + const int packn = csrr_vlenb() / sizeof(__fp16); // VLEN128=8 VLEN256=16 + int vl = vsetvl_e16m1(packn); for (int b = 0; b < batches; b++) { __fp16 *init_output = output_data + b * output_depth; @@ -92,34 +89,23 @@ int csi_nn_rvv_fullyconnected_packn_fp16(struct csi_tensor *input, struct csi_te __fp16 *init_weight = weights_data; __fp16 *init_bias = bias_data; - vl = vsetvl_e16m1(packn); - int n = 0; - for (; n + packn - 1 < output_depth; n += packn) { - __fp16 *in_ptr = init_input; + int n = output_depth; + while (n > 0) { + vl = vsetvl_e16m1(n); vfloat16m1_t _acc = vle16_v_f16m1(init_bias, vl); init_bias += vl; - for (int k = 0; k < accum_depth; k++) { vfloat16m1_t _weight = vle16_v_f16m1(init_weight, vl); - _acc = vfmacc_vf_f16m1(_acc, in_ptr[k], _weight, vl); + _acc = vfmacc_vf_f16m1(_acc, init_input[k], _weight, vl); init_weight += vl; } vse16_v_f16m1(init_output, _acc, vl); init_output += vl; - } - for (; n < output_depth; n++) { - __fp16 *in_ptr = init_input; - __fp16 acc = init_bias[0]; - for (int k = 0; k < accum_depth; k++) { - acc += in_ptr[k] * init_weight[k]; - } - *init_output++ = acc; - init_bias++; - init_weight += accum_depth; + n -= vl; } } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; diff --git 
a/source/thead_rvv/fullyconnected_fp32.c b/source/thead_rvv/fullyconnected_fp32.c new file mode 100644 index 00000000..fea065d9 --- /dev/null +++ b/source/thead_rvv/fullyconnected_fp32.c @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + note: VLEN = 128/256 +*************************************************************/ +static void shl_rvv_reorder_weight_npackn_fp32(float *src, float *dst, int m, int k, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(float); // VLEN128=4 VLEN256=8 + int vl = vsetvl_e32m1(packn); + + while (m > 0) { + vl = vsetvl_e32m1(m); + float *in_ptr = src; + for (int j = 0; j < k; j++) { + vfloat32m1_t _input = vlse32_v_f32m1(in_ptr, k * sizeof(float), vl); + in_ptr++; + vse32_v_f32m1(dst, _input, vl); + dst += vl; + } + src += vl * k; + m -= vl; + } +} + +void shl_rvv_fc_gemv_transform_weight_fp32(struct csinn_tensor *weights) +{ + float *weight_data = (float *)weights->data; + + int n = weights->dim[0]; // out_nodes + int k = weights->dim[1]; // in_nodes + + float *pa_reorder = (float *)shl_mem_alloc(n * k * sizeof(float)); + shl_rvv_reorder_weight_npackn_fp32(weight_data, pa_reorder, n, k, k); + memcpy(weight_data, pa_reorder, n * k * sizeof(float)); + 
shl_mem_free(pa_reorder); +} + +int shl_rvv_fullyconnected_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *weights_data = (float *)weights->data; + float *bias_data = (float *)bias->data; + const int output_dims_count = output->dim_count; + const int weights_dims_count = weights->dim_count; + const int bias_dims_count = bias->dim_count; + int batches = 1; + /* compute the outer size */ + for (int i = 0; i < output_dims_count - 1; i++) { + batches *= output->dim[i]; + } + int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes + int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes + + bool flag_bias = 1; // default: fc layer include bias + if (bias_data == NULL) { + flag_bias = 0; + bias_data = (float *)shl_mem_alloc(output_depth * 2); + } + const int packn = csrr_vlenb() / sizeof(float); // VLEN128=4 VLEN256=8 + int vl = vsetvl_e32m1(packn); + + for (int b = 0; b < batches; b++) { + float *init_output = output_data + b * output_depth; + float *init_input = input_data + b * accum_depth; + float *init_weight = weights_data; + float *init_bias = bias_data; + + int n = output_depth; + while (n > 0) { + vl = vsetvl_e32m1(n); + vfloat32m1_t _acc = vle32_v_f32m1(init_bias, vl); + init_bias += vl; + for (int k = 0; k < accum_depth; k++) { + vfloat32m1_t _weight = vle32_v_f32m1(init_weight, vl); + _acc = vfmacc_vf_f32m1(_acc, init_input[k], _weight, vl); + init_weight += vl; + } + vse32_v_f32m1(init_output, _acc, vl); + init_output += vl; + n -= vl; + } + } + if (!flag_bias) { + shl_mem_free(bias_data); + bias_data = NULL; + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fullyconnected_int4.c b/source/thead_rvv/fullyconnected_int4.c new file mode 100644 index 00000000..dce8a707 --- /dev/null +++ 
b/source/thead_rvv/fullyconnected_int4.c @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + note: VLEN = 128/256; k is bytes per row; rows are copied as 4-byte words, a k % 4 tail is skipped +*************************************************************/ +static void shl_rvv_reorder_weight_packn_int4(int8_t *src, int8_t *dst, int m, int k, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(int8_t); + int vl = vsetvl_e8m1(packn); + /* each pass takes vl rows and stores their 4-byte words column by column */ + while (m > 0) { + vl = vsetvl_e8m1(m); + int32_t *in_ptr0 = (int32_t *)src; + int32_t *out_ptr0 = (int32_t *)dst; + int j = 0; + for (; j + 7 < k; j += 8) { + vint32m4_t _nf0, _nf1; + vlsseg2e32_v_i32m4(&_nf0, &_nf1, in_ptr0, k * sizeof(int8_t), vl); + in_ptr0 += 2; + vse32_v_i32m4(out_ptr0, _nf0, vl); + out_ptr0 += vl; + vse32_v_i32m4(out_ptr0, _nf1, vl); + out_ptr0 += vl; + } + for (; j + 3 < k; j += 4) { + vint32m4_t _input = vlse32_v_i32m4(in_ptr0, k * sizeof(int8_t), vl); + in_ptr0++; + vse32_v_i32m4(out_ptr0, _input, vl); + out_ptr0 += vl; + } + src += vl * k; + dst += vl * k; + m -= vl; + } +} +/* Repacks int4 weights in place: rows of k_2 = ceil(k / 2) bytes go through the packn reorder. */ +void shl_rvv_fc_gemv_transform_weight_int4_dot(struct csinn_tensor *weights) +{ + int8_t *weight_data = (int8_t *)weights->data; + + int n = weights->dim[0]; // out_nodes + int k = weights->dim[1]; // in_nodes + int k_2 = (((k -
1) & -2) + 2) >> 1; // pair of int4, col of weight_matrix + /* reorder into a scratch buffer, then copy the result back over weights->data */ + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(n * k_2 * sizeof(int8_t)); + shl_rvv_reorder_weight_packn_int4(weight_data, pa_reorder, n, k_2, k_2); + memcpy(weight_data, pa_reorder, n * k_2 * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} +/* Accumulates out_nodes int32 results from bias; in_nodes is a byte count consumed as 32-bit words. */ +static void shl_rvv_fullyconnectd_packn_int4_internel(const int8_t *input, int32_t *output, + int8_t *weight, const int32_t *bias, + int in_nodes, int out_nodes) +{ + const int packn = csrr_vlenb() / sizeof(int8_t); + int vl = vsetvl_e8m1(packn); + /* vpmaqa_vx is presumably the XTHEADV packed-int4 multiply-accumulate - TODO confirm */ + while (out_nodes > 0) { + vl = vsetvl_e8m1(out_nodes); + int32_t *input_ptr = (int32_t *)input; + vint32m4_t _acc0 = vle32_v_i32m4(bias, vl); + bias += vl; + for (int c = 0; c < in_nodes / 4; c++) { + vint8m4_t _weight = vle8_v_i8m4(weight, vl * 4); + _acc0 = vpmaqa_vx_i32m4(_acc0, input_ptr[c], _weight, vl); + weight += 4 * vl; + } + vse32_v_i32m4(output, _acc0, vl); + output += vl; + out_nodes -= vl; + } +} +/* int4 FC: per batch row, int32 accumulate, requantize, then shl_rvv_saturated_int4 writes the output. */ +int shl_rvv_fullyconnected_packn_int4_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *weights_data = (int8_t *)weights->data; + int32_t *bias_data = (int32_t *)bias->data; + const int output_dims_count = output->dim_count; + const int weights_dims_count = weights->dim_count; + const int bias_dims_count = bias->dim_count; + int batches = 1; + /* compute the outer size */ + for (int i = 0; i < output_dims_count - 1; i++) { + batches *= output->dim[i]; + } + const int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes + const int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes + int k_2 = (((accum_depth - 1) & -2) + 2) >> 1; // pair of int4, col of weight_matrix + + int32_t *output_tmp = (int32_t *)shl_mem_alloc(output_depth * sizeof(int32_t)); + int vl; + + for (int b =
0; b < batches; b++) { + int8_t *input_ptr = input_data + b * accum_depth; + int8_t *weight_ptr = weights_data; + int32_t *bias_ptr = bias_data; + int32_t *output_ptr = output_tmp; + /* NOTE(review): input steps b * accum_depth bytes but output steps output_depth / 2 - confirm stride */ + shl_rvv_fullyconnectd_packn_int4_internel(input_ptr, output_ptr, weight_ptr, bias_ptr, k_2, + output_depth); + /* one scale for all outputs, or one per output channel; any other quant_channel skips requantize */ + if (weights->quant_channel == 1) { + shl_rvv_requantize(output_ptr, weights->qinfo->multiplier, weights->qinfo->shift, + output_depth); + } else if (weights->quant_channel == output_depth) { + // support channel quantization + for (int c = 0; c < weights->quant_channel; c++) { + shl_rvv_requantize(output_ptr + c, weights->qinfo[c].multiplier, + weights->qinfo[c].shift, 1); + } + } + shl_rvv_saturated_int4(output_ptr, output_data + b * output_depth / 2, + output->qinfo->zero_point, output_depth); + } + if (output_tmp) { + shl_mem_free(output_tmp); + output_tmp = NULL; + } + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/fullyconnected_int8.c b/source/thead_rvv/fullyconnected_int8.c index 729e9b7f..e0e43d7b 100644 --- a/source/thead_rvv/fullyconnected_int8.c +++ b/source/thead_rvv/fullyconnected_int8.c @@ -16,87 +16,191 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -static void csi_nn_rvv_reorder_weight_packn_int8(int8_t *src, int8_t *dst, int m, int k, int ldx) +static void shl_rvv_reorder_weight_packn_int8(int8_t *src, int8_t *dst, int m, int k, int ldx) { - int packn = csrr_vlenb() / sizeof(int8_t); // VLEN128=16 VLEN256=32 + const int packn = csrr_vlenb() / sizeof(int8_t); // VLEN128=16 VLEN256=32 int vl = vsetvl_e8m1(packn); - int i = 0; - for (; i + packn - 1 < m; i += packn) { - int8_t *in_ptr = src + i * k; - for (int j = 0; j < k; j++) { - vint8m1_t _input = vlse8_v_i8m1(in_ptr, k * sizeof(int8_t), vl); - in_ptr++; - vse8_v_i8m1(dst, _input, vl); - dst += packn; - } - } - if (i < m) { - vl = vsetvl_e8m1(m & (packn - 1)); - int8_t *in_ptr = src + i * k; + + while (m > 0) { + vl = vsetvl_e8m1(m); + int8_t *in_ptr = src; for (int j = 0; j < k; j++) { vint8m1_t _input = vlse8_v_i8m1(in_ptr, k * sizeof(int8_t), vl); in_ptr++; vse8_v_i8m1(dst, _input, vl); dst += vl; } + src += vl * k; + m -= vl; } } -void csi_nn_rvv_fc_gemv_transform_weight_int8(struct csi_tensor *weights) +void shl_rvv_fc_gemv_transform_weight_int8(struct csinn_tensor *weights) { int8_t *weight_data = (int8_t *)weights->data; int n = weights->dim[0]; // out_nodes int k = weights->dim[1]; // in_nodes - int8_t *pa_reorder = (int8_t *)csi_mem_alloc(n * k * sizeof(int8_t)); - csi_nn_rvv_reorder_weight_packn_int8(weight_data, pa_reorder, n, k, k); + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(n * k * sizeof(int8_t)); + shl_rvv_reorder_weight_packn_int8(weight_data, pa_reorder, n, k, k); memcpy(weight_data, pa_reorder, n * k * sizeof(int8_t)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -static void csi_nn_rvv_fullyconnectd_packn_int8_internel(const int8_t *input, int32_t *output, - 
int8_t *weight, const int32_t *bias, - int in_nodes, int out_nodes) +static void shl_rvv_fullyconnectd_packn_int8_internel(const int8_t *input, int32_t *output, + int8_t *weight, const int32_t *bias, + int in_nodes, int out_nodes) { - int i = 0; - int packn = csrr_vlenb() / sizeof(int8_t); + const int packn = csrr_vlenb() / sizeof(int8_t); int vl = vsetvl_e8m1(packn); - for (; i + packn - 1 < out_nodes; i += packn) { + + while (out_nodes > 0) { + vl = vsetvl_e8m1(out_nodes); vint32m4_t _acc = vle32_v_i32m4(bias, vl); + bias += vl; for (int j = 0; j < in_nodes; j++) { vint8m1_t _weight = vle8_v_i8m1(weight, vl); vint16m2_t _mul = vwmul_vx_i16m2(_weight, input[j], vl); _acc = vwmacc_vx_i32m4(_acc, 1, _mul, vl); weight += vl; } - bias += vl; vse32_v_i32m4(output, _acc, vl); output += vl; + out_nodes -= vl; } - if (i < out_nodes) { - vl = vsetvl_e32m4(out_nodes & (packn - 1)); // tail out_node - vint32m4_t _acc = vle32_v_i32m4(bias, vl); - for (int j = 0; j < in_nodes; j++) { - vint8m1_t _weight = vle8_v_i8m1(weight, vl); - vint16m2_t _mul = vwmul_vx_i16m2(_weight, input[j], vl); - _acc = vwmacc_vx_i32m4(_acc, 1, _mul, vl); - weight += vl; +} + +int shl_rvv_fullyconnected_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *weights_data = (int8_t *)weights->data; + int32_t *bias_data = (int32_t *)bias->data; + const int output_dims_count = output->dim_count; + const int weights_dims_count = weights->dim_count; + const int bias_dims_count = bias->dim_count; + int batches = 1; + /* compute the outer size */ + for (int i = 0; i < output_dims_count - 1; i++) { + batches *= output->dim[i]; + } + const int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes + const int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes + + int32_t 
*output_tmp = (int32_t *)shl_mem_alloc(output_depth * sizeof(int32_t)); + int vl; + + for (int b = 0; b < batches; b++) { + int8_t *input_ptr = input_data + b * accum_depth; + int8_t *weight_ptr = weights_data; + int32_t *bias_ptr = bias_data; + int32_t *output_ptr = output_tmp; + + shl_rvv_fullyconnectd_packn_int8_internel(input_ptr, output_ptr, weight_ptr, bias_ptr, + accum_depth, output_depth); + + if (weights->quant_channel == 1) { + shl_rvv_requantize(output_ptr, weights->qinfo->multiplier, weights->qinfo->shift, + output_depth); + } else if (weights->quant_channel == output_depth) { + // support channel quantization + for (int c = 0; c < weights->quant_channel; c++) { + shl_rvv_requantize(output_ptr + c, weights->qinfo[c].multiplier, + weights->qinfo[c].shift, 1); + } } - vse32_v_i32m4(output, _acc, vl); + shl_rvv_saturated_int8(output_ptr, output_data + b * output_depth, + output->qinfo->zero_point, output_depth); + } + if (output_tmp) { + shl_mem_free(output_tmp); + output_tmp = NULL; + } + return CSINN_TRUE; +} + +/************************************ dot **********************************************/ +#ifdef XTHEADV +static void shl_rvv_reorder_weight_packn_int8_dot(int8_t *src, int8_t *dst, int m, int k, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(int8_t); + int vl = vsetvl_e8m1(packn); + + while (m > 0) { + vl = vsetvl_e8m1(m); + int32_t *in_ptr0 = (int32_t *)src; + int32_t *out_ptr0 = (int32_t *)dst; + int j = 0; + for (; j + 7 < k; j += 8) { + vint32m4_t _nf0, _nf1; + vlsseg2e32_v_i32m4(&_nf0, &_nf1, in_ptr0, k * sizeof(int8_t), vl); + in_ptr0 += 2; + vse32_v_i32m4(out_ptr0, _nf0, vl); + out_ptr0 += vl; + vse32_v_i32m4(out_ptr0, _nf1, vl); + out_ptr0 += vl; + } + for (; j + 3 < k; j += 4) { + vint32m4_t _input = vlse32_v_i32m4(in_ptr0, k * sizeof(int8_t), vl); + in_ptr0++; + vse32_v_i32m4(out_ptr0, _input, vl); + out_ptr0 += vl; + } + src += vl * k; + dst += vl * k; + m -= vl; + } +} + +void shl_rvv_fc_gemv_transform_weight_int8_dot(struct 
csinn_tensor *weights) +{ + int8_t *weight_data = (int8_t *)weights->data; + + int n = weights->dim[0]; // out_nodes + int k = weights->dim[1]; // in_nodes + + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(n * k * sizeof(int8_t)); + shl_rvv_reorder_weight_packn_int8_dot(weight_data, pa_reorder, n, k, k); + memcpy(weight_data, pa_reorder, n * k * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} + +static void shl_rvv_fullyconnectd_packn_int8_internel_dot(const int8_t *input, int32_t *output, + int8_t *weight, const int32_t *bias, + int in_nodes, int out_nodes) +{ + const int packn = csrr_vlenb() / sizeof(int8_t); + int vl = vsetvl_e8m1(packn); + + while (out_nodes > 0) { + vl = vsetvl_e8m1(out_nodes); + int32_t *input_ptr = (int32_t *)input; + vint32m4_t _acc0 = vle32_v_i32m4(bias, vl); + bias += vl; + for (int c = 0; c < in_nodes / 4; c++) { + vint8m4_t _weight = vle8_v_i8m4(weight, vl * 4); + _acc0 = vmaqa_vx_i32m4(_acc0, input_ptr[c], _weight, vl); + weight += 4 * vl; + } + vse32_v_i32m4(output, _acc0, vl); + output += vl; + out_nodes -= vl; } } -int csi_nn_rvv_fullyconnected_packn_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) +int shl_rvv_fullyconnected_packn_int8_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -113,7 +217,7 @@ int csi_nn_rvv_fullyconnected_packn_int8(struct csi_tensor *input, struct csi_te const int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes const int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes - int32_t *output_tmp = (int32_t *)csi_mem_alloc(output_depth * sizeof(int32_t)); + int32_t *output_tmp = (int32_t *)shl_mem_alloc(output_depth * sizeof(int32_t)); int vl; for (int b = 0; b < batches; b++) { @@ 
-122,25 +226,26 @@ int csi_nn_rvv_fullyconnected_packn_int8(struct csi_tensor *input, struct csi_te int32_t *bias_ptr = bias_data; int32_t *output_ptr = output_tmp; - csi_nn_rvv_fullyconnectd_packn_int8_internel(input_ptr, output_ptr, weight_ptr, bias_ptr, - accum_depth, output_depth); + shl_rvv_fullyconnectd_packn_int8_internel_dot(input_ptr, output_ptr, weight_ptr, bias_ptr, + accum_depth, output_depth); if (weights->quant_channel == 1) { - csi_nn_rvv_requantize(output_ptr, weights->qinfo->multiplier, weights->qinfo->shift, - output_depth); + shl_rvv_requantize(output_ptr, weights->qinfo->multiplier, weights->qinfo->shift, + output_depth); } else if (weights->quant_channel == output_depth) { // support channel quantization for (int c = 0; c < weights->quant_channel; c++) { - csi_nn_rvv_requantize(output_ptr + c, weights->qinfo[c].multiplier, - weights->qinfo[c].shift, 1); + shl_rvv_requantize(output_ptr + c, weights->qinfo[c].multiplier, + weights->qinfo[c].shift, 1); } } - csi_nn_rvv_saturated_int8(output_ptr, output_data + b * output_depth, - output->qinfo->zero_point, output_depth); + shl_rvv_saturated_int8(output_ptr, output_data + b * output_depth, + output->qinfo->zero_point, output_depth); } if (output_tmp) { - csi_mem_free(output_tmp); + shl_mem_free(output_tmp); output_tmp = NULL; } return CSINN_TRUE; } +#endif diff --git a/source/thead_rvv/gemm_fp16.c b/source/thead_rvv/gemm_fp16.c index c707509e..38369bdb 100644 --- a/source/thead_rvv/gemm_fp16.c +++ b/source/thead_rvv/gemm_fp16.c @@ -16,99 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/************************************************************* - note: VLEN = 128 -*************************************************************/ -void csi_nn_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) -{ - int i = 0; - for (; i + 7 < m; i += 8) { - for (int j = 0; j < k; j++) { - sa[i * k + 8 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 8 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 8 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 8 * j + 3] = a[(i + 3) * k + j]; - sa[i * k + 8 * j + 4] = a[(i + 4) * k + j]; - sa[i * k + 8 * j + 5] = a[(i + 5) * k + j]; - sa[i * k + 8 * j + 6] = a[(i + 6) * k + j]; - sa[i * k + 8 * j + 7] = a[(i + 7) * k + j]; - } - } - - for (; i + 3 < m; i += 4) { - for (int j = 0; j < k; j++) { - sa[i * k + 4 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 4 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 4 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 4 * j + 3] = a[(i + 3) * k + j]; - } - } - - for (; i + 1 < m; i += 2) { - for (int j = 0; j < k; j++) { - sa[i * k + 2 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 2 * j + 1] = a[(i + 1) * k + j]; - } - } - - for (; i < m; i++) { - for (int j = 0; j < k; j++) { - sa[i * k + 1 * j + 0] = a[(i + 0) * k + j]; - } - } -} +/************************************************************************ + * input matrix and kernel matrix have been reordered + ***********************************************************************/ -void csi_nn_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e16m2(16); - __fp16 *b0 = NULL; - int i = 0; - for (; i + 15 < n; i += 16) { - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat16m2_t _tmp = vle16_v_f16m2(b0, vl); - b0 += ldx; - vse16_v_f16m2(sb, _tmp, vl); - sb += 16; - } - } - - for (; i + 7 < n; i += 8) { - vl = vsetvl_e16m1(8); - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat16m1_t 
_tmp = vle16_v_f16m1(b0, vl); - b0 += ldx; - vse16_v_f16m1(sb, _tmp, vl); - sb += 8; - } - } - - for (; i < n; i++) { - vl = vsetvl_e16m2(16); - b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vfloat16m2_t _tmp = vlse16_v_f16m2(b0, ldx * sizeof(__fp16), vl); - b0 += 16 * ldx; - vse16_v_f16m2(sb, _tmp, vl); - sb += 16; - } - if (j < k) { - vl = vsetvl_e16m2(k & 15); - vfloat16m2_t _tmp = vlse16_v_f16m2(b0, ldx * sizeof(__fp16), vl); - vse16_v_f16m2(sb, _tmp, vl); - sb += vl; - } - } -} - -void csi_nn_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, int n, - int ldc, __fp16 *bias) +// vlen=128 +void shl_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc) { __fp16 *kernel_data = (__fp16 *)sa; __fp16 *input_data = (__fp16 *)sb; @@ -117,7 +35,7 @@ void csi_nn_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (__fp16 *)csi_mem_alloc(m * sizeof(__fp16)); + bias = (__fp16 *)shl_mem_alloc(m * sizeof(__fp16)); } __fp16 *bias_ptr = bias; @@ -553,111 +471,14 @@ void csi_nn_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } -/************************************************************* - note: VLEN = 256 -*************************************************************/ -void csi_nn_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) -{ - int i = 0; - - for (; i + 15 < m; i += 16) { - for (int j = 0; j < k; j++) { - sa[i * k + 16 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 16 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 16 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 16 * j + 3] = a[(i + 3) * k + j]; - sa[i * k + 16 * j + 4] = a[(i + 4) * k + j]; - sa[i * k + 16 * j + 5] = a[(i + 5) * k + j]; - sa[i * k + 16 * j + 6] = a[(i + 6) * k + j]; - 
sa[i * k + 16 * j + 7] = a[(i + 7) * k + j]; - sa[i * k + 16 * j + 8] = a[(i + 8) * k + j]; - sa[i * k + 16 * j + 9] = a[(i + 9) * k + j]; - sa[i * k + 16 * j + 10] = a[(i + 10) * k + j]; - sa[i * k + 16 * j + 11] = a[(i + 11) * k + j]; - sa[i * k + 16 * j + 12] = a[(i + 12) * k + j]; - sa[i * k + 16 * j + 13] = a[(i + 13) * k + j]; - sa[i * k + 16 * j + 14] = a[(i + 14) * k + j]; - sa[i * k + 16 * j + 15] = a[(i + 15) * k + j]; - } - } - - for (; i + 7 < m; i += 8) { - for (int j = 0; j < k; j++) { - sa[i * k + 8 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 8 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 8 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 8 * j + 3] = a[(i + 3) * k + j]; - sa[i * k + 8 * j + 4] = a[(i + 4) * k + j]; - sa[i * k + 8 * j + 5] = a[(i + 5) * k + j]; - sa[i * k + 8 * j + 6] = a[(i + 6) * k + j]; - sa[i * k + 8 * j + 7] = a[(i + 7) * k + j]; - } - } - - for (; i + 3 < m; i += 4) { - for (int j = 0; j < k; j++) { - sa[i * k + 4 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 4 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 4 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 4 * j + 3] = a[(i + 3) * k + j]; - } - } - - for (; i + 1 < m; i += 2) { - for (int j = 0; j < k; j++) { - sa[i * k + 2 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 2 * j + 1] = a[(i + 1) * k + j]; - } - } - - for (; i < m; i++) { - for (int j = 0; j < k; j++) { - sa[i * k + 1 * j + 0] = a[(i + 0) * k + j]; - } - } -} - -void csi_nn_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e16m1(16); - __fp16 *b0 = NULL; - int i = 0; - for (; i + 15 < n; i += 16) { - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat16m1_t _tmp = vle16_v_f16m1(b0, vl); - b0 += ldx; - vse16_v_f16m1(sb, _tmp, vl); - sb += 16; - } - } - - for (; i < n; i++) { - vl = vsetvl_e16m1(16); - b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vfloat16m1_t _tmp = vlse16_v_f16m1(b0, ldx * sizeof(__fp16), vl); - b0 += 16 * ldx; - vse16_v_f16m1(sb, _tmp, vl); - sb += 16; - 
} - if (j < k) { - vl = vsetvl_e16m1(k & 15); - vfloat16m1_t _tmp = vlse16_v_f16m1(b0, ldx * sizeof(__fp16), vl); - vse16_v_f16m1(sb, _tmp, vl); - sb += vl; - } - } -} - -void csi_nn_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, - int n, int ldc, __fp16 *bias) +// vlen=256 +void shl_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int m, int k, int n, int ldc) { __fp16 *kernel_data = (__fp16 *)sa; __fp16 *input_data = (__fp16 *)sb; @@ -666,7 +487,7 @@ void csi_nn_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 * int flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (__fp16 *)csi_mem_alloc(m * 2); + bias = (__fp16 *)shl_mem_alloc(m * 2); } __fp16 *bias_ptr = bias; @@ -1143,7 +964,7 @@ void csi_nn_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 * } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } diff --git a/source/thead_rvv/gemm_fp16_packn.c b/source/thead_rvv/gemm_fp16_packn.c new file mode 100644 index 00000000..62e13d7f --- /dev/null +++ b/source/thead_rvv/gemm_fp16_packn.c @@ -0,0 +1,944 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: VLEN = 128/256 ... flexible vlen + * input matrix and kernel matrix have been reordered + *************************************************************/ + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] + * sb - input: [n/8, k, 8] + **************************************************************/ +void shl_rvv_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, int ldc) +{ + __fp16 *kernel_data = (__fp16 *)sa; + __fp16 *input_data = (__fp16 *)sb; + __fp16 *output_data = dst; + + int flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * sizeof(__fp16)); + } + __fp16 *bias_ptr = bias; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + int vl = vsetvl_e16m1(packn); + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + __fp16 *output0 = output_data + oc * n; // 16 channel dot output + __fp16 *output1 = output0 + packn * n; + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = 
vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc14 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc15 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc16 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc17 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f16m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f16m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f16m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f16m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 + packn * 3, _acc13, 
vl); + vse16_v_f16m1(output1 + packn * 4, _acc14, vl); + vse16_v_f16m1(output1 + packn * 5, _acc15, vl); + vse16_v_f16m1(output1 + packn * 6, _acc16, vl); + vse16_v_f16m1(output1 + packn * 7, _acc17, vl); + output1 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 + packn * 3, _acc13, vl); + output1 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, 
vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + output1 += packn * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += packn * 1; + + vse16_v_f16m1(output1, _acc10, vl); + output1 += packn * 1; + } + } + + for (; oc + packn - 1 < m; oc += packn) { + __fp16 *output0 = output_data + oc * n; // 8 channel dot output + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = 
vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = 
kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += packn * 1; + } + } + + /* tail output_channel */ + if (oc < m) { + vl = vsetvl_e16m1(m - oc); + __fp16 *output0 = output_data + oc * n; // 8 channel dot output + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = 
vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + vse16_v_f16m1(output0 + vl * 4, _acc04, vl); + vse16_v_f16m1(output0 + vl * 5, _acc05, vl); + vse16_v_f16m1(output0 + vl * 6, _acc06, vl); + vse16_v_f16m1(output0 + vl * 7, _acc07, vl); + output0 += vl * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = 
kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += vl * 1; + } + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +/************************************************************** + * dst - output: [m/packn, n, packn]; holds bias + sa * sb, computed with 12-column register blocking (column tiles of 12/8/4/2/1) + * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn], then [k, m - oc] for the tail channels + * sb - input: [n/12, k, 12], then at most one [k, 8], [k, 4], [k, 2] and [k, 1] block for the leftover columns + **************************************************************/ +void shl_rvv_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, int ldc) /* ldc is never referenced in the body */ +{ + __fp16 *kernel_data = (__fp16 *)sa; + __fp16 *input_data = (__fp16 *)sb; + __fp16 *output_data = dst; + + int flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * sizeof(__fp16)); /* temporary bias, freed before return; NOTE(review): not NULL-checked, and presumably zero-filled by shl_mem_alloc - confirm */ + } + __fp16 *bias_ptr = bias; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + int vl = vsetvl_e16m1(packn); + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + __fp16 *output0 = output_data + oc * n; // pack2n channel dot output + __fp16 *output1 = output0 + packn * n; + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc08 = vmv_v_v_f16m1(_acc00,
vl); + vfloat16m1_t _acc09 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0a = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0b = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc14 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc15 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc16 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc17 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc18 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc19 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc1a = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc1b = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f16m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f16m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f16m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f16m1(_acc0b, img0[11], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f16m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f16m1(_acc15, img0[5], _kernel1, vl); +
_acc16 = vfmacc_vf_f16m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f16m1(_acc17, img0[7], _kernel1, vl); + _acc18 = vfmacc_vf_f16m1(_acc18, img0[8], _kernel1, vl); + _acc19 = vfmacc_vf_f16m1(_acc19, img0[9], _kernel1, vl); + _acc1a = vfmacc_vf_f16m1(_acc1a, img0[10], _kernel1, vl); + _acc1b = vfmacc_vf_f16m1(_acc1b, img0[11], _kernel1, vl); + img0 += 12; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + vse16_v_f16m1(output0 + packn * 8, _acc08, vl); + vse16_v_f16m1(output0 + packn * 9, _acc09, vl); + vse16_v_f16m1(output0 + packn * 10, _acc0a, vl); + vse16_v_f16m1(output0 + packn * 11, _acc0b, vl); + output0 += packn * 12; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 + packn * 3, _acc13, vl); + vse16_v_f16m1(output1 + packn * 4, _acc14, vl); + vse16_v_f16m1(output1 + packn * 5, _acc15, vl); + vse16_v_f16m1(output1 + packn * 6, _acc16, vl); + vse16_v_f16m1(output1 + packn * 7, _acc17, vl); + vse16_v_f16m1(output1 + packn * 8, _acc18, vl); + vse16_v_f16m1(output1 + packn * 9, _acc19, vl); + vse16_v_f16m1(output1 + packn * 10, _acc1a, vl); + vse16_v_f16m1(output1 + packn * 11, _acc1b, vl); + output1 += packn * 12; + } + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00,
vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc14 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc15 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc16 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc17 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f16m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f16m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f16m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f16m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0
+ packn * 7, _acc07, vl); + output0 += packn * 8; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 + packn * 3, _acc13, vl); + vse16_v_f16m1(output1 + packn * 4, _acc14, vl); + vse16_v_f16m1(output1 + packn * 5, _acc15, vl); + vse16_v_f16m1(output1 + packn * 6, _acc16, vl); + vse16_v_f16m1(output1 + packn * 7, _acc17, vl); + output1 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 +
packn * 3, _acc13, vl); + output1 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + output1 += packn * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += packn * 1; + + vse16_v_f16m1(output1, _acc10, vl); + output1 += packn * 1; + } + } + + for (; oc + packn - 1 < m; oc += packn) { + __fp16 *output0 = output_data + oc * n; // packn channel dot output + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *k0 = kernel_data + oc * k; // packn channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); +
vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc08 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc09 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0a = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0b = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f16m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f16m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f16m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f16m1(_acc0b, img0[11], _kernel0, vl); + + img0 += 12; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + vse16_v_f16m1(output0 + packn * 8, _acc08, vl); + vse16_v_f16m1(output0 + packn * 9, _acc09, vl); + vse16_v_f16m1(output0 + packn * 10, _acc0a, vl); + vse16_v_f16m1(output0 + packn * 11, _acc0b, vl); + output0 += packn * 12; + } + for (; t + 7 < n; t += 8) { + const __fp16
*k0 = kernel_data + oc * k; // packn channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // packn channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02,
img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // packn channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // packn channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += packn * 1; + } + } + + /* tail output_channel: fewer than packn channels remain, so vl is re-requested for m - oc elements */ + if (oc < m) { + vl = vsetvl_e16m1(m - oc); + __fp16 *output0 = output_data + oc * n; // tail channel dot output + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *k0 = kernel_data + oc * k; // tail channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc08 =
vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc09 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0a = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0b = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f16m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f16m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f16m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f16m1(_acc0b, img0[11], _kernel0, vl); + + img0 += 12; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + vse16_v_f16m1(output0 + vl * 4, _acc04, vl); + vse16_v_f16m1(output0 + vl * 5, _acc05, vl); + vse16_v_f16m1(output0 + vl * 6, _acc06, vl); + vse16_v_f16m1(output0 + vl * 7, _acc07, vl); + vse16_v_f16m1(output0 + vl * 8, _acc08, vl); + vse16_v_f16m1(output0 + vl * 9, _acc09, vl); + vse16_v_f16m1(output0 + vl * 10, _acc0a, vl); + vse16_v_f16m1(output0 + vl * 11, _acc0b, vl); + output0 += vl * 12; + } + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // tail channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t
_acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + vse16_v_f16m1(output0 + vl * 4, _acc04, vl); + vse16_v_f16m1(output0 + vl * 5, _acc05, vl); + vse16_v_f16m1(output0 + vl * 6, _acc06, vl); + vse16_v_f16m1(output0 + vl * 7, _acc07, vl); + output0 += vl * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // tail channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // tail channel
kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // tail channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += vl * 1; + } + } + + if (!flag_bias) { + shl_mem_free(bias); /* release the temporary bias allocated when the caller passed NULL */ + bias = NULL; + } +} diff --git a/source/thead_rvv/sgemm.c b/source/thead_rvv/gemm_fp32.c similarity index 86% rename from source/thead_rvv/sgemm.c rename to source/thead_rvv/gemm_fp32.c index 148ea628..8e48f721 100644 --- a/source/thead_rvv/sgemm.c +++ b/source/thead_rvv/gemm_fp32.c @@ -16,99 +16,22 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/************************************************************* - note: VLEN = 128 -*************************************************************/ -void csi_nn_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx) -{ - int i = 0; - for (; i + 7 < m; i += 8) { - for (int j = 0; j < k; j++) { - sa[i * k + 8 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 8 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 8 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 8 * j + 3] = a[(i + 3) * k + j]; - sa[i * k + 8 * j + 4] = a[(i + 4) * k + j]; - sa[i * k + 8 * j + 5] = a[(i + 5) * k + j]; - sa[i * k + 8 * j + 6] = a[(i + 6) * k + j]; - sa[i * k + 8 * j + 7] = a[(i + 7) * k + j]; - } - } - - for (; i + 3 < m; i += 4) { - for (int j = 0; j < k; j++) { - sa[i * k + 4 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 4 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 4 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 4 * j + 3] = a[(i + 3) * k + j]; - } - } - - for (; i + 1 < m; i += 2) { - for (int j = 0; j < k; j++) { - sa[i * k + 2 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 2 * j + 1] = a[(i + 1) * k + j]; - } - } - - for (; i < m; i++) { - for (int j = 0; j < k; j++) { - sa[i * k + 1 * j + 0] = a[(i + 0) * k + j]; - } - } -} - -/************************************************************** - * input—matrix: [k, n] - * src: b - * dst: sb - * Data arrangement: Z8 | | | - **************************************************************/ -void csi_nn_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx) -{ - int32_t vl = vsetvl_e32m2(8); - float *b0 = NULL; - int i = 0; - for (; i + 7 < n; i += 8) { - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat32m2_t _tmp = vle32_v_f32m2(b0, vl); - b0 += ldx; - vse32_v_f32m2(sb, _tmp, vl); - sb += 8; - } - } - - for (; i < n; i++) { - vl = vsetvl_e32m2(8); - b0 = b + i; - int j = 0; - for (; j + 7 < k; j += 8) { - 
vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); - b0 += 8 * ldx; - vse32_v_f32m2(sb, _tmp, vl); - sb += 8; - } - if (j < k) { - vl = vsetvl_e32m2(k & 7); - vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); - vse32_v_f32m2(sb, _tmp, vl); - sb += vl; - } - } -} +/************************************************************************ + * input matrix and kernel matrix have been reordered + ***********************************************************************/ /* dst - output:[m, n] sa - kernel: [m, k] sb - input: [k, n] */ -void csi_nn_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias) +// vlen=128 +void shl_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, int k, + int n, int ldc) { float *kernel_data = (float *)sa; float *input_data = (float *)sb; @@ -117,7 +40,7 @@ void csi_nn_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, int int flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (float *)csi_mem_alloc(m * sizeof(float)); + bias = (float *)shl_mem_alloc(m * sizeof(float)); } float *bias_ptr = bias; @@ -462,67 +385,14 @@ void csi_nn_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, int } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } -/************************************************************* - note: VLEN = 256 -*************************************************************/ -// kernel 数据排布 可复用 csi_nn_rvv_reorder_kernel_n8 - -void csi_nn_rvv256_reorder_input_z16_fp32(float *b, float *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e32m2(16); - float *b0 = NULL; - int i = 0; - - // Z16 - for (; i + 15 < n; i += 16) { - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat32m2_t _tmp = vle32_v_f32m2(b0, vl); - b0 += ldx; - vse32_v_f32m2(sb, _tmp, vl); - sb += 16; - } - } - - // Z8 - for (; i + 7 < n; i += 8) { - vl = 
vsetvl_e32m1(8); - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat32m1_t _tmp = vle32_v_f32m1(b0, vl); - b0 += ldx; - vse32_v_f32m1(sb, _tmp, vl); - sb += 8; - } - } - - // col by col - for (; i < n; i++) { - vl = vsetvl_e32m2(16); - b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); - b0 += 16 * ldx; - vse32_v_f32m2(sb, _tmp, vl); - sb += 16; - } - if (j < k) { - vl = vsetvl_e32m2(k & 15); - vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); - vse32_v_f32m2(sb, _tmp, vl); - sb += vl; - } - } -} - -void csi_nn_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias) +// vlen=256 +void shl_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc) { float *kernel_data = (float *)sa; float *input_data = (float *)sb; @@ -531,7 +401,7 @@ void csi_nn_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, int flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (float *)csi_mem_alloc(m * sizeof(float)); + bias = (float *)shl_mem_alloc(m * sizeof(float)); } float *bias_ptr = bias; @@ -963,7 +833,7 @@ void csi_nn_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } diff --git a/source/thead_rvv/gemm_fp32_packn.c b/source/thead_rvv/gemm_fp32_packn.c new file mode 100644 index 00000000..5b2ff514 --- /dev/null +++ b/source/thead_rvv/gemm_fp32_packn.c @@ -0,0 +1,946 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: VLEN = 128/256 ... flexible vlen + * input matrix and kernel matrix have been reordered + * PS: two register-blocking schemes are implemented here; for vlen128 fp32 they are 8*8 and 8*12. + * The two copies of the code could be merged into one, selected by a macro or a condition. + *************************************************************/ + +/************************************************************** + * dst - output: [m/packn, n, packn]; bias may be NULL; ldc is not referenced + * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn], then the tail channels as [k, tail] + * sb - input: [n/8, k, 8], remaining columns in groups of 4, 2, 1 + **************************************************************/ +void shl_rvv_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, int ldc) +{ + float *kernel_data = (float *)sa; + float *input_data = (float *)sb; + float *output_data = dst; + + int flag_bias = 1; // default: the conv2d layer includes a bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); // NOTE(review): used as an all-zero bias; presumes shl_mem_alloc zero-fills and never returns NULL - confirm + } + float *bias_ptr = bias; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + int vl = vsetvl_e32m1(packn); + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + float *output0 = output_data + oc * n; // pack2n channel dot output + float *output1 = output0 + packn * n; + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 =
vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc14 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc15 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc16 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc17 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f32m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f32m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f32m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f32m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02,
vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + vse32_v_f32m1(output1 + packn * 4, _acc14, vl); + vse32_v_f32m1(output1 + packn * 5, _acc15, vl); + vse32_v_f32m1(output1 + packn * 6, _acc16, vl); + vse32_v_f32m1(output1 + packn * 7, _acc17, vl); + output1 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); +
vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + output1 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + output1 += packn * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += packn * 1; + + vse32_v_f32m1(output1, _acc10, vl); + output1 += packn * 1; + } + } + + for (; oc + packn - 1 < m; oc += packn) { + float *output0 = output_data + oc * n; // packn channel dot output + const float *img0 = input_data; + const float
*b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // packn channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // packn channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 =
vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // packn channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // packn channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += packn * 1; + } + } + + /* tail output_channel */ + if (oc < m) { + vl = vsetvl_e32m1(m - oc); + float *output0 = output_data + oc * n; // tail channel dot output + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // tail channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t
_acc07 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + vse32_v_f32m1(output0 + vl * 4, _acc04, vl); + vse32_v_f32m1(output0 + vl * 5, _acc05, vl); + vse32_v_f32m1(output0 + vl * 6, _acc06, vl); + vse32_v_f32m1(output0 + vl * 7, _acc07, vl); + output0 += vl * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // tail channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // tail channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); +
vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // tail channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += vl * 1; + } + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +/************************************************************** + * dst - output: [m/packn, n, packn]; bias may be NULL; ldc is not referenced + * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn], then the tail channels as [k, tail] + * sb - input: [n/12, k, 12], remaining columns in groups of 8, 4, 2, 1 + **************************************************************/ +void shl_rvv_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, int ldc) +{ + float *kernel_data = (float *)sa; + float *input_data = (float *)sb; + float *output_data = dst; + + int flag_bias = 1; // default: the conv2d layer includes a bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); // NOTE(review): used as an all-zero bias; presumes shl_mem_alloc zero-fills and never returns NULL - confirm + } + float *bias_ptr = bias; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + int vl = vsetvl_e32m1(packn); + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + float *output0 = output_data + oc * n; // pack2n channel dot output + float *output1 = output0 + packn * n; + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const float *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat32m1_t _acc00 =
vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc08 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc09 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0a = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0b = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc14 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc15 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc16 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc17 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc18 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc19 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc1a = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc1b = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f32m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f32m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f32m1(_acc0a, img0[10],
_kernel0, vl); + _acc0b = vfmacc_vf_f32m1(_acc0b, img0[11], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f32m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f32m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f32m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f32m1(_acc17, img0[7], _kernel1, vl); + _acc18 = vfmacc_vf_f32m1(_acc18, img0[8], _kernel1, vl); + _acc19 = vfmacc_vf_f32m1(_acc19, img0[9], _kernel1, vl); + _acc1a = vfmacc_vf_f32m1(_acc1a, img0[10], _kernel1, vl); + _acc1b = vfmacc_vf_f32m1(_acc1b, img0[11], _kernel1, vl); + img0 += 12; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + vse32_v_f32m1(output0 + packn * 8, _acc08, vl); + vse32_v_f32m1(output0 + packn * 9, _acc09, vl); + vse32_v_f32m1(output0 + packn * 10, _acc0a, vl); + vse32_v_f32m1(output0 + packn * 11, _acc0b, vl); + output0 += packn * 12; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + vse32_v_f32m1(output1 + packn * 4, _acc14, vl); + vse32_v_f32m1(output1 + packn * 5, _acc15, vl); + vse32_v_f32m1(output1 + packn * 6, _acc16, vl); + vse32_v_f32m1(output1 + packn * 7, _acc17, vl); + vse32_v_f32m1(output1 + packn * 8, _acc18, vl); + vse32_v_f32m1(output1 + packn * 9, _acc19, vl); + vse32_v_f32m1(output1 + packn * 10, _acc1a, vl); + vse32_v_f32m1(output1 + packn * 11,
_acc1b, vl); + output1 += packn * 12; + } + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc14 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc15 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc16 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc17 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f32m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f32m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f32m1(_acc16, img0[6], _kernel1, vl); +
_acc17 = vfmacc_vf_f32m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + vse32_v_f32m1(output1 + packn * 4, _acc14, vl); + vse32_v_f32m1(output1 + packn * 5, _acc15, vl); + vse32_v_f32m1(output1 + packn * 6, _acc16, vl); + vse32_v_f32m1(output1 + packn * 7, _acc17, vl); + output1 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 =
vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + output1 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + output1 += packn * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // pack2n channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += packn * 1; + + vse32_v_f32m1(output1,
_acc10, vl); + output1 += packn * 1; + } + } + + for (; oc + packn - 1 < m; oc += packn) { + float *output0 = output_data + oc * n; // packn channel dot output + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const float *k0 = kernel_data + oc * k; // packn channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc08 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc09 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0a = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0b = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f32m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f32m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f32m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f32m1(_acc0b, img0[11], _kernel0, vl); + + img0 += 12; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4,
_acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + vse32_v_f32m1(output0 + packn * 8, _acc08, vl); + vse32_v_f32m1(output0 + packn * 9, _acc09, vl); + vse32_v_f32m1(output0 + packn * 10, _acc0a, vl); + vse32_v_f32m1(output0 + packn * 11, _acc0b, vl); + output0 += packn * 12; + } + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // packn channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // packn channel kernel + vfloat32m1_t
_acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // packn channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // packn channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += packn * 1; + } + } + + /* tail output_channel */ + if (oc < m) { + vl = vsetvl_e32m1(m - oc); + float *output0 = output_data + oc * n; // tail channel dot output + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const float *k0 = kernel_data + oc * k; // tail channel kernel +
vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc08 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc09 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0a = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0b = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f32m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f32m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f32m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f32m1(_acc0b, img0[11], _kernel0, vl); + + img0 += 12; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + vse32_v_f32m1(output0 + vl * 4, _acc04, vl); + vse32_v_f32m1(output0 + vl * 5, _acc05, vl); + vse32_v_f32m1(output0 + vl * 6, _acc06, vl); + vse32_v_f32m1(output0 + vl * 7, _acc07, vl); + vse32_v_f32m1(output0 + vl * 8, _acc08, vl); + vse32_v_f32m1(output0 + vl * 9, _acc09, vl); + vse32_v_f32m1(output0 + vl * 10, _acc0a, vl); + vse32_v_f32m1(output0 + vl * 11, _acc0b, vl); + output0
+= vl * 12; + } + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // tail channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + vse32_v_f32m1(output0 + vl * 4, _acc04, vl); + vse32_v_f32m1(output0 + vl * 5, _acc05, vl); + vse32_v_f32m1(output0 + vl * 6, _acc06, vl); + vse32_v_f32m1(output0 + vl * 7, _acc07, vl); + output0 += vl * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // tail channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); +
_acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // tail channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // tail channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += vl * 1; + } + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/thead_rvv/gemm_int4.c b/source/thead_rvv/gemm_int4.c index 918b2581..732a10f9 100644 --- a/source/thead_rvv/gemm_int4.c +++ b/source/thead_rvv/gemm_int4.c @@ -16,11 +16,10 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ -#ifdef __riscv_xtheadv - -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" +#ifdef XTHEADV static vint8mf4_t requantize_m2(vint32m2_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) { @@ -49,122 +48,125 @@ static vint8mf8_t requantize_m1(vint32m1_t _src, int32_t multiplier, int32_t shi * note: VLEN = 128 * layerout: input/output-[n, h, w , c] kernel-[o, h, w, i] *************************************************************/ -void csi_nn_rvv_reorder_input_n8_int4(int8_t *a, int8_t *sa, int m, int k, int ldx) +void shl_rvv_reorder_input_n8_int4(int8_t *a, int8_t *sa, int m, int k, int ldx) { - int k4 = ((k - 1) & -4) + 4; - int i = 0; - // m8 - for (; i + 7 < m; i += 8) { - int j = 0; - // k16 - int32_t *in_ptr0 = (int32_t *)a; - int32_t *out_ptr0 = (int32_t *)sa; - for (; j + 15 < k; j += 16) { - vint32m2_t _nf0, _nf1, _nf2, _nf3; - vlsseg4e32_v_i32m2(&_nf0, &_nf1, &_nf2, &_nf3, in_ptr0, k * sizeof(int8_t), 8); - in_ptr0 += 4; - vse32_v_i32m2(out_ptr0, _nf0, 8); - out_ptr0 += 8; - vse32_v_i32m2(out_ptr0, _nf1, 8); - out_ptr0 += 8; - vse32_v_i32m2(out_ptr0, _nf2, 8); - out_ptr0 += 8; - vse32_v_i32m2(out_ptr0, _nf3, 8); - out_ptr0 += 8; - } - for (; j + 3 < k; j += 4) { - vint32m2_t _input = vlse32_v_i32m2(in_ptr0, k * sizeof(int8_t), 8); - in_ptr0++; - vse32_v_i32m2(out_ptr0, _input, 8); - out_ptr0 += 8; - } - if (j < k) { - int8_t *in_ptr1 = (int8_t *)in_ptr0; - int8_t *out_ptr1 = (int8_t *)out_ptr0; - for (int c = 0; c < 8; c++) { - vint8m1_t _input1 = vle8_v_i8m1(in_ptr1, k & 3); - in_ptr1 += k; - vse8_v_i8m1(out_ptr1, _input1, 4); - out_ptr1 += 4; + if (k % 4 == 0) { + int i = 0; + // m8 + for (; i + 7 < m; i += 8) { + int j = 0; + // k16 + int32_t *in_ptr0 = (int32_t *)a; + int32_t *out_ptr0 = (int32_t *)sa; + for (; j + 15 < k; j += 16) { + vint32m2_t _nf0, _nf1, _nf2, _nf3; + vlsseg4e32_v_i32m2(&_nf0, &_nf1, &_nf2, &_nf3, in_ptr0, k * sizeof(int8_t), 8); + in_ptr0 += 4; + 
vse32_v_i32m2(out_ptr0, _nf0, 8); + out_ptr0 += 8; + vse32_v_i32m2(out_ptr0, _nf1, 8); + out_ptr0 += 8; + vse32_v_i32m2(out_ptr0, _nf2, 8); + out_ptr0 += 8; + vse32_v_i32m2(out_ptr0, _nf3, 8); + out_ptr0 += 8; } - } - a += 8 * k; - sa += 8 * k4; - } - // m4 - for (; i + 3 < m; i += 4) { - int j = 0; - int32_t *in_ptr0 = (int32_t *)a; - int32_t *out_ptr0 = (int32_t *)sa; - for (; j + 15 < k; j += 16) { - vint32m1_t _nf0, _nf1, _nf2, _nf3; - vlsseg4e32_v_i32m1(&_nf0, &_nf1, &_nf2, &_nf3, in_ptr0, k * sizeof(int8_t), 4); - in_ptr0 += 4; - vse32_v_i32m1(out_ptr0, _nf0, 4); - out_ptr0 += 4; - vse32_v_i32m1(out_ptr0, _nf1, 4); - out_ptr0 += 4; - vse32_v_i32m1(out_ptr0, _nf2, 4); - out_ptr0 += 4; - vse32_v_i32m1(out_ptr0, _nf3, 4); - out_ptr0 += 4; - } - for (; j + 3 < k; j += 4) { - vint32m1_t _input = vlse32_v_i32m1(in_ptr0, k * sizeof(int8_t), 4); - in_ptr0++; - vse32_v_i32m1(out_ptr0, _input, 4); - out_ptr0 += 4; - } - if (j < k) { - int8_t *in_ptr1 = (int8_t *)in_ptr0; - int8_t *out_ptr1 = (int8_t *)out_ptr0; - for (int c = 0; c < 4; c++) { - vint8m1_t _input1 = vle8_v_i8m1(in_ptr1, k & 3); - in_ptr1 += k; - vse8_v_i8m1(out_ptr1, _input1, 4); - out_ptr1 += 4; + for (; j + 3 < k; j += 4) { + vint32m2_t _input = vlse32_v_i32m2(in_ptr0, k * sizeof(int8_t), 8); + in_ptr0++; + vse32_v_i32m2(out_ptr0, _input, 8); + out_ptr0 += 8; + } + if (j < k) { + int8_t *in_ptr1 = (int8_t *)in_ptr0; + int8_t *out_ptr1 = (int8_t *)out_ptr0; + for (int c = 0; c < 8; c++) { + vint8m1_t _input1 = vle8_v_i8m1(in_ptr1, k & 3); + in_ptr1 += k; + vse8_v_i8m1(out_ptr1, _input1, 4); + out_ptr1 += 4; + } } + a += 8 * k; + sa += 8 * k; } - a += 4 * k; - sa += 4 * k4; - } - // m2 - for (; i + 1 < m; i += 2) { - int j = 0; - for (; j + 3 < k; j += 4) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 2; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); - in_ptr += k; - vse8_v_i8m1(sa, _input, 4); - sa += 4; + // m4 + for (; i + 3 < m; i += 4) { + int j = 0; + int32_t *in_ptr0 = (int32_t *)a; + 
int32_t *out_ptr0 = (int32_t *)sa; + for (; j + 15 < k; j += 16) { + vint32m1_t _nf0, _nf1, _nf2, _nf3; + vlsseg4e32_v_i32m1(&_nf0, &_nf1, &_nf2, &_nf3, in_ptr0, k * sizeof(int8_t), 4); + in_ptr0 += 4; + vse32_v_i32m1(out_ptr0, _nf0, 4); + out_ptr0 += 4; + vse32_v_i32m1(out_ptr0, _nf1, 4); + out_ptr0 += 4; + vse32_v_i32m1(out_ptr0, _nf2, 4); + out_ptr0 += 4; + vse32_v_i32m1(out_ptr0, _nf3, 4); + out_ptr0 += 4; } + for (; j + 3 < k; j += 4) { + vint32m1_t _input = vlse32_v_i32m1(in_ptr0, k * sizeof(int8_t), 4); + in_ptr0++; + vse32_v_i32m1(out_ptr0, _input, 4); + out_ptr0 += 4; + } + if (j < k) { + int8_t *in_ptr1 = (int8_t *)in_ptr0; + int8_t *out_ptr1 = (int8_t *)out_ptr0; + for (int c = 0; c < 4; c++) { + vint8m1_t _input1 = vle8_v_i8m1(in_ptr1, k & 3); + in_ptr1 += k; + vse8_v_i8m1(out_ptr1, _input1, 4); + out_ptr1 += 4; + } + } + a += 4 * k; + sa += 4 * k; } - if (j < k) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 2; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); - in_ptr += k; - vse8_v_i8m1(sa, _input, k & 3); - sa += 4; + // m2 + for (; i + 1 < m; i += 2) { + int j = 0; + for (; j + 3 < k; j += 4) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 2; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); + in_ptr += k; + vse8_v_i8m1(sa, _input, 4); + sa += 4; + } } + if (j < k) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 2; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); + in_ptr += k; + vse8_v_i8m1(sa, _input, k & 3); + sa += 4; + } + } + a += 2 * k; } - a += 2 * k; - } - // m1 - for (; i < m; i++) { - memcpy(sa, a, k * sizeof(int8_t)); + // m1 + for (; i < m; i++) { + memcpy(sa, a, k * sizeof(int8_t)); + } + } else { + shl_rvv_reorder_kernel_n8_int8(a, sa, m, k, ldx); } } -// 和 csi_nn_rvv_reorder_kernel_n8_int8 实现相同, 可以直接调用 csi_nn_rvv_reorder_kernel_n8_int8 -void csi_nn_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb, int n, int k, int ldx) +// 和 shl_rvv_reorder_kernel_n8_int8 实现相同, 可以直接调用 shl_rvv_reorder_kernel_n8_int8 +void 
shl_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb, int n, int k, int ldx) { // TODO: } -void csi_nn_rvv_gemm_8x8_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, - int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +void shl_rvv_gemm_8x8_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, + int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) { int8_t *input_data = (int8_t *)sa; int8_t *kernel_data = (int8_t *)sb; diff --git a/source/thead_rvv/gemm_int4_packn.c b/source/thead_rvv/gemm_int4_packn.c new file mode 100644 index 00000000..a6cbde9a --- /dev/null +++ b/source/thead_rvv/gemm_int4_packn.c @@ -0,0 +1,374 @@ +/* + * Copyright (C) 2016-2021 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * note: VLEN = 128/256 ... 
flexible vlen + * input matrix and kernel matrix have been reordered + *************************************************************/ +static vint8mf4_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint32m2_t _shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _multiplier, vl); + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + vint8mf4_t _res = vpnclip_wx_i8mf4(vreinterpret_v_i8mf2_i16mf2(_tmp2), 0, vl / 2); + return _res; +} + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/packn, k, packn] + * sb - input: [n/12, k, 12] + XXX: k is the accumulation dimension counted in int8 elements + **************************************************************/ +void shl_rvv_ncxhwx_gemm_12xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + int8_t *kernel_data = (int8_t *)sa; + int8_t *input_data = (int8_t *)sb; + int8_t *output_data = dst; + int32_t *bias_data = bias; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e32m2(packn); + + for (int oc = 0; oc + packn - 1 < m; oc += packn) { + vint32m2_t _mult = vle32_v_i32m2(mult + oc, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + oc, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + int8_t *output0 = output_data + (oc / 2) * n; + const int32_t *img0 = (const int32_t *)input_data; + const int32_t *b0 = bias_data + oc; + + int t = 0; + for (; t + 11 < n; t += 12) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t
_acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc8 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc9 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acca = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _accb = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + _acc8 = vmaqa_vx_i32m2(_acc8, img0[8], _kernel0, vl); + _acc9 = vmaqa_vx_i32m2(_acc9, img0[9], _kernel0, vl); + _acca = vmaqa_vx_i32m2(_acca, img0[10], _kernel0, vl); + _accb = vmaqa_vx_i32m2(_accb, img0[11], _kernel0, vl); + + img0 += 12; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf4_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf4_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf4_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf4_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + vint8mf4_t _res8 = requantize_m2_s(_acc8, _mult, _shift, out_zp, vl); + vint8mf4_t _res9 = requantize_m2_s(_acc9, _mult, _shift, out_zp, vl); + vint8mf4_t _resa = requantize_m2_s(_acca, _mult, _shift, out_zp, vl); + vint8mf4_t _resb = requantize_m2_s(_accb, _mult, _shift, out_zp, vl); + + 
vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 4, _res4, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 5, _res5, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 6, _res6, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 7, _res7, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 8, _res8, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 9, _res9, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 10, _resa, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 11, _resb, vl / 2); + + output0 += packn / 2 * 12; + } + for (; t + 7 < n; t += 8) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + + img0 += 8; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + 
vint8mf4_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf4_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf4_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf4_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 4, _res4, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 5, _res5, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 6, _res6, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 7, _res7, vl / 2); + + output0 += packn / 2 * 8; + } + for (; t + 3 < n; t += 4) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + + img0 += 4; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + + output0 += packn / 2 * 4; + } + for (; t + 1 < n; t += 2) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = 
vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + img0 += 2; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + output0 += packn / 2 * 2; + } + for (; t < n; t++) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + img0 += 1; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vse8_v_i8mf4(output0, _res0, vl / 2); + output0 += packn / 2 * 1; + } + } +} + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/packn, k, packn] + * sb - input: [n/8, k, 8] + **************************************************************/ +void shl_rvv_ncxhwx_gemm_8xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + int8_t *kernel_data = (int8_t *)sa; + int8_t *input_data = (int8_t *)sb; + int8_t *output_data = dst; + int32_t *bias_data = bias; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e32m2(packn); + + for (int oc = 0; oc + packn - 1 < m; oc += packn) { + vint32m2_t _mult = vle32_v_i32m2(mult + oc, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + oc, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + int8_t *output0 = output_data + (oc / 2) * n; + const int32_t *img0 = (const int32_t *)input_data; + const int32_t *b0 = 
bias_data + oc; + + int t = 0; + for (; t + 7 < n; t += 8) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + + img0 += 8; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf4_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf4_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf4_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf4_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 4, _res4, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 5, _res5, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 6, _res6, vl / 
2); + vse8_v_i8mf4(output0 + packn / 2 * 7, _res7, vl / 2); + + output0 += packn / 2 * 8; + } + for (; t + 3 < n; t += 4) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + + img0 += 4; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + + output0 += packn / 2 * 4; + } + for (; t + 1 < n; t += 2) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + img0 += 2; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + output0 += packn / 2 * 2; + } + for (; t < n; t++) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = 
vle32_v_i32m2(b0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + img0 += 1; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vse8_v_i8mf4(output0, _res0, vl / 2); + output0 += packn / 2 * 1; + } + } +} +#endif diff --git a/source/thead_rvv/gemm_int8.c b/source/thead_rvv/gemm_int8.c index a953d88a..f364aba4 100644 --- a/source/thead_rvv/gemm_int8.c +++ b/source/thead_rvv/gemm_int8.c @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ -#ifdef __riscv_xtheadv - -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" +#ifdef XTHEADV static vint8mf2_t requantize_m2(vint32m2_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) { @@ -85,177 +84,9 @@ static vint8mf4_t requantize_m1_s(vint32m1_t _src, int32_t *multiplier, int32_t return _tmp2; } -/************************************************************* - note: VLEN = 128 -*************************************************************/ -void csi_nn_rvv_reorder_kernel_n8_int8(int8_t *a, int8_t *sa, int m, int k, int ldx) -{ - int i = 0; - for (; i + 7 < m; i += 8) { - int j = 0; - for (; j + 3 < k; j += 4) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 8; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); - in_ptr += k; - vse8_v_i8m1(sa, _input, 4); - sa += 4; - } - } - // k_tail - if (j < k) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 8; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); - in_ptr += k; - vse8_v_i8m1(sa, _input, k & 3); - sa += 4; - } - } - a += 8 * k; - } - for (; i + 3 < m; i += 4) { - int j = 0; - for (; j + 3 < k; j += 4) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 4; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); - in_ptr += k; - vse8_v_i8m1(sa, _input, 4); - sa += 4; - } - } - if (j < k) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 4; c++) { - 
vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); - in_ptr += k; - vse8_v_i8m1(sa, _input, k & 3); - sa += 4; - } - } - a += 4 * k; - } - for (; i + 1 < m; i += 2) { - int j = 0; - for (; j + 3 < k; j += 4) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 2; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); - in_ptr += k; - vse8_v_i8m1(sa, _input, 4); - sa += 4; - } - } - if (j < k) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 2; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); - in_ptr += k; - vse8_v_i8m1(sa, _input, k & 3); - sa += 4; - } - } - a += 2 * k; - } - for (; i < m; i++) { - memcpy(sa, a, k * sizeof(int8_t)); - } -} - -void csi_nn_rvv_reorder_input_z8_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e8m1(8); - int i = 0; - for (; i + 7 < n; i += 8) { - int8_t *b0 = b + i; - int j = 0; - for (; j + 3 < k; j += 4) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb += 32 - 3; - } - // k_tail - if (j < k) { - int8_t *sb0 = sb; - for (; j < k; j++) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); - sb0++; - } - sb += 32; - } - } - for (; i + 3 < n; i += 4) { - vl = vsetvl_e8m1(4); - int8_t *b0 = b + i; - int j = 0; - for (; j + 3 < k; j += 4) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * 
sizeof(int8_t), _tmp, vl); - sb += 13; - } - // k_tail - if (j < k) { - int8_t *sb0 = sb; - for (; j < k; j++) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); - sb0++; - } - sb += 16; - } - } - // n_tail - for (; i < n; i++) { - vl = vsetvl_e8m1(16); - int8_t *b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); - b0 += 16 * ldx; - vse8_v_i8m1(sb, _tmp, vl); - sb += 16; - } - if (j < k) { - vl = vsetvl_e8m1(k & 15); - vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); - vse8_v_i8m1(sb, _tmp, vl); - sb += ((k & 15) / 4 + 1) * 4; - } - } -} - -void csi_nn_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, - int n, int ldc, int32_t *bias) +// vlen=128 +void shl_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc) { int8_t *kernel_data = (int8_t *)sa; int8_t *input_data = (int8_t *)sb; @@ -638,8 +469,8 @@ void csi_nn_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, } } -void csi_nn_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, - int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +void shl_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc, int32_t out_zp, int32_t *mult, int32_t *shift) { int8_t *kernel_data = (int8_t *)sa; int8_t *input_data = (int8_t *)sb; @@ -1068,101 +899,8 @@ void csi_nn_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, i /************************************************************* note: VLEN = 256 *************************************************************/ -// kernel 数据排布 可复用 csi_nn_rvv_reorder_kernel_n8_int8 - -void csi_nn_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e8m1(16); - int i = 0; - for (; i + 15 < 
n; i += 16) { - int8_t *b0 = b + i; - int j = 0; - for (; j + 3 < k; j += 4) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb += 64 - 3; - } - // k_tail - if (j < k) { - int8_t *sb0 = sb; - for (; j < k; j++) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); - sb0++; - } - sb += 64; - } - } - for (; i + 7 < n; i += 8) { - vl = vsetvl_e8m1(8); - int8_t *b0 = b + i; - int j = 0; - for (; j + 3 < k; j += 4) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb += 32 - 3; - } - // k_tail - if (j < k) { - int8_t *sb0 = sb; - for (; j < k; j++) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); - sb0++; - } - sb += 32; - } - } - // n_tail - for (; i < n; i++) { - vl = vsetvl_e8m1(16); - int8_t *b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); - b0 += 16 * ldx; - vse8_v_i8m1(sb, _tmp, vl); - sb += 16; - } - if (j < k) { - vl = vsetvl_e8m1(k & 15); - vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); - vse8_v_i8m1(sb, _tmp, vl); - sb += ((k & 15) / 4 + 1) * 4; - } - } -} - -void csi_nn_rvv256_gemm_8x16_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, - int n, int ldc, 
int32_t *bias)
+void shl_rvv256_gemm_8x16_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias,
+                                int m, int k, int n, int ldc)
 {
     int8_t *kernel_data = (int8_t *)sa;
     int8_t *input_data = (int8_t *)sb;
diff --git a/source/thead_rvv/gemm_int8_packn.c b/source/thead_rvv/gemm_int8_packn.c
new file mode 100644
index 00000000..6615dbb0
--- /dev/null
+++ b/source/thead_rvv/gemm_int8_packn.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_thead_rvv.h"
+#ifdef XTHEADV
+/*************************************************************
+ * note: VLEN = 128/256 ... flexible vlen
+ * input matrix and kernel matrix have been reordered
+ *************************************************************/
+
+/*
+ * Requantize one vector of int32 accumulators to int8: high-half multiply by
+ * _multiplier, scaling right shift by _shift, add out_zp, then narrow with
+ * saturation in two steps (i32 -> i16 -> i8).
+ * _shift has already been pre-processed by the caller (it holds -1 - shift).
+ */
+static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint32m2_t _shift,
+                                  int32_t out_zp, int vl)
+{
+    vint32m2_t _mulh = vmulh_vv_i32m2(_src, _multiplier, vl);
+    _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl);
+    _mulh = vadd_vx_i32m2(_mulh, out_zp, vl);
+    vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl);
+    vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl);
+    return _tmp2;
+}
+
+/*
+ * Register-tiled micro-kernels. Each one computes vl output channels for T
+ * consecutive output columns (T = 12/8/4/2/1):
+ *   out  - first output byte of the tile; consecutive columns are vl bytes apart
+ *   k0   - reordered kernel of the current output-channel group
+ *   img0 - reordered input, read as int32 words (four packed int8 values each)
+ *   b0   - bias of the current output-channel group, seeds every accumulator
+ * k is consumed in steps of 4; the return value is img0 advanced past the tile.
+ */
+static inline const int32_t *gemm_tile12_packn_int8(int8_t *out, const int8_t *k0,
+                                                    const int32_t *img0, const int32_t *b0,
+                                                    int k, int vl, vint32m2_t _mult,
+                                                    vint32m2_t _shift, int32_t out_zp)
+{
+    vint32m2_t _acc0 = vle32_v_i32m2(b0, vl);
+    vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc8 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc9 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acca = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _accb = vmv_v_v_i32m2(_acc0, vl);
+
+    for (int c = 0; c + 3 < k; c += 4) {
+        vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4);
+        k0 += vl * 4;
+        _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl);
+        _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl);
+        _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl);
+        _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl);
+        _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl);
+        _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl);
+        _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl);
+        _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl);
+        _acc8 = vmaqa_vx_i32m2(_acc8, img0[8], _kernel0, vl);
+        _acc9 = vmaqa_vx_i32m2(_acc9, img0[9], _kernel0, vl);
+        _acca = vmaqa_vx_i32m2(_acca, img0[10], _kernel0, vl);
+        _accb = vmaqa_vx_i32m2(_accb, img0[11], _kernel0, vl);
+        img0 += 12;
+    }
+
+    vse8_v_i8mf2(out, requantize_m2_s(_acc0, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 1, requantize_m2_s(_acc1, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 2, requantize_m2_s(_acc2, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 3, requantize_m2_s(_acc3, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 4, requantize_m2_s(_acc4, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 5, requantize_m2_s(_acc5, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 6, requantize_m2_s(_acc6, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 7, requantize_m2_s(_acc7, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 8, requantize_m2_s(_acc8, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 9, requantize_m2_s(_acc9, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 10, requantize_m2_s(_acca, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 11, requantize_m2_s(_accb, _mult, _shift, out_zp, vl), vl);
+    return img0;
+}
+
+static inline const int32_t *gemm_tile8_packn_int8(int8_t *out, const int8_t *k0,
+                                                   const int32_t *img0, const int32_t *b0,
+                                                   int k, int vl, vint32m2_t _mult,
+                                                   vint32m2_t _shift, int32_t out_zp)
+{
+    vint32m2_t _acc0 = vle32_v_i32m2(b0, vl);
+    vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl);
+
+    for (int c = 0; c + 3 < k; c += 4) {
+        vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4);
+        k0 += vl * 4;
+        _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl);
+        _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl);
+        _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl);
+        _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl);
+        _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl);
+        _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl);
+        _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl);
+        _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl);
+        img0 += 8;
+    }
+
+    vse8_v_i8mf2(out, requantize_m2_s(_acc0, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 1, requantize_m2_s(_acc1, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 2, requantize_m2_s(_acc2, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 3, requantize_m2_s(_acc3, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 4, requantize_m2_s(_acc4, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 5, requantize_m2_s(_acc5, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 6, requantize_m2_s(_acc6, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 7, requantize_m2_s(_acc7, _mult, _shift, out_zp, vl), vl);
+    return img0;
+}
+
+static inline const int32_t *gemm_tile4_packn_int8(int8_t *out, const int8_t *k0,
+                                                   const int32_t *img0, const int32_t *b0,
+                                                   int k, int vl, vint32m2_t _mult,
+                                                   vint32m2_t _shift, int32_t out_zp)
+{
+    vint32m2_t _acc0 = vle32_v_i32m2(b0, vl);
+    vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl);
+    vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl);
+
+    for (int c = 0; c + 3 < k; c += 4) {
+        vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4);
+        k0 += vl * 4;
+        _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl);
+        _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl);
+        _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl);
+        _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl);
+        img0 += 4;
+    }
+
+    vse8_v_i8mf2(out, requantize_m2_s(_acc0, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 1, requantize_m2_s(_acc1, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 2, requantize_m2_s(_acc2, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 3, requantize_m2_s(_acc3, _mult, _shift, out_zp, vl), vl);
+    return img0;
+}
+
+static inline const int32_t *gemm_tile2_packn_int8(int8_t *out, const int8_t *k0,
+                                                   const int32_t *img0, const int32_t *b0,
+                                                   int k, int vl, vint32m2_t _mult,
+                                                   vint32m2_t _shift, int32_t out_zp)
+{
+    vint32m2_t _acc0 = vle32_v_i32m2(b0, vl);
+    vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl);
+
+    for (int c = 0; c + 3 < k; c += 4) {
+        vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4);
+        k0 += vl * 4;
+        _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl);
+        _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl);
+        img0 += 2;
+    }
+
+    vse8_v_i8mf2(out, requantize_m2_s(_acc0, _mult, _shift, out_zp, vl), vl);
+    vse8_v_i8mf2(out + vl * 1, requantize_m2_s(_acc1, _mult, _shift, out_zp, vl), vl);
+    return img0;
+}
+
+static inline const int32_t *gemm_tile1_packn_int8(int8_t *out, const int8_t *k0,
+                                                   const int32_t *img0, const int32_t *b0,
+                                                   int k, int vl, vint32m2_t _mult,
+                                                   vint32m2_t _shift, int32_t out_zp)
+{
+    vint32m2_t _acc0 = vle32_v_i32m2(b0, vl);
+
+    for (int c = 0; c + 3 < k; c += 4) {
+        vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4);
+        k0 += vl * 4;
+        _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl);
+        img0 += 1;
+    }
+
+    vse8_v_i8mf2(out, requantize_m2_s(_acc0, _mult, _shift, out_zp, vl), vl);
+    return img0;
+}
+
+/*
+ * Shared driver of the packn int8 GEMM. Walks the m output channels in groups
+ * of at most packn (vl holds the real size of the current group) and, inside
+ * each group, the n output columns in tiles of 12 (only when use_tile12 is
+ * set), 8, 4, 2 and 1; sb is read sequentially across those tiles.
+ */
+static void gemm_packn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, const int32_t *bias,
+                            int m, int k, int n, int32_t out_zp, const int32_t *mult,
+                            const int32_t *shift, int use_tile12)
+{
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+
+    for (int oc = 0; oc < m; oc += packn) {
+        const int rest = m - oc;
+        const int vl = vsetvl_e32m2(rest < packn ? rest : packn);
+        vint32m2_t _mult = vle32_v_i32m2(mult + oc, vl);
+        vint32m2_t _shift = vle32_v_i32m2(shift + oc, vl);
+        _shift = vrsub_vx_i32m2(_shift, -1, vl);  // -1 - shift, see requantize_m2_s
+
+        const int8_t *k0 = sa + oc * k;
+        const int32_t *b0 = bias + oc;
+        const int32_t *img0 = (const int32_t *)sb;
+        int8_t *output0 = dst + oc * n;
+
+        int t = 0;
+        if (use_tile12) {
+            for (; t + 11 < n; t += 12) {
+                img0 = gemm_tile12_packn_int8(output0, k0, img0, b0, k, vl, _mult, _shift, out_zp);
+                output0 += vl * 12;
+            }
+        }
+        for (; t + 7 < n; t += 8) {
+            img0 = gemm_tile8_packn_int8(output0, k0, img0, b0, k, vl, _mult, _shift, out_zp);
+            output0 += vl * 8;
+        }
+        for (; t + 3 < n; t += 4) {
+            img0 = gemm_tile4_packn_int8(output0, k0, img0, b0, k, vl, _mult, _shift, out_zp);
+            output0 += vl * 4;
+        }
+        for (; t + 1 < n; t += 2) {
+            img0 = gemm_tile2_packn_int8(output0, k0, img0, b0, k, vl, _mult, _shift, out_zp);
+            output0 += vl * 2;
+        }
+        for (; t < n; t++) {
+            img0 = gemm_tile1_packn_int8(output0, k0, img0, b0, k, vl, _mult, _shift, out_zp);
+            output0 += vl;
+        }
+    }
+}
+
+/**************************************************************
+ * dst - output: [m/packn, n, packn]
+ * sa - kernel: [m/packn, k, packn]
+ * sb - input: [n/12, k, 12]
+ * bias must not be NULL (use the fuse_zp2bias option in hhb); ldc is unused
+ **************************************************************/
+void shl_rvv_ncxhwx_gemm_12xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb,
+                                       int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp,
+                                       int32_t *mult, int32_t *shift)
+{
+    (void)ldc;
+    gemm_packn_int8(dst, sa, sb, bias, m, k, n, out_zp, mult, shift, 1);
+}
+
+/**************************************************************
+ * dst - output: [m/packn, n, packn]
+ * sa - kernel: [m/packn, k, packn]
+ * sb - input: [n/8, k, 8]
+ * bias must not be NULL (use the fuse_zp2bias option in hhb); ldc is unused
+ **************************************************************/
+void shl_rvv_ncxhwx_gemm_8xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb,
+                                      int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp,
+                                      int32_t *mult, int32_t *shift)
+{
+    (void)ldc;
+    gemm_packn_int8(dst, sa, sb, bias, m, k, n, out_zp, mult, shift, 0);
+}
+#endif
diff --git
a/source/thead_rvv/global_avgpool.c b/source/thead_rvv/global_avgpool.c
index 69e949a6..0fb40a3f 100644
--- a/source/thead_rvv/global_avgpool.c
+++ b/source/thead_rvv/global_avgpool.c
@@ -16,15 +16,15 @@
  * limitations under the License.
  */
 
-/* CSI-NN2 version 1.12.x */
+/* CSI-NN2 version 2.0.x */
 
-#include "csi_thead_rvv.h"
+#include "shl_thead_rvv.h"
 
 /*************************************************************
     note: VLEN = 128/256
 *************************************************************/
-int csi_nn_rvv_global_avgpool2d_fp32(struct csi_tensor *input, struct csi_tensor *output,
-                                     struct pool_params *params)
+int shl_rvv_global_avgpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                                  struct csinn_pool_params *params)
 {
     float *input_data = (float *)input->data;
     float *output_data = (float *)output->data;
@@ -54,8 +54,8 @@ int csi_nn_rvv_global_avgpool2d_fp32(struct csi_tensor *input, struct csi_tensor
     return CSINN_TRUE;
 }
 
-int csi_nn_rvv_global_avgpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output,
-                                     struct pool_params *params)
+int shl_rvv_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
+                                  struct csinn_pool_params *params)
 {
     __fp16 *input_data = (__fp16 *)input->data;
     __fp16 *output_data = (__fp16 *)output->data;
diff --git a/source/thead_rvv/global_avgpool_packn.c b/source/thead_rvv/global_avgpool_packn.c
new file mode 100644
index 00000000..ca4f37d6
--- /dev/null
+++ b/source/thead_rvv/global_avgpool_packn.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_thead_rvv.h"
+
+/*************************************************************
+ * note: VLEN = 128/256 ... flexible vlen
+ *************************************************************/
+/*
+ * Global average pooling, fp32, packn layout.
+ * Reads input->dim[0..3] as batch, channels, height, width and consumes the
+ * data as packn-wide vectors: height * width consecutive vectors are summed
+ * and scaled by 1 / (height * width) to give one output vector.
+ * Only complete groups of packn channels are processed. params is unused.
+ * Always returns CSINN_TRUE.
+ */
+int shl_rvv_global_avgpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                                        struct csinn_pool_params *params)
+{
+    float *input_data = (float *)input->data;
+    float *output_data = (float *)output->data;
+
+    int batch = input->dim[0];
+    int in_c = input->dim[1];
+    int in_h = input->dim[2];
+    int in_w = input->dim[3];
+    int in_hw = in_h * in_w;
+
+    const int packn = csrr_vlenb() / sizeof(float);
+    const int vl = vsetvl_e32m1(packn);
+
+    for (int b = 0; b < batch; b++) {
+        for (int c = 0; c + packn - 1 < in_c; c += packn) {
+            vfloat32m1_t _acc = vle32_v_f32m1(input_data, vl);
+            input_data += packn;
+            for (int i = 1; i < in_hw; i++) {
+                _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(input_data, vl), vl);
+                input_data += packn;
+            }
+            vfloat32m1_t _avg = vfmul_vf_f32m1(_acc, 1.0f / (float)in_hw, vl);
+            vse32_v_f32m1(output_data, _avg, vl);
+            output_data += packn;
+        }
+    }
+    return CSINN_TRUE;
+}
+
+/*
+ * fp16 variant of shl_rvv_global_avgpool2d_packn_fp32: same traversal, with
+ * packn = vlenb / sizeof(__fp16) and the sum kept in an fp16 vector.
+ * Always returns CSINN_TRUE.
+ */
+int shl_rvv_global_avgpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
+                                        struct csinn_pool_params *params)
+{
+    __fp16 *input_data = (__fp16 *)input->data;
+    __fp16 *output_data = (__fp16 *)output->data;
+
+    int batch = input->dim[0];
+    int in_c = input->dim[1];
+    int in_h = input->dim[2];
+    int in_w = input->dim[3];
+    int in_hw = in_h * in_w;
+
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int vl = vsetvl_e16m1(packn);
+
+    for (int b = 0; b < batch; b++) {
+        for (int c = 0; c + packn - 1 < in_c; c += packn) {
+            vfloat16m1_t _acc = vle16_v_f16m1(input_data, vl);
+            input_data += packn;
+            for (int i = 1; i < in_hw; i++) {
+                _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(input_data, vl), vl);
+                input_data += packn;
+            }
+            vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 1.0f / in_hw, vl);
+            vse16_v_f16m1(output_data, _avg, vl);
+            output_data += packn;
+        }
+    }
+    return CSINN_TRUE;
+}
+
+/*
+ * int8 --> fp16 acc --> int8
+ * Each input vector is dequantized ((x - input zero_point) * input scale),
+ * accumulated in fp16, averaged, requantized with the output scale and
+ * zero_point, and narrowed back to int8 with saturation.
+ * Implemented for RVV 1.0 only: on RVV 0.7.1 it logs an error and returns
+ * CSINN_FALSE.
+ */
+int shl_rvv_global_avgpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                        struct csinn_pool_params *params)
+{
+#ifdef RVV_1_0_0
+    int8_t *input_data = (int8_t *)input->data;
+    int8_t *output_data = (int8_t *)output->data;
+
+    int batch = input->dim[0];
+    int in_c = input->dim[1];
+    int in_h = input->dim[2];
+    int in_w = input->dim[3];
+    int in_hw = in_h * in_w;
+
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+    const int vl = vsetvl_e8mf2(packn);
+
+    for (int b = 0; b < batch; b++) {
+        for (int c = 0; c + packn - 1 < in_c; c += packn) {
+            vint8mf2_t _input = vle8_v_i8mf2(input_data, vl);
+            input_data += packn;
+            vint16m1_t _tmp = vwsub_vx_i16m1(_input, (int8_t)input->qinfo->zero_point, vl);
+            vfloat16m1_t _acc =
+                vfmul_vf_f16m1(vfcvt_f_x_v_f16m1(_tmp, vl), input->qinfo->scale, vl);
+            for (int i = 1; i < in_hw; i++) {
+                _tmp = vwsub_vx_i16m1(vle8_v_i8mf2(input_data, vl),
+                                      (int8_t)input->qinfo->zero_point, vl);
+                vfloat16m1_t _inputf =
+                    vfmul_vf_f16m1(vfcvt_f_x_v_f16m1(_tmp, vl), input->qinfo->scale, vl);
+                _acc = vfadd_vv_f16m1(_acc, _inputf, vl);
+                input_data += packn;
+            }
+            vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 1.0f / in_hw / output->qinfo->scale, vl);
+            _avg = vfadd_vf_f16m1(_avg, output->qinfo->zero_point, vl);
+            vint16m1_t _output = vfcvt_x_f_v_i16m1(_avg, vl);
+            vint8mf2_t _res = vnclip_wx_i8mf2(_output, 0, vl);
+            vse8_v_i8mf2(output_data, _res, vl);
+            output_data += packn;
+        }
+    }
+    return CSINN_TRUE;
+#elif defined RVV_0_7_1
+    shl_debug_error("unsupport global_avgpool2d packn for int8 on rvv_spec 0.7.1\n");
+    return CSINN_FALSE;
+#endif
+}
diff --git a/source/thead_rvv/global_maxpool.c b/source/thead_rvv/global_maxpool.c
index 5eccf907..4361f51e 100644
--- a/source/thead_rvv/global_maxpool.c
+++ b/source/thead_rvv/global_maxpool.c
@@ -16,15 +16,15 @@
  * limitations under the License.
  */
 
-/* CSI-NN2 version 1.12.x */
+/* CSI-NN2 version 2.0.x */
 
-#include "csi_thead_rvv.h"
+#include "shl_thead_rvv.h"
 
 /*************************************************************
     note: VLEN = 128/256
 *************************************************************/
-int csi_nn_rvv_global_maxpool2d_fp32(struct csi_tensor *input, struct csi_tensor *output,
-                                     struct pool_params *params)
+int shl_rvv_global_maxpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                                  struct csinn_pool_params *params)
 {
     float *input_data = (float *)input->data;
     float *output_data = (float *)output->data;
@@ -54,8 +54,8 @@ int csi_nn_rvv_global_maxpool2d_fp32(struct csi_tensor *input, struct csi_tensor
     return CSINN_TRUE;
 }
 
-int csi_nn_rvv_global_maxpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output,
-                                     struct pool_params *params)
+int shl_rvv_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
+                                  struct csinn_pool_params *params)
 {
     __fp16 *input_data = (__fp16 *)input->data;
     __fp16 *output_data = (__fp16 *)output->data;
diff --git a/source/thead_rvv/global_maxpool_packn.c b/source/thead_rvv/global_maxpool_packn.c
new file mode 100644
index 00000000..11284c22
--- /dev/null
+++ b/source/thead_rvv/global_maxpool_packn.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: VLEN = 128/256 ... flexible vlen + *************************************************************/ +int shl_rvv_global_maxpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int in_hw = in_h * in_w; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + vfloat32m1_t _max = vle32_v_f32m1(input_data, vl); + input_data += packn; + for (int i = 1; i < in_hw; i++) { + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(input_data, vl), vl); + input_data += packn; + } + vse32_v_f32m1(output_data, _max, vl); + output_data += packn; + } + } + return CSINN_TRUE; +} + +int shl_rvv_global_maxpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int in_hw = in_h * in_w; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + for (int 
b = 0; b < 
batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + vfloat16m1_t _max = vle16_v_f16m1(input_data, vl); + input_data += packn; + for (int i = 1; i < in_hw; i++) { + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(input_data, vl), vl); + input_data += packn; + } + vse16_v_f16m1(output_data, _max, vl); + output_data += packn; + } + } + return CSINN_TRUE; +} + +int shl_rvv_global_maxpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ +#ifdef RVV_1_0_0 + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int in_hw = in_h * in_w; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + vint8mf2_t _max = vle8_v_i8mf2(input_data, vl); /* running maximum, seeded with the first packn-wide vector */ + input_data += packn; + for (int i = 1; i < in_hw; i++) { + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(input_data, vl), vl); + input_data += packn; + } + vse8_v_i8mf2(output_data, _max, vl); /* raw int8 maximum is stored; no requantization is applied */ + output_data += packn; + } + } + return CSINN_TRUE; +#elif defined RVV_0_7_1 /* 0.7.1 build: no int8 packn kernel here, report the error and fail */ + shl_debug_error("unsupport global_maxpool2d packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} diff --git a/source/thead_rvv/leaky_relu.c b/source/thead_rvv/leaky_relu.c index 9f4eb418..24e63f71 100644 --- a/source/thead_rvv/leaky_relu.c +++ b/source/thead_rvv/leaky_relu.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: 
VLEN = 128/256 ...
*************************************************************/ -int csi_nn_rvv_leaky_relu_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_rvv_leaky_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; float alpha = params->n; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); while (size > 0) { int vl = vsetvl_e32m2(size); vfloat32m2_t _input = vle32_v_f32m2(input_data, vl); @@ -43,13 +43,13 @@ int csi_nn_rvv_leaky_relu_fp32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_nn_rvv_leaky_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_rvv_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; __fp16 alpha = (__fp16)params->n; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); while (size > 0) { int vl = vsetvl_e16m2(size); vfloat16m2_t _input = vle16_v_f16m2(input_data, vl); @@ -69,17 +69,17 @@ int csi_nn_rvv_leaky_relu_fp16(struct csi_tensor *input, struct csi_tensor *outp * else q2 = s1/s2 * alpha * (q1 -z1) + z2 * constrains: params->n < 0.5 * ******************************************************************/ -int csi_nn_rvv_leaky_relu_int8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_rvv_leaky_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; // TODO: move to init api float real_scale0 = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale0, &output->qinfo->multiplier, 
&output->qinfo->shift); + shl_quantize_multiplier(real_scale0, &output->qinfo->multiplier, &output->qinfo->shift); - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); while (size > 0) { int vl = vsetvl_e8m1(size); vint8m1_t _input = vle8_v_i8m1(input_data, vl); diff --git a/source/thead_rvv/maxpool.c b/source/thead_rvv/maxpool.c index 3db8457c..37227596 100644 --- a/source/thead_rvv/maxpool.c +++ b/source/thead_rvv/maxpool.c @@ -16,37 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - int32_t input_h = input->dim[2]; - int32_t input_w = input->dim[3]; - + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; int32_t kernel_h = params->filter_height; int32_t kernel_w = params->filter_width; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t pad_left = params->pad_left; int32_t pad_right = params->pad_right; int32_t pad_top = params->pad_top; int32_t pad_down = params->pad_down; - params->base.bc = NULL; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; - // global maxpool2d - if (input_h == kernel_h && input_w == kernel_w) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_global_maxpool2d_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_global_maxpool2d_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_ref_global_maxpool2d_quant; - } + const int packn = csrr_vlenb() / sizeof(float); + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_fp32 + : shl_rvv_global_maxpool2d_fp32; return CSINN_TRUE; } @@ -54,84 +51,243 @@ int csi_nn_rvv_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *outpu if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 if (pad_left == 0 && pad_top == 0) { // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (input_h % 2 == 1 && params->ceil_mode == 1) { + if (in_h % 2 == 1 && params->ceil_mode == 1) { if (params->pad_down == 0) params->pad_down++; } - if (input_w % 2 == 1 && params->ceil_mode == 1) { + if (in_w % 2 == 1 && params->ceil_mode == 1) { if (params->pad_right == 0) params->pad_right++; } // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp32 + : shl_rvv_maxpool2x2s2_fp32; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_int8; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp32 + : shl_rvv_maxpool2x2s2_p1_fp32; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool3x3s2_packn_fp32 + : shl_rvv_maxpool3x3s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_p1_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_p1_int8; + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp32 + : shl_rvv_maxpool3x3s2_p1_fp32; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s1_packn_fp32 + : shl_rvv_maxpool3x3s1_p1_fp32; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on rvv, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_f32; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_rvv_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_fp16 + : shl_rvv_global_maxpool2d_fp16; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp16 + : shl_rvv_maxpool2x2s2_fp16; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp16 + : shl_rvv_maxpool2x2s2_p1_fp16; } } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 if (pad_left == 0 && pad_top == 0) { // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (input_h % 2 == 0 && params->ceil_mode == 1) { + if (in_h % 2 == 0 && params->ceil_mode == 1) { if (params->pad_down == 0) params->pad_down++; // origin pad_down mast be equal to zero ? } - if (input_w % 2 == 0 && params->ceil_mode == 1) { + if (in_w % 2 == 0 && params->ceil_mode == 1) { if (params->pad_right == 0) params->pad_right++; } // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp16 + : shl_rvv_maxpool3x3s2_fp16; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_int8; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool3x3s2_packn_fp16 + : shl_rvv_maxpool3x3s2_p1_fp16; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s1_packn_fp16 + : shl_rvv_maxpool3x3s1_p1_fp16; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on rvv, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_rvv_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_int8 + : shl_ref_global_maxpool2d_quant; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool2x2s2_packn_int8 + : shl_rvv_maxpool2x2s2_int8; + } else if (pad_left == 1 && pad_top == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_p1_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_p1_int8; + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_int8 + : shl_rvv_maxpool2x2s2_p1_int8; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_int8 + : shl_rvv_maxpool3x3s2_int8; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_int8 + : shl_rvv_maxpool3x3s2_p1_int8; } } } else if (stride_h == 1 && stride_w == 1) { if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool3x3s1_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool3x3s1_p1_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool3x3s1_p1_int8; - } + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool3x3s1_packn_int8 + : shl_rvv_maxpool3x3s1_p1_int8; } } } - - if (params->base.bc == NULL) { - csi_debug_warning( - "maxpool is not optimized to achieve under this condition on RVV, call reference func " + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on rvv, call reference func " "replaced.\n"); - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_maxpool2d_f32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_maxpool2d_quant; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_ref_maxpool2d_quant; - } + cb->exec = shl_ref_maxpool2d_quant; // fixme: consider ncxhwx } return CSINN_TRUE; } + +int shl_rvv_maxpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + return CSINN_FALSE; +} + +int shl_rvv_global_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + int packn = 0; + + if (input->dtype == CSINN_DTYPE_FLOAT32) { + packn = csrr_vlenb() / sizeof(float); + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp32 + : shl_rvv_global_maxpool2d_fp32; + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + packn = csrr_vlenb() / sizeof(__fp16); + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp16 + : shl_rvv_global_maxpool2d_fp16; + } else if (input->dtype == CSINN_DTYPE_INT8) { + packn = csrr_vlenb() / sizeof(int8_t) / 2; + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_int8 + : shl_ref_global_maxpool2d_quant; + } + return (cb->exec != NULL) ? CSINN_TRUE : CSINN_FALSE; /* exec stays NULL for dtypes not handled above */ +} diff --git a/source/thead_rvv/maxpool_2x2_fp16.c b/source/thead_rvv/maxpool_2x2_fp16.c index b094377b..50825d74 100644 --- a/source/thead_rvv/maxpool_2x2_fp16.c +++ b/source/thead_rvv/maxpool_2x2_fp16.c @@ -16,15 +16,15 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_maxpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -118,8 +118,8 @@ int csi_nn_rvv_maxpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_maxpool2x2s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/maxpool_2x2_fp16_packn.c b/source/thead_rvv/maxpool_2x2_fp16_packn.c new file mode 100644 index 00000000..804f6521 --- /dev/null +++ b/source/thead_rvv/maxpool_2x2_fp16_packn.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_maxpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + // 1. pad the input uniformly first, then compute without handling padding in the kernel + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); /* scratch buffer for one padded batch item, reused for every batch */ + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; /* what is left of two padded rows after out_w steps of 2 * packn */ + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn 
- 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + + 
for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _max = vle16_v_f16m1(line0, vl); /* max over the 2x2 window: two vectors from line0, two from line1 */ + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn, vl), vl); + vse16_v_f16m1(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; /* together with the inner-loop steps this advances two padded rows */ + line1 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/maxpool_2x2.c b/source/thead_rvv/maxpool_2x2_fp32.c similarity index 95% rename from source/thead_rvv/maxpool_2x2.c rename to source/thead_rvv/maxpool_2x2_fp32.c index 1c1f44d4..c2b6b34b 100644 --- a/source/thead_rvv/maxpool_2x2.c +++ b/source/thead_rvv/maxpool_2x2_fp32.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -28,8 +28,8 @@ pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_maxpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -127,8 +127,8 @@ int csi_nn_rvv_maxpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *ou pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_maxpool2x2s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float
*)output->data; diff --git a/source/thead_rvv/maxpool_2x2_fp32_packn.c b/source/thead_rvv/maxpool_2x2_fp32_packn.c new file mode 100644 index 00000000..20989eb1 --- /dev/null +++ b/source/thead_rvv/maxpool_2x2_fp32_packn.c @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ + +/* + TODO: all kernel_size and stride variants could be written as a single interface, which would reduce the library size; for reference see: + /lhome/shaowg/hhb_workspace/csinn2/source/i805_ref/pooling/shl_pool_q7_HWC.c + or the maxpool implementation in ppl.nn +*/ + +int shl_rvv_maxpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + // 1. 
pad the input uniformly first, then compute without handling padding in the kernel + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); /* scratch buffer for one padded batch item, reused for every batch */ + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; /* what is left of two padded rows after out_w steps of 2 * packn */ + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _max = vle32_v_f32m1(line0, vl); /* max over the 2x2 window: two vectors from line0, two from line1 */ + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn, vl), vl); + vse32_v_f32m1(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; /* together with the inner-loop steps this advances two padded rows */ + line1 += tailstep; + } + } + 
input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/maxpool_2x2_int8.c b/source/thead_rvv/maxpool_2x2_int8.c index 38f56630..c72d533f 100644 --- a/source/thead_rvv/maxpool_2x2_int8.c +++ b/source/thead_rvv/maxpool_2x2_int8.c @@ -16,16
+16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /**************************************************************************** * note: VLEN = 128/256 ... * constrains: Input and outputs must all have same scale/zero_point ****************************************************************************/ -int csi_nn_rvv_maxpool2x2s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -120,8 +120,8 @@ int csi_nn_rvv_maxpool2x2s2_int8(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_maxpool2x2s2_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; diff --git a/source/thead_rvv/maxpool_2x2_int8_packn.c b/source/thead_rvv/maxpool_2x2_int8_packn.c new file mode 100644 index 00000000..c4392cb6 --- /dev/null +++ b/source/thead_rvv/maxpool_2x2_int8_packn.c @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_maxpool2x2s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ +#ifdef RVV_1_0_0 + // 1. pad the input uniformly first, then compute without handling padding in the kernel + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); /* scratch buffer for one padded batch item, reused for every batch */ + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; /* what is left of two padded rows after out_w steps of 2 * packn */ + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_int8(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left, + 
input->qinfo->zero_point); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *out0 = output_data + c * out_h * out_w; + const int8_t *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; /* the padded buffer holds padded_in_h rows per channel, so the plane offset must use padded_in_h, not in_h */ + const int8_t 
*line1 = line0 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vint8mf2_t _max = vle8_v_i8mf2(line0, vl); /* max over the 2x2 window: two vectors from line0, two from line1 */ + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn, vl), vl); + vse8_v_i8mf2(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; /* together with the inner-loop steps this advances two padded rows */ + line1 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +#elif defined RVV_0_7_1 /* 0.7.1 build: no int8 packn kernel here, report the error and fail */ + shl_debug_error("unsupport maxpool2x2s2 packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} diff --git a/source/thead_rvv/maxpool_3x3_fp16.c b/source/thead_rvv/maxpool_3x3_fp16.c index f6e2e88f..439c71a0 100644 --- a/source/thead_rvv/maxpool_3x3_fp16.c +++ b/source/thead_rvv/maxpool_3x3_fp16.c @@ -16,15 +16,15 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_maxpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -155,8 +155,8 @@ int csi_nn_rvv_maxpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_maxpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -360,8 +360,8 @@ int csi_nn_rvv_maxpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_nn_rvv_maxpool3x3s1_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/maxpool_3x3_fp16_packn.c b/source/thead_rvv/maxpool_3x3_fp16_packn.c new file mode 100644 index 00000000..37ba46bd --- /dev/null +++ b/source/thead_rvv/maxpool_3x3_fp16_packn.c @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_maxpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data 
+ c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + const __fp16 *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _max = vle16_v_f16m1(line0, vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn * 2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn * 2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2 + packn * 2, vl), vl); + vse16_v_f16m1(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + line2 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} + +int shl_rvv_maxpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c 
* padded_in_hw * sizeof(__fp16)); + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + const __fp16 *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _max = vle16_v_f16m1(line0, vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn * 2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn * 2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2 + packn * 2, vl), vl); + vse16_v_f16m1(out0, _max, vl); + + line0 += packn * 1; + line1 += packn * 1; + line2 += packn * 1; + out0 += packn; + } + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/maxpool_3x3.c b/source/thead_rvv/maxpool_3x3_fp32.c similarity index 97% rename from source/thead_rvv/maxpool_3x3.c rename to source/thead_rvv/maxpool_3x3_fp32.c index 8efeb11a..16ac7048 100644 --- a/source/thead_rvv/maxpool_3x3.c +++ b/source/thead_rvv/maxpool_3x3_fp32.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -28,8 +28,8 @@ pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_maxpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -161,8 +161,8 @@ int csi_nn_rvv_maxpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *ou pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_maxpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -357,8 +357,8 @@ int csi_nn_rvv_maxpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor pad_left = pad_right = pad_top = pad_down = 1 in_w = out_w in_h = out_h */ -int csi_nn_rvv_maxpool3x3s1_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; diff --git a/source/thead_rvv/maxpool_3x3_fp32_packn.c b/source/thead_rvv/maxpool_3x3_fp32_packn.c new file mode 100644 index 00000000..7ecf604b --- /dev/null +++ b/source/thead_rvv/maxpool_3x3_fp32_packn.c @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_maxpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * 
out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + const float *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _max = vle32_v_f32m1(line0, vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn * 2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn * 2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2 + packn * 2, vl), vl); + vse32_v_f32m1(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + line2 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} + +int shl_rvv_maxpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw 
* sizeof(float)); + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + const float *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _max = vle32_v_f32m1(line0, vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn * 2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn * 2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2 + packn * 2, vl), vl); + vse32_v_f32m1(out0, _max, vl); + + line0 += packn * 1; + line1 += packn * 1; + line2 += packn * 1; + out0 += packn; + } + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/maxpool_3x3_int8.c b/source/thead_rvv/maxpool_3x3_int8.c index 99a69054..d9b800b5 100644 --- a/source/thead_rvv/maxpool_3x3_int8.c +++ b/source/thead_rvv/maxpool_3x3_int8.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /**************************************************************************** * note: VLEN = 128/256 ... 
* constrains: Input and outputs must all have same scale/zero_point ****************************************************************************/ -int csi_nn_rvv_maxpool3x3s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -154,8 +154,8 @@ int csi_nn_rvv_maxpool3x3s2_int8(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_maxpool3x3s2_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -359,8 +359,8 @@ int csi_nn_rvv_maxpool3x3s2_p1_int8(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_nn_rvv_maxpool3x3s1_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s1_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; diff --git a/source/thead_rvv/maxpool_3x3_int8_packn.c b/source/thead_rvv/maxpool_3x3_int8_packn.c new file mode 100644 index 00000000..06bbb795 --- /dev/null +++ b/source/thead_rvv/maxpool_3x3_int8_packn.c @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *   www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_thead_rvv.h"
+
+/*************************************************************
+ * note: support flexible vlen; 3x3 window int8 packn max pooling (tail channels, in_c % packn, are not processed)
+ *************************************************************/
+int shl_rvv_maxpool3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                    struct csinn_pool_params *params)
+{
+#ifdef RVV_1_0_0
+    int8_t *input_data = (int8_t *)input->data;
+    int8_t *output_data = (int8_t *)output->data;
+
+    int batch = input->dim[0];
+    int in_c = input->dim[1];
+    int in_h = input->dim[2];
+    int in_w = input->dim[3];
+    int input_size = in_c * in_h * in_w;
+
+    int out_h = output->dim[2];
+    int out_w = output->dim[3];
+    int output_size = in_c * out_h * out_w;
+
+    int padded_in_h = in_h + params->pad_top + params->pad_down;
+    int padded_in_w = in_w + params->pad_left + params->pad_right;
+    int padded_in_hw = padded_in_w * padded_in_h;
+
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;  // elements per e8mf2 vector
+    const int vl = vsetvl_e8mf2(packn);
+
+    int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t));
+    int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn;  // from end of row to two rows down
+
+    for (int b = 0; b < batch; b++) {
+        shl_rvv_pad_input_packn_int8(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h,
+                                     padded_in_w, params->pad_top, params->pad_left,
+                                     input->qinfo->zero_point);
+
+        for (int c = 0; c + packn - 1 < in_c; c += packn) {
+            int8_t *out0 = output_data + c * out_h * out_w;
+            const int8_t *line0 = input_ncxhwx + c * padded_in_h * padded_in_w;
+            const int8_t *line1 = line0 + padded_in_w * packn;
+            const int8_t *line2 = line1 + padded_in_w * packn;
+
+            for (int h = 0; h < out_h; h++) {
+                for (int w = 0; w < out_w; w++) {
+                    vint8mf2_t _max = vle8_v_i8mf2(line0, vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn * 1, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn * 2, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn * 1, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn * 2, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2 + packn * 1, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2 + packn * 2, vl), vl);
+                    vse8_v_i8mf2(out0, _max, vl);
+
+                    line0 += packn * 2;  // stride 2
+                    line1 += packn * 2;
+                    line2 += packn * 2;
+                    out0 += packn;
+                }
+                line0 += tailstep;
+                line1 += tailstep;
+                line2 += tailstep;
+            }
+        }
+        input_data += input_size;
+        output_data += output_size;
+    }
+    shl_mem_free(input_ncxhwx);
+    return CSINN_TRUE;
+#elif defined RVV_0_7_1
+    shl_debug_error("unsupport maxpool3x3s2 packn for int8 on rvv_spec 0.7.1\n");
+    return CSINN_FALSE;
+#endif
+}
+
+int shl_rvv_maxpool3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                                    struct csinn_pool_params *params)
+{
+#ifdef RVV_1_0_0
+    int8_t *input_data = (int8_t *)input->data;
+    int8_t *output_data = (int8_t *)output->data;
+
+    int batch = input->dim[0];
+    int in_c = input->dim[1];
+    int in_h = input->dim[2];
+    int in_w = input->dim[3];
+    int input_size = in_c * in_h * in_w;
+
+    int out_h = output->dim[2];
+    int out_w = output->dim[3];
+    int output_size = in_c * out_h * out_w;
+
+    int padded_in_h = in_h + params->pad_top + params->pad_down;
+    int padded_in_w = in_w + params->pad_left + params->pad_right;
+    int padded_in_hw = padded_in_w * padded_in_h;
+
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;  // elements per e8mf2 vector
+    const int vl = vsetvl_e8mf2(packn);
+
+    int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t));
+
+    for (int b = 0; b < batch; b++) {
+        shl_rvv_pad_input_packn_int8(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h,
+                                     padded_in_w, params->pad_top, params->pad_left,
+                                     input->qinfo->zero_point);
+
+        for (int c = 0; c + packn - 1 < in_c; c += packn) {
+            int8_t *out0 = output_data + c * out_h * out_w;
+            const int8_t *line0 = input_ncxhwx + c * padded_in_h * padded_in_w;
+            const int8_t *line1 = line0 + padded_in_w * packn;
+            const int8_t *line2 = line1 + padded_in_w * packn;
+
+            for (int h = 0; h < out_h; h++) {
+                for (int w = 0; w < out_w; w++) {
+                    vint8mf2_t _max = vle8_v_i8mf2(line0, vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn * 1, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn * 2, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn * 1, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn * 2, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2 + packn * 1, vl), vl);
+                    _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2 + packn * 2, vl), vl);
+                    vse8_v_i8mf2(out0, _max, vl);
+
+                    line0 += packn * 1;  // stride 1
+                    line1 += packn * 1;
+                    line2 += packn * 1;
+                    out0 += packn;
+                }
+                line0 += packn * 2;  // skip the 2 leftover columns (assumes padded_in_w == out_w + 2)
+                line1 += packn * 2;
+                line2 += packn * 2;
+            }
+        }
+        input_data += input_size;
+        output_data += output_size;
+    }
+    shl_mem_free(input_ncxhwx);
+    return CSINN_TRUE;
+#elif defined RVV_0_7_1
+    shl_debug_error("unsupport maxpool3x3s1 packn for int8 on rvv_spec 0.7.1\n");
+    return CSINN_FALSE;
+#endif
+}
diff --git a/source/thead_rvv/mul.c b/source/thead_rvv/mul.c
index 538eeaeb..9d28fb1a 100644
--- a/source/thead_rvv/mul.c
+++ b/source/thead_rvv/mul.c
@@ -16,19 +16,77 @@
  * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_mul_fp32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +static void element_mul_fp32(float *input0, float *input1, float *output, int size) { + while (size > 0) { + int vl = vsetvl_e32m2(size); + vfloat32m2_t _in0 = vle32_v_f32m2(input0, vl); + vfloat32m2_t _in1 = vle32_v_f32m2(input1, vl); + vfloat32m2_t _sum = vfmul_vv_f32m2(_in0, _in1, vl); + vse32_v_f32m2(output, _sum, vl); + input0 += vl; + input1 += vl; + output += vl; + size -= vl; + } +} + +int shl_rvv_mul_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + float *input0_data = (float *)input0->data; + float *input1_data = (float *)input1->data; + float *output_data = (float *)output->data; + + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); + + if (in_size0 == in_size1) { + element_mul_fp32(input0_data, input1_data, output_data, out_size); + } else { + shl_debug_error("unsupport broadcast mul for fp32\n"); + return CSINN_FALSE; + } return CSINN_TRUE; } -int csi_nn_rvv_mul_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +static void element_mul_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int size) { + while (size > 0) { + int vl = vsetvl_e16m2(size); + vfloat16m2_t _in0 = vle16_v_f16m2(input0, vl); + vfloat16m2_t _in1 = vle16_v_f16m2(input1, vl); + vfloat16m2_t _sum = vfmul_vv_f16m2(_in0, _in1, vl); + vse16_v_f16m2(output, _sum, vl); + input0 += vl; + input1 += vl; + output += vl; + size -= vl; + } +} + +int shl_rvv_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + __fp16 *input0_data = 
(__fp16 *)input0->data; + __fp16 *input1_data = (__fp16 *)input1->data; + __fp16 *output_data = (__fp16 *)output->data; + + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); + + if (in_size0 == in_size1) { + element_mul_fp16(input0_data, input1_data, output_data, out_size); + } else { + shl_debug_error("unsupport broadcast mul for fp16\n"); + return CSINN_FALSE; + } return CSINN_TRUE; } @@ -40,21 +98,21 @@ right shift(>0) TODO: broadcast mul note: if input1 is const, support per-channel quantization ************************************************************************************/ -int csi_nn_rvv_mul_int8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_rvv_mul_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = (int8_t *)input0->data; int8_t *input1_data = (int8_t *)input1->data; int8_t *output_data = (int8_t *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // TODO: move to init api for (int q = 0; q < input1->quant_channel; q++) { float real_scale = input0->qinfo->scale * input1->qinfo[q].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &input1->qinfo[q].multiplier, &input1->qinfo[q].shift); + shl_quantize_multiplier(real_scale, &input1->qinfo[q].multiplier, &input1->qinfo[q].shift); } if (in_size0 == in_size1) { @@ -96,7 +154,7 @@ int csi_nn_rvv_mul_int8(struct csi_tensor *input0, struct csi_tensor *input1, } } } else { - csi_debug_error("Only support elementwise mul on RVV CPU\n"); + shl_debug_error("Only support elementwise mul on RVV CPU\n"); } return CSINN_TRUE; } diff 
--git a/source/thead_rvv/pad.c b/source/thead_rvv/pad.c new file mode 100644 index 00000000..b284356a --- /dev/null +++ b/source/thead_rvv/pad.c @@ -0,0 +1,501 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * params: + * input: origin input data + * input_padded: input data after pad + * inc: origin input channel + * inh: origin input height + * inw: origin input width + * padded_h: input height after pad + * padded_w: input width after pad + * pad_top: origin pad top + * pad_left: origin pad left + *************************************************************/ +void shl_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left) +{ + int padded_hw = padded_h * padded_w; + + float *pad_ptr = input_padded; + float *inp_ptr = (float *)input; + int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) + int size; + int vl = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < inc; c++) { + pad_ptr = input_padded + c * padded_hw; + // pad h_top + size = padded_w * pad_top; + 
while (size > 0) { + vl = vsetvl_e32m1(size); + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + size -= vl; + } + // pad h_mid + for (int h = 0; h < inh; h++) { + // pad w_left + memset(pad_ptr, 0, pad_left * sizeof(float)); + pad_ptr += pad_left; + // pad w_mid + size = inw; + while (size > 0) { + vl = vsetvl_e32m1(size); + vfloat32m1_t _input = vle32_v_f32m1(inp_ptr, vl); + inp_ptr += vl; + vse32_v_f32m1(pad_ptr, _input, vl); + pad_ptr += vl; + size -= vl; + } + // pad w_end + memset(pad_ptr, 0, resi_w * sizeof(float)); + pad_ptr += resi_w; + } + // pad h_bottom + size = padded_w * resi_h; + while (size > 0) { + vl = vsetvl_e32m1(size); + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + size -= vl; + } + } +} + +void shl_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left) +{ + int padded_hw = padded_h * padded_w; + + __fp16 *pad_ptr = input_padded; + __fp16 *inp_ptr = (__fp16 *)input; + int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) + int size; + int vl = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < inc; c++) { + pad_ptr = input_padded + c * padded_hw; + // pad h_top + size = padded_w * pad_top; + while (size > 0) { + vl = vsetvl_e16m1(size); + vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += vl; + size -= vl; + } + // pad h_mid + for (int h = 0; h < inh; h++) { + // pad w_left + memset(pad_ptr, 0, pad_left * sizeof(__fp16)); + pad_ptr += pad_left; + // pad w_mid + size = inw; + while (size > 0) { + vl = vsetvl_e16m1(size); + vfloat16m1_t _input = vle16_v_f16m1(inp_ptr, vl); + inp_ptr += vl; + vse16_v_f16m1(pad_ptr, _input, vl); + pad_ptr += vl; + size -= vl; + } + // pad w_end + memset(pad_ptr, 0, resi_w * sizeof(__fp16)); + pad_ptr += resi_w; + } + // pad h_bottom + size = padded_w * 
resi_h; + while (size > 0) { + vl = vsetvl_e16m1(size); + vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += vl; + size -= vl; + } + } +} + +void shl_rvv_pad_input_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left, int8_t pad_value) +{ + int padded_hw = padded_h * padded_w; + + int8_t *pad_ptr = input_padded; + int8_t *inp_ptr = (int8_t *)input; + int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) + int size; + int vl = vsetvl_e8m1(csrr_vlenb() / sizeof(int8_t)); + vint8m1_t _pad_zero = vmv_v_x_i8m1(pad_value, vl); // float 0.0 -> input->zero_point + + for (int c = 0; c < inc; c++) { + pad_ptr = input_padded + c * padded_hw; + // pad h_top + size = padded_w * pad_top; + while (size > 0) { + vl = vsetvl_e8m1(size); + vse8_v_i8m1(pad_ptr, _pad_zero, vl); + pad_ptr += vl; + size -= vl; + } + // pad h_mid + for (int h = 0; h < inh; h++) { + // pad w_left + memset(pad_ptr, pad_value, pad_left * sizeof(int8_t)); + pad_ptr += pad_left; + // pad w_mid + size = inw; + while (size > 0) { + vl = vsetvl_e8m1(size); + vint8m1_t _input = vle8_v_i8m1(inp_ptr, vl); + inp_ptr += vl; + vse8_v_i8m1(pad_ptr, _input, vl); + pad_ptr += vl; + size -= vl; + } + // pad w_end + memset(pad_ptr, pad_value, resi_w * sizeof(int8_t)); + pad_ptr += resi_w; + } + // pad h_bottom + size = padded_w * resi_h; + while (size > 0) { + vl = vsetvl_e8m1(size); + vse8_v_i8m1(pad_ptr, _pad_zero, vl); + pad_ptr += vl; + size -= vl; + } + } +} + +// constrains: in_c % packn = 0 +void shl_rvv_pad_input_packn_fp32(const float *input, float *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left) +{ + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(packn); + + float *pad_ptr = input_padded; + float *inp_ptr = (float *)input; + int pad_down = padded_h - pad_top - inh; // 
remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + while (inc > 0) { + vl = vsetvl_e32m1(inc); + vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vfloat32m1_t _tmp = vle32_v_f32m1(inp_ptr, vl); + inp_ptr += vl; + vse32_v_f32m1(pad_ptr, _tmp, vl); + pad_ptr += vl; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + inc -= vl; + } +} + +void shl_rvv_pad_input_packn_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *pad_ptr = input_padded; + __fp16 *inp_ptr = (__fp16 *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl); + + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vfloat16m1_t _tmp = vle16_v_f16m1(inp_ptr, vl); + inp_ptr += packn; + vse16_v_f16m1(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad 
w_end + for (int j = 0; j < pad_right; j++) { + vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +} + +// XXX: 需要适配 vector 0.7.1, mf2 不支持 +// packn = 8 for vlen128 +void shl_rvv_pad_input_packn_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left, + int8_t pad_value) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + int8_t *pad_ptr = input_padded; + int8_t *inp_ptr = (int8_t *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vint8mf2_t _zero = vmv_v_x_i8mf2(pad_value, vl); + + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vle8_v_i8mf2(inp_ptr, vl); + inp_ptr += packn; + vse8_v_i8mf2(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +#endif +} + +// constrains: inc % packn = 0 +void shl_rvv_pad_input_pack1ton_fp32(const float *input, float *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left) +{ + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(packn); + const int in_size = inh * inw; // 
per-channel size + + float *pad_ptr = input_padded; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); + + while (inc > 0) { + vl = vsetvl_e32m1(inc); + float *inp_ptr = (float *)input; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(inp_ptr, in_size * sizeof(float), vl); + inp_ptr++; + vse32_v_f32m1(pad_ptr, _tmp, vl); + pad_ptr += vl; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + input += in_size * vl; + inc -= vl; + } +} + +void shl_rvv_pad_input_pack1ton_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(packn); + const int in_size = inh * inw; // per-channel size + + __fp16 *pad_ptr = input_padded; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl); + + int c = 0; + while (inc > 0) { + vl = vsetvl_e16m1(inc); + __fp16 *inp_ptr = (__fp16 *)input; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + 
vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(inp_ptr, in_size * sizeof(__fp16), vl); + inp_ptr++; + vse16_v_f16m1(pad_ptr, _tmp, vl); + pad_ptr += vl; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse16_v_f16m1(pad_ptr, _zero, vl); + pad_ptr += vl; + } + input += in_size * vl; + inc -= vl; + } +} + +void shl_rvv_pad_input_pack1ton_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left, + int8_t pad_value) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + const int in_size = inh * inw; // per-channel size + + int8_t *pad_ptr = input_padded; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vint8mf2_t _zero = vmv_v_x_i8mf2(pad_value, vl); + + int c = 0; + while (inc > 0) { + vl = vsetvl_e8mf2(inc); + int8_t *inp_ptr = (int8_t *)input; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += vl; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += vl; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(inp_ptr, in_size * sizeof(int8_t), vl); + inp_ptr++; + vse8_v_i8mf2(pad_ptr, _tmp, vl); + pad_ptr += vl; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += vl; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += vl; + } + input += in_size * vl; + inc -= vl; + } +#endif +} 
diff --git a/source/thead_rvv/relu.c b/source/thead_rvv/relu.c index d213be08..bf966b68 100644 --- a/source/thead_rvv/relu.c +++ b/source/thead_rvv/relu.c @@ -16,68 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 ... *************************************************************/ -int csi_nn_rvv_relu_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_rvv_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - float *input_data = input->data; - float *output_data = output->data; - int size = 1; - for (int i = 0; i < input->dim_count; i++) { - size = size * input->dim[i]; - } - - int vl = vsetvl_e32m2(size); // vl=8 if vlen=128 + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; - int i = 0; - for (; i + vl - 1 < size; i += vl) { + int size = csinn_tensor_size(input); + while (size > 0) { + int vl = vsetvl_e32m2(size); vfloat32m2_t _input = vle32_v_f32m2(input_data, vl); input_data += vl; vfloat32m2_t _output = vfmax_vf_f32m2(_input, 0.0f, vl); vse32_v_f32m2(output_data, _output, vl); output_data += vl; - } - if (i < size) { - vl = vsetvl_e32m2(size & (vl - 1)); // ??? 
- vfloat32m2_t _input = vle32_v_f32m2(input_data, vl); - vfloat32m2_t _output = vfmax_vf_f32m2(_input, 0.0f, vl); - vse32_v_f32m2(output_data, _output, vl); + size -= vl; } return CSINN_TRUE; } -int csi_nn_rvv_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_rvv_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; - int size = 1; - for (int i = 0; i < input->dim_count; i++) { - size = size * input->dim[i]; - } - - int vl = vsetvl_e16m2(size); - - int i = 0; - for (; i + vl - 1 < size; i += vl) { + int size = csinn_tensor_size(input); + while (size > 0) { + int vl = vsetvl_e16m2(size); vfloat16m2_t _input = vle16_v_f16m2(input_data, vl); input_data += vl; vfloat16m2_t _output = vfmax_vf_f16m2(_input, 0.0f, vl); vse16_v_f16m2(output_data, _output, vl); output_data += vl; - } - if (i < size) { - vl = vsetvl_e16m2(size & (vl - 1)); - vfloat16m2_t _input = vle16_v_f16m2(input_data, vl); - vfloat16m2_t _output = vfmax_vf_f16m2(_input, 0.0f, vl); - vse16_v_f16m2(output_data, _output, vl); + size -= vl; } return CSINN_TRUE; } @@ -88,8 +67,8 @@ int csi_nn_rvv_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, * * note:relu 一般接在全连接/卷积后面,可以直接和全连接/卷积 融合 ************************************************************************************/ -int csi_nn_rvv_relu_int8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_rvv_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -97,9 +76,9 @@ int csi_nn_rvv_relu_int8(struct csi_tensor *input, struct csi_tensor *output, // TODO: move to init api // real_scale > 1 => output->qinfo->shift > 0 ==> shift left float real_scale = input->qinfo->scale / 
output->qinfo->scale; - csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); + shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); while (size > 0) { int vl = vsetvl_e8m1(size); diff --git a/source/thead_rvv/relu6.c b/source/thead_rvv/relu6.c new file mode 100644 index 00000000..383fcb64 --- /dev/null +++ b/source/thead_rvv/relu6.c @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + note: VLEN = 128/256 ... 
+*************************************************************/ +int shl_rvv_relu6_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int size = csinn_tensor_size(input); + while (size > 0) { + int vl = vsetvl_e32m2(size); + vfloat32m2_t _input = vle32_v_f32m2(input_data, vl); + input_data += vl; + vfloat32m2_t _output = vfmin_vf_f32m2(vfmax_vf_f32m2(_input, 0.0f, vl), 6.0f, vl); + vse32_v_f32m2(output_data, _output, vl); + output_data += vl; + size -= vl; + } + return CSINN_TRUE; +} + +int shl_rvv_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int size = csinn_tensor_size(input); + while (size > 0) { + int vl = vsetvl_e16m2(size); + vfloat16m2_t _input = vle16_v_f16m2(input_data, vl); + input_data += vl; + vfloat16m2_t _output = vfmin_vf_f16m2(vfmax_vf_f16m2(_input, 0.0f, vl), 6.0f, vl); + vse16_v_f16m2(output_data, _output, vl); + output_data += vl; + size -= vl; + } + return CSINN_TRUE; +} + +/************************************************************************************ + * s2(q2 - z2) = relu6{ s1(q1 - z1) } + * q2 = (q1 - z1) * s1/s2 + z2 + * + * note:relu6 一般接在全连接/卷积后面,可以直接和全连接/卷积 融合 + ************************************************************************************/ +int shl_rvv_relu6_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) +{ + // refer to relu + return CSINN_FALSE; +} diff --git a/source/thead_rvv/reorder.c b/source/thead_rvv/reorder.c new file mode 100644 index 00000000..8c110234 --- /dev/null +++ b/source/thead_rvv/reorder.c @@ -0,0 +1,1976 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************************ + * pack1ton: change input(activation) layout from nchw to nc1hwc0 + * 当 inc 不是 packn 的倍数时, 末梢单独处理(用 vl 控制) + * packnto1: change input(activation) layout from nc1hwc0 to nchw + ***********************************************************************/ +// constrains: inc % packn = 0 +void shl_rvv_reorder_input_pack1ton_fp32(const float *src, float *dst, int inc, int inh, int inw) +{ + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(packn); + const int in_size = inh * inw; // per-channel size + + while (inc > 0) { + vl = vsetvl_e32m1(inc); + float *in_ptr = (float *)src; + for (int i = 0; i < inh; i++) { + for (int j = 0; j < inw; j++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(in_ptr, in_size * sizeof(float), vl); + in_ptr++; + vse32_v_f32m1(dst, _tmp, vl); + dst += vl; + } + } + src += in_size * vl; + inc -= vl; + } +} + +void shl_rvv_reorder_input_pack1ton_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(packn); + const int in_size = inh * inw; // per-channel size + + while (inc > 0) { + vl = vsetvl_e16m1(inc); + __fp16 *in_ptr = (__fp16 *)src; + for (int i = 0; i < inh; i++) { + for (int j = 0; j < inw; j++) { + vfloat16m1_t 
_tmp = vlse16_v_f16m1(in_ptr, in_size * sizeof(__fp16), vl); + in_ptr++; + vse16_v_f16m1(dst, _tmp, vl); + dst += vl; + } + } + src += in_size * vl; + inc -= vl; + } +} + +// XXX: 需要适配 vector 0.7.1, mf2 不支持 +void shl_rvv_reorder_input_pack1ton_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + const int in_size = inh * inw; // per-channel size + + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + int8_t *in_ptr = (int8_t *)src + c * in_size; + for (int i = 0; i < inh; i++) { + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(in_ptr, in_size * sizeof(int8_t), vl); + in_ptr++; + vse8_v_i8mf2(dst, _tmp, vl); + dst += packn; + } + } + } +#endif +} + +// constrains: inc % packn = 0 (tail) +void shl_rvv_reorder_input_packnto1_fp32(const float *src, float *dst, int inc, int inh, int inw) +{ + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(packn); + const int in_size = inh * inw; // per-channel size + + while (inc > 0) { + int vl = vsetvl_e32m1(inc); + float *out_ptr = dst; + for (int i = 0; i < inh; i++) { + for (int j = 0; j < inw; j++) { + vfloat32m1_t _tmp = vle32_v_f32m1(src, vl); + src += vl; + vsse32_v_f32m1(out_ptr, in_size * sizeof(float), _tmp, vl); + out_ptr++; + } + } + dst += in_size * vl; + inc -= vl; + } +} + +void shl_rvv_reorder_input_packnto1_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(packn); + const int in_size = inh * inw; // per-channel size + + while (inc > 0) { + vl = vsetvl_e16m1(inc); + __fp16 *out_ptr = dst; + for (int i = 0; i < inh; i++) { + for (int j = 0; j < inw; j++) { + vfloat16m1_t _tmp = vle16_v_f16m1(src, vl); + src += vl; + vsse16_v_f16m1(out_ptr, in_size * sizeof(__fp16), _tmp, vl); + out_ptr++; + } + } + dst += in_size * vl; + inc -= vl; + } +} + +void 
shl_rvv_reorder_input_packnto1_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + const int in_size = inh * inw; // per-channel size + + while (inc > 0) { + vl = vsetvl_e8mf2(inc); + int8_t *out_ptr = dst; + for (int i = 0; i < inh; i++) { + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vle8_v_i8mf2(src, vl); + src += vl; + vsse8_v_i8mf2(out_ptr, in_size * sizeof(int8_t), _tmp, vl); + out_ptr++; + } + } + dst += in_size * vl; + inc -= vl; + } +#endif +} + +/************************************************************************ + * reorder kernel matrix + ***********************************************************************/ +// vlen=128 +void shl_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx) +{ + int i = 0; + for (; i + 7 < m; i += 8) { + for (int j = 0; j < k; j++) { + float *in_ptr = a + j; + vfloat32m2_t _input = vlse32_v_f32m2(in_ptr, k * sizeof(float), 8); + vse32_v_f32m2(sa, _input, 8); + sa += 8; + } + a += 8 * k; + } + for (; i + 3 < m; i += 4) { + for (int j = 0; j < k; j++) { + float *in_ptr = a + j; + vfloat32m1_t _input = vlse32_v_f32m1(in_ptr, k * sizeof(float), 4); + vse32_v_f32m1(sa, _input, 4); + sa += 4; + } + a += 4 * k; + } + for (; i + 1 < m; i += 2) { + for (int j = 0; j < k; j++) { + float *in_ptr = a + j; + vfloat32m1_t _input = vlse32_v_f32m1(in_ptr, k * sizeof(float), 2); + vse32_v_f32m1(sa, _input, 2); + sa += 2; + } + a += 2 * k; + } + for (; i < m; i++) { + memcpy(sa, a, k * sizeof(float)); + } +} + +void shl_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) +{ + int i = 0; + for (; i + 7 < m; i += 8) { + for (int j = 0; j < k; j++) { + __fp16 *in_ptr = a + j; + vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 8); + vse16_v_f16m1(sa, _input, 8); + sa += 8; + } + a += 8 * k; + } + for (; i + 3 < m; i += 4) { + for (int j = 0; j < k; j++) { + 
__fp16 *in_ptr = a + j; + vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 4); + vse16_v_f16m1(sa, _input, 4); + sa += 4; + } + a += 4 * k; + } + for (; i + 1 < m; i += 2) { + for (int j = 0; j < k; j++) { + __fp16 *in_ptr = a + j; + vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 2); + vse16_v_f16m1(sa, _input, 2); + sa += 2; + } + a += 2 * k; + } + for (; i < m; i++) { + memcpy(sa, a, k * sizeof(__fp16)); + } +} + +void shl_rvv_reorder_kernel_n8_int8(int8_t *a, int8_t *sa, int m, int k, int ldx) +{ + int i = 0; + for (; i + 7 < m; i += 8) { + int j = 0; + for (; j + 3 < k; j += 4) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 8; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); + in_ptr += k; + vse8_v_i8m1(sa, _input, 4); + sa += 4; + } + } + // k_tail + if (j < k) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 8; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); + in_ptr += k; + vse8_v_i8m1(sa, _input, k & 3); + sa += 4; + } + } + a += 8 * k; + } + for (; i + 3 < m; i += 4) { + int j = 0; + for (; j + 3 < k; j += 4) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 4; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); + in_ptr += k; + vse8_v_i8m1(sa, _input, 4); + sa += 4; + } + } + if (j < k) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 4; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); + in_ptr += k; + vse8_v_i8m1(sa, _input, k & 3); + sa += 4; + } + } + a += 4 * k; + } + for (; i + 1 < m; i += 2) { + int j = 0; + for (; j + 3 < k; j += 4) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 2; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); + in_ptr += k; + vse8_v_i8m1(sa, _input, 4); + sa += 4; + } + } + if (j < k) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 2; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); + in_ptr += k; + vse8_v_i8m1(sa, _input, k & 3); + sa += 4; + } + } + a += 2 * k; + } + for (; i < m; i++) { + memcpy(sa, a, k * sizeof(int8_t)); + } +} + +// vlen=256 +void 
shl_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) +{ + int i = 0; + for (; i + 15 < m; i += 16) { + for (int j = 0; j < k; j++) { + __fp16 *in_ptr = a + j; + vfloat16m2_t _input = vlse16_v_f16m2(in_ptr, k * sizeof(__fp16), 16); + vse16_v_f16m2(sa, _input, 16); + sa += 16; + } + a += 16 * k; + } + for (; i + 7 < m; i += 8) { + for (int j = 0; j < k; j++) { + __fp16 *in_ptr = a + j; + vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 8); + vse16_v_f16m1(sa, _input, 8); + sa += 8; + } + a += 8 * k; + } + for (; i + 3 < m; i += 4) { + for (int j = 0; j < k; j++) { + __fp16 *in_ptr = a + j; + vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 4); + vse16_v_f16m1(sa, _input, 4); + sa += 4; + } + a += 4 * k; + } + for (; i + 1 < m; i += 2) { + for (int j = 0; j < k; j++) { + __fp16 *in_ptr = a + j; + vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 2); + vse16_v_f16m1(sa, _input, 2); + sa += 2; + } + a += 2 * k; + } + for (; i < m; i++) { + memcpy(sa, a, k * sizeof(__fp16)); + } +} + +// flexible vlen +/************************************************************* + * constrain: m(out_channel) % packn = 0; k % packn = 0 + * e.g. 
vlen=128, n8 --> n4 + ************************************************************/ +void shl_rvv_reorder_kernel_packn_fp32(float *a, float *sa, int m, int k, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + int vl = vsetvl_e32m2(pack2n); + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + float *g0 = a + oc * k; + for (int ic = 0; ic < k; ic++) { + vfloat32m2_t _tmp = vlse32_v_f32m2(g0 + ic, k * sizeof(float), vl); + vse32_v_f32m2(sa, _tmp, vl); + sa += vl; + } + } + vl = vsetvl_e32m1(packn); + for (; oc + packn - 1 < m; oc += packn) { + float *g0 = a + oc * k; + for (int ic = 0; ic < k; ic++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(g0 + ic, k * sizeof(float), vl); + vse32_v_f32m1(sa, _tmp, vl); + sa += vl; + } + } +} + +/************************************************************* + * constrain: m(out_channel) % packn = 0; k % packn = 0 + * e.g. vlen=128, n16 --> n8 + ************************************************************/ +void shl_rvv_reorder_kernel_packn_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + __fp16 *g0 = a + oc * k; + for (int ic = 0; ic < k; ic++) { + vfloat16m2_t _tmp = vlse16_v_f16m2(g0 + ic, k * sizeof(__fp16), vl); + vse16_v_f16m2(sa, _tmp, vl); + sa += vl; + } + } + vl = vsetvl_e16m1(packn); + for (; oc + packn - 1 < m; oc += packn) { + __fp16 *g0 = a + oc * k; + for (int ic = 0; ic < k; ic++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(g0 + ic, k * sizeof(__fp16), vl); + vse16_v_f16m1(sa, _tmp, vl); + sa += vl; + } + } +} + +/************************************************************************ + * reorder input matrix + ***********************************************************************/ + +// vlen=128 +/************************************************************** + * Data arrangement: Z8 | | | + 
**************************************************************/ +void shl_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx) +{ + int32_t vl = vsetvl_e32m2(8); + float *b0 = NULL; + int i = 0; + for (; i + 7 < n; i += 8) { + b0 = b + i; + for (int j = 0; j < k; j++) { + vfloat32m2_t _tmp = vle32_v_f32m2(b0, vl); + b0 += ldx; + vse32_v_f32m2(sb, _tmp, vl); + sb += 8; + } + } + + for (; i < n; i++) { + vl = vsetvl_e32m2(8); + b0 = b + i; + int j = 0; + for (; j + 7 < k; j += 8) { + vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); + b0 += 8 * ldx; + vse32_v_f32m2(sb, _tmp, vl); + sb += 8; + } + if (j < k) { + vl = vsetvl_e32m2(k & 7); + vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); + vse32_v_f32m2(sb, _tmp, vl); + sb += vl; + } + } +} + +/************************************************************** + * Data arrangement: Z16 Z8 | | | + **************************************************************/ +void shl_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +{ + int vl = vsetvl_e16m2(16); + __fp16 *b0 = NULL; + int i = 0; + for (; i + 15 < n; i += 16) { + b0 = b + i; + for (int j = 0; j < k; j++) { + vfloat16m2_t _tmp = vle16_v_f16m2(b0, vl); + b0 += ldx; + vse16_v_f16m2(sb, _tmp, vl); + sb += 16; + } + } + + for (; i + 7 < n; i += 8) { + vl = vsetvl_e16m1(8); + b0 = b + i; + for (int j = 0; j < k; j++) { + vfloat16m1_t _tmp = vle16_v_f16m1(b0, vl); + b0 += ldx; + vse16_v_f16m1(sb, _tmp, vl); + sb += 8; + } + } + + for (; i < n; i++) { + vl = vsetvl_e16m2(16); + b0 = b + i; + int j = 0; + for (; j + 15 < k; j += 16) { + vfloat16m2_t _tmp = vlse16_v_f16m2(b0, ldx * sizeof(__fp16), vl); + b0 += 16 * ldx; + vse16_v_f16m2(sb, _tmp, vl); + sb += 16; + } + if (j < k) { + vl = vsetvl_e16m2(k & 15); + vfloat16m2_t _tmp = vlse16_v_f16m2(b0, ldx * sizeof(__fp16), vl); + vse16_v_f16m2(sb, _tmp, vl); + sb += vl; + } + } +} + +/************************************************************** + * Data 
arrangement: Z8 Z4 | | | + **************************************************************/ +void shl_rvv_reorder_input_z8_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ + int vl = vsetvl_e8m1(8); + int i = 0; + for (; i + 7 < n; i += 8) { + int8_t *b0 = b + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb += 32 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = sb; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + sb += 32; + } + } + for (; i + 3 < n; i += 4) { + vl = vsetvl_e8m1(4); + int8_t *b0 = b + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb += 13; + } + // k_tail + if (j < k) { + int8_t *sb0 = sb; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + sb += 16; + } + } + // n_tail + for (; i < n; i++) { + vl = vsetvl_e8m1(16); + int8_t *b0 = b + i; + int j = 0; + for (; j + 15 < k; j += 16) { + vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); + b0 += 16 * ldx; + vse8_v_i8m1(sb, _tmp, vl); + sb += 16; + } + if (j < k) { + vl = vsetvl_e8m1(k & 15); + vint8m1_t _tmp = 
vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); + vse8_v_i8m1(sb, _tmp, vl); + sb += ((k & 15) / 4 + 1) * 4; + } + } +} + +// vlen=256 +void shl_rvv256_reorder_input_z16_fp32(float *b, float *sb, int k, int n, int ldx) +{ + int vl = vsetvl_e32m2(16); + float *b0 = NULL; + int i = 0; + + // Z16 + for (; i + 15 < n; i += 16) { + b0 = b + i; + for (int j = 0; j < k; j++) { + vfloat32m2_t _tmp = vle32_v_f32m2(b0, vl); + b0 += ldx; + vse32_v_f32m2(sb, _tmp, vl); + sb += 16; + } + } + + // Z8 + for (; i + 7 < n; i += 8) { + vl = vsetvl_e32m1(8); + b0 = b + i; + for (int j = 0; j < k; j++) { + vfloat32m1_t _tmp = vle32_v_f32m1(b0, vl); + b0 += ldx; + vse32_v_f32m1(sb, _tmp, vl); + sb += 8; + } + } + + // col by col + for (; i < n; i++) { + vl = vsetvl_e32m2(16); + b0 = b + i; + int j = 0; + for (; j + 15 < k; j += 16) { + vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); + b0 += 16 * ldx; + vse32_v_f32m2(sb, _tmp, vl); + sb += 16; + } + if (j < k) { + vl = vsetvl_e32m2(k & 15); + vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); + vse32_v_f32m2(sb, _tmp, vl); + sb += vl; + } + } +} + +void shl_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +{ + int vl = vsetvl_e16m1(16); + __fp16 *b0 = NULL; + int i = 0; + for (; i + 15 < n; i += 16) { + b0 = b + i; + for (int j = 0; j < k; j++) { + vfloat16m1_t _tmp = vle16_v_f16m1(b0, vl); + b0 += ldx; + vse16_v_f16m1(sb, _tmp, vl); + sb += 16; + } + } + + for (; i < n; i++) { + vl = vsetvl_e16m1(16); + b0 = b + i; + int j = 0; + for (; j + 15 < k; j += 16) { + vfloat16m1_t _tmp = vlse16_v_f16m1(b0, ldx * sizeof(__fp16), vl); + b0 += 16 * ldx; + vse16_v_f16m1(sb, _tmp, vl); + sb += 16; + } + if (j < k) { + vl = vsetvl_e16m1(k & 15); + vfloat16m1_t _tmp = vlse16_v_f16m1(b0, ldx * sizeof(__fp16), vl); + vse16_v_f16m1(sb, _tmp, vl); + sb += vl; + } + } +} + +void shl_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ + int vl = vsetvl_e8m1(16); + int i = 0; 
+ for (; i + 15 < n; i += 16) { + int8_t *b0 = b + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb += 64 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = sb; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + sb += 64; + } + } + for (; i + 7 < n; i += 8) { + vl = vsetvl_e8m1(8); + int8_t *b0 = b + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); + sb += 32 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = sb; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + sb += 32; + } + } + // n_tail + for (; i < n; i++) { + vl = vsetvl_e8m1(16); + int8_t *b0 = b + i; + int j = 0; + for (; j + 15 < k; j += 16) { + vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); + b0 += 16 * ldx; + vse8_v_i8m1(sb, _tmp, vl); + sb += 16; + } + if (j < k) { + vl = vsetvl_e8m1(k & 15); + vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); + vse8_v_i8m1(sb, _tmp, vl); + sb += ((k & 15) / 4 + 1) * 4; + } + } +} + +// flexible vlen +/************************************************************** + * src: b [inc/packn, 
maxk, n, packn] + [maxk, n, inc%packn] + * dst: sb [n/12, inc/packn * maxk * packn + maxk * inc%packn, 12] + * Data arrangement: Z12 Z8 Z4 Z2 Z1 + * 注意 inc 在 packn 倍数和非 packn 的倍数时边界点 + **************************************************************/ +void shl_rvv_reorder_input_z12_pack1ton_fp32(float *b, float *sb, int inc, int maxk, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(inc); + + int t = 0; + for (; t + 11 < n; t += 12) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + avl * 1, avl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + avl * 2, avl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + avl * 3, avl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + avl * 4, avl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + avl * 5, avl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + avl * 6, avl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + avl * 7, avl); + vfloat32m1_t _tmp8 = vle32_v_f32m1(tm1 + avl * 8, avl); + vfloat32m1_t _tmp9 = vle32_v_f32m1(tm1 + avl * 9, avl); + vfloat32m1_t _tmp10 = vle32_v_f32m1(tm1 + avl * 10, avl); + vfloat32m1_t _tmp11 = vle32_v_f32m1(tm1 + avl * 11, avl); + + vsse32_v_f32m1(sb, 12 * sizeof(float), _tmp0, avl); + vsse32_v_f32m1(sb + 1, 12 * sizeof(float), _tmp1, avl); + vsse32_v_f32m1(sb + 2, 12 * sizeof(float), _tmp2, avl); + vsse32_v_f32m1(sb + 3, 12 * sizeof(float), _tmp3, avl); + vsse32_v_f32m1(sb + 4, 12 * sizeof(float), _tmp4, avl); + vsse32_v_f32m1(sb + 5, 12 * sizeof(float), _tmp5, avl); + vsse32_v_f32m1(sb + 6, 12 * sizeof(float), _tmp6, avl); + vsse32_v_f32m1(sb + 7, 12 * sizeof(float), _tmp7, avl); + vsse32_v_f32m1(sb + 8, 12 * sizeof(float), _tmp8, avl); + vsse32_v_f32m1(sb + 9, 12 * sizeof(float), _tmp9, avl); + vsse32_v_f32m1(sb + 10, 12 * sizeof(float), _tmp10, avl); + vsse32_v_f32m1(sb 
+ 11, 12 * sizeof(float), _tmp11, avl); + + tm1 += n * avl; + sb += 12 * avl; + } + loop_c -= avl; + } + } + for (; t + 7 < n; t += 8) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + avl * 1, avl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + avl * 2, avl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + avl * 3, avl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + avl * 4, avl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + avl * 5, avl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + avl * 6, avl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + avl * 7, avl); + vsseg8e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, avl); + tm1 += n * avl; + sb += 8 * avl; + } + loop_c -= avl; + } + } + for (; t + 3 < n; t += 4) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + avl * 1, avl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + avl * 2, avl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + avl * 3, avl); + vsseg4e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, avl); + tm1 += n * avl; + sb += 4 * avl; + } + loop_c -= avl; + } + } + for (; t + 1 < n; t += 2) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + avl * 1, avl); + vsseg2e32_v_f32m1(sb, _tmp0, _tmp1, avl); + tm1 += n * avl; + sb += 2 * avl; + } + loop_c -= avl; + } + } + for (; t < n; t++) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = 
vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vse32_v_f32m1(sb, _tmp0, avl); + tm1 += n * avl; + sb += 1 * avl; + } + loop_c -= avl; + } + } +} + +void shl_rvv_reorder_input_z12_pack1ton_fp16(__fp16 *b, __fp16 *sb, int inc, int maxk, int n, + int ldx) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(inc); + + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + avl * 1, avl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + avl * 2, avl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + avl * 3, avl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + avl * 4, avl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + avl * 5, avl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + avl * 6, avl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + avl * 7, avl); + vfloat16m1_t _tmp8 = vle16_v_f16m1(tm1 + avl * 8, avl); + vfloat16m1_t _tmp9 = vle16_v_f16m1(tm1 + avl * 9, avl); + vfloat16m1_t _tmp10 = vle16_v_f16m1(tm1 + avl * 10, avl); + vfloat16m1_t _tmp11 = vle16_v_f16m1(tm1 + avl * 11, avl); + + vsse16_v_f16m1(sb, 12 * sizeof(__fp16), _tmp0, avl); + vsse16_v_f16m1(sb + 1, 12 * sizeof(__fp16), _tmp1, avl); + vsse16_v_f16m1(sb + 2, 12 * sizeof(__fp16), _tmp2, avl); + vsse16_v_f16m1(sb + 3, 12 * sizeof(__fp16), _tmp3, avl); + vsse16_v_f16m1(sb + 4, 12 * sizeof(__fp16), _tmp4, avl); + vsse16_v_f16m1(sb + 5, 12 * sizeof(__fp16), _tmp5, avl); + vsse16_v_f16m1(sb + 6, 12 * sizeof(__fp16), _tmp6, avl); + vsse16_v_f16m1(sb + 7, 12 * sizeof(__fp16), _tmp7, avl); + vsse16_v_f16m1(sb + 8, 12 * sizeof(__fp16), _tmp8, avl); + vsse16_v_f16m1(sb + 9, 12 * sizeof(__fp16), _tmp9, avl); + vsse16_v_f16m1(sb + 10, 12 * sizeof(__fp16), _tmp10, avl); + vsse16_v_f16m1(sb 
+ 11, 12 * sizeof(__fp16), _tmp11, avl); + + tm1 += n * avl; + sb += 12 * avl; + } + loop_c -= avl; + } + } + for (; t + 7 < n; t += 8) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + avl * 1, avl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + avl * 2, avl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + avl * 3, avl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + avl * 4, avl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + avl * 5, avl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + avl * 6, avl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + avl * 7, avl); + vsseg8e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, avl); + tm1 += n * avl; + sb += 8 * avl; + } + loop_c -= avl; + } + } + for (; t + 3 < n; t += 4) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + avl * 1, avl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + avl * 2, avl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + avl * 3, avl); + vsseg4e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, avl); + tm1 += n * avl; + sb += 4 * avl; + } + loop_c -= avl; + } + } + for (; t + 1 < n; t += 2) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + avl * 1, avl); + vsseg2e16_v_f16m1(sb, _tmp0, _tmp1, avl); + tm1 += n * avl; + sb += 2 * avl; + } + loop_c -= avl; + } + } + for (; t < n; t++) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = 
vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vse16_v_f16m1(sb, _tmp0, avl); + tm1 += n * avl; + sb += 1 * avl; + } + loop_c -= avl; + } + } +} + +/************************************************************** + * inc % 4 = 0 + **************************************************************/ +void shl_rvv_reorder_input_z12_pack1ton_int8(int8_t *b, int8_t *sb, int inc, int maxk, int n, + int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(inc); + int avl = vl / 4; + int avl_tail = (inc % packn) / 4; + int32_t *dst = (int32_t *)sb; + + int t = 0; + for (; t + 11 < n; t += 12) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 12 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 12 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 12 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 12 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 12 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 12 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 12 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 12 * sizeof(int32_t), _col7, avl); + vint32mf2_t _col8 = vle32_v_i32mf2(tm1 + avl * 8, avl); + vsse32_v_i32mf2(dst + 8, 12 * sizeof(int32_t), _col8, avl); + vint32mf2_t _col9 = vle32_v_i32mf2(tm1 + avl * 9, avl); 
+ vsse32_v_i32mf2(dst + 9, 12 * sizeof(int32_t), _col9, avl); + vint32mf2_t _cola = vle32_v_i32mf2(tm1 + avl * 10, avl); + vsse32_v_i32mf2(dst + 10, 12 * sizeof(int32_t), _cola, avl); + vint32mf2_t _colb = vle32_v_i32mf2(tm1 + avl * 11, avl); + vsse32_v_i32mf2(dst + 11, 12 * sizeof(int32_t), _colb, avl); + + dst += 12 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vsse32_v_i32mf2(dst, 12 * sizeof(int32_t), _col0, avl_tail); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl_tail * 1, avl_tail); + vsse32_v_i32mf2(dst + 1, 12 * sizeof(int32_t), _col1, avl_tail); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl_tail * 2, avl_tail); + vsse32_v_i32mf2(dst + 2, 12 * sizeof(int32_t), _col2, avl_tail); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl_tail * 3, avl_tail); + vsse32_v_i32mf2(dst + 3, 12 * sizeof(int32_t), _col3, avl_tail); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl_tail * 4, avl_tail); + vsse32_v_i32mf2(dst + 4, 12 * sizeof(int32_t), _col4, avl_tail); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl_tail * 5, avl_tail); + vsse32_v_i32mf2(dst + 5, 12 * sizeof(int32_t), _col5, avl_tail); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl_tail * 6, avl_tail); + vsse32_v_i32mf2(dst + 6, 12 * sizeof(int32_t), _col6, avl_tail); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl_tail * 7, avl_tail); + vsse32_v_i32mf2(dst + 7, 12 * sizeof(int32_t), _col7, avl_tail); + vint32mf2_t _col8 = vle32_v_i32mf2(tm1 + avl_tail * 8, avl_tail); + vsse32_v_i32mf2(dst + 8, 12 * sizeof(int32_t), _col8, avl_tail); + vint32mf2_t _col9 = vle32_v_i32mf2(tm1 + avl_tail * 9, avl_tail); + vsse32_v_i32mf2(dst + 9, 12 * sizeof(int32_t), _col9, avl_tail); + vint32mf2_t _cola = vle32_v_i32mf2(tm1 + avl_tail * 10, avl_tail); + vsse32_v_i32mf2(dst + 10, 12 * sizeof(int32_t), _cola, avl_tail); + vint32mf2_t _colb = vle32_v_i32mf2(tm1 + avl_tail * 11, avl_tail); + vsse32_v_i32mf2(dst + 
11, 12 * sizeof(int32_t), _colb, avl_tail); + + dst += 12 * avl_tail; + tm1 += n * avl_tail; + } + } + } + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl_tail); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl_tail * 1, avl_tail); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl_tail); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl_tail * 2, avl_tail); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl_tail); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl_tail * 3, avl_tail); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl_tail); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl_tail * 4, avl_tail); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, 
avl_tail); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl_tail * 5, avl_tail); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl_tail); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl_tail * 6, avl_tail); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl_tail); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl_tail * 7, avl_tail); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl_tail); + + dst += 8 * avl_tail; + tm1 += n * avl_tail; + } + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl_tail); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl_tail * 1, avl_tail); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl_tail); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl_tail * 2, avl_tail); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl_tail); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl_tail * 3, avl_tail); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl_tail); + + dst += 4 * avl_tail; + tm1 += n * avl_tail; + } + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { 
+ vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + dst += 2 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl_tail); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl_tail * 1, avl_tail); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl_tail); + dst += 2 * avl_tail; + tm1 += n * avl_tail; + } + } + } + for (; t < n; t += 1) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + dst += 1 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vse32_v_i32mf2(dst, _col0, avl_tail); + dst += 1 * avl_tail; + tm1 += n * avl_tail; + } + } + } +#endif +} + +/************************************************************** + * input—matrix: [k, n] + * src: b [k/packn, n, packn] + * dst: sb [n/8, k, 8] + * Data arrangement: Z8 Z4 Z2 Z1 + **************************************************************/ +void shl_rvv_reorder_input_z8_packn_fp32(float *b, float *sb, int k, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + int t = 0; + for (; t + 7 < n; t += 8) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = 
vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vsseg8e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, vl); + tm1 += n * packn; + sb += 8 * packn; + } + } + for (; t + 3 < n; t += 4) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vsseg4e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += n * packn; + sb += 4 * packn; + } + } + for (; t + 1 < n; t += 2) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vsseg2e32_v_f32m1(sb, _tmp0, _tmp1, vl); + tm1 += n * packn; + sb += 2 * packn; + } + } + for (; t < n; t++) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vse32_v_f32m1(sb, _tmp0, vl); + tm1 += n * packn; + sb += 1 * packn; + } + } +} + +void shl_rvv_reorder_input_z8_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + int t = 0; + for (; t + 7 < n; t += 8) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + 
packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vsseg8e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, vl); + tm1 += n * packn; + sb += 8 * packn; + } + } + for (; t + 3 < n; t += 4) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vsseg4e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += n * packn; + sb += 4 * packn; + } + } + for (; t + 1 < n; t += 2) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vsseg2e16_v_f16m1(sb, _tmp0, _tmp1, vl); + tm1 += n * packn; + sb += 2 * packn; + } + } + for (; t < n; t++) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vse16_v_f16m1(sb, _tmp0, vl); + tm1 += n * packn; + sb += 1 * packn; + } + } +} + +void shl_rvv_reorder_input_z8_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int32_t *dst = (int32_t *)sb; + + int t = 0; + /* 只适合 vlen=128,需要兼容 vlen + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32m2_t _line0, _line1; + vlseg2e32_v_i32m2(&_line0, &_line1, tm1, 8); + vse32_v_i32m2(dst, _line0, 8); + dst += 8; + vse32_v_i32m2(dst, _line1, 8); + dst += 8; + tm1 += n * packn / 4; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32m1_t _line0, _line1; + vlseg2e32_v_i32m1(&_line0, &_line1, tm1, 4); + 
vse32_v_i32m1(dst, _line0, 4); + dst += 4; + vse32_v_i32m1(dst, _line1, 4); + dst += 4; + tm1 += n * packn / 4; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32m1_t _line0, _line1; + vlseg2e32_v_i32m1(&_line0, &_line1, tm1, 2); + vse32_v_i32m1(dst, _line0, 2); + dst += 2; + vse32_v_i32m1(dst, _line1, 2); + dst += 2; + tm1 += n * packn / 4; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32m1_t _line0, _line1; + vlseg2e32_v_i32m1(&_line0, &_line1, tm1, 1); + vse32_v_i32m1(dst, _line0, 1); + dst += 1; + vse32_v_i32m1(dst, _line1, 1); + dst += 1; + tm1 += n * packn / 4; + } + } + */ + + int avl = packn / 4; + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + 
+ for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + + dst += 2 * avl; + tm1 += n * avl; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + + dst += 1 * avl; + tm1 += n * avl; + } + } +#endif +} + +/************************************************************** + * input—matrix: [k, n] + * src: b [k/packn/2, n, packn/2] + * dst: sb [n/8, k, 8] + * Data arrangement: Z8 Z4 Z2 Z1 + **************************************************************/ +void shl_rvv_reorder_input_z8_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2 / 2; + const int vl = vsetvl_e8mf4(packn); + int32_t *dst = (int32_t *)sb; + + int t = 0; + int avl = packn / 4; + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = 
vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + + dst += 2 * avl; + tm1 += n * avl; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k 
/ packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + + dst += 1 * avl; + tm1 += n * avl; + } + } +#endif +} + +/************************************************************** + * input—matrix: [k, n] + * src: b [k/packn, n, packn] + * dst: sb [n/12, k, 12] + * Data arrangement: Z12 Z8 Z4 Z2 Z1 + **************************************************************/ +void shl_rvv_reorder_input_z12_packn_fp32(float *b, float *sb, int k, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + int t = 0; + for (; t + 11 < n; t += 12) { + const float *tm1 = b + t * packn; // start addr + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vfloat32m1_t _tmp8 = vle32_v_f32m1(tm1 + packn * 8, vl); + vfloat32m1_t _tmp9 = vle32_v_f32m1(tm1 + packn * 9, vl); + vfloat32m1_t _tmp10 = vle32_v_f32m1(tm1 + packn * 10, vl); + vfloat32m1_t _tmp11 = vle32_v_f32m1(tm1 + packn * 11, vl); + + vsse32_v_f32m1(sb, 12 * sizeof(float), _tmp0, vl); + vsse32_v_f32m1(sb + 1, 12 * sizeof(float), _tmp1, vl); + vsse32_v_f32m1(sb + 2, 12 * sizeof(float), _tmp2, vl); + vsse32_v_f32m1(sb + 3, 12 * sizeof(float), _tmp3, vl); + vsse32_v_f32m1(sb + 4, 12 * sizeof(float), _tmp4, vl); + vsse32_v_f32m1(sb + 5, 12 * sizeof(float), _tmp5, vl); + vsse32_v_f32m1(sb + 6, 12 * sizeof(float), _tmp6, vl); + vsse32_v_f32m1(sb + 7, 12 * sizeof(float), _tmp7, vl); + vsse32_v_f32m1(sb + 8, 12 * sizeof(float), _tmp8, vl); + vsse32_v_f32m1(sb + 9, 12 * sizeof(float), _tmp9, vl); + 
vsse32_v_f32m1(sb + 10, 12 * sizeof(float), _tmp10, vl); + vsse32_v_f32m1(sb + 11, 12 * sizeof(float), _tmp11, vl); + tm1 += n * packn; + sb += 12 * packn; + } + } + for (; t + 7 < n; t += 8) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vsseg8e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, vl); + tm1 += n * packn; + sb += 8 * packn; + } + } + for (; t + 3 < n; t += 4) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vsseg4e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += n * packn; + sb += 4 * packn; + } + } + for (; t + 1 < n; t += 2) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vsseg2e32_v_f32m1(sb, _tmp0, _tmp1, vl); + tm1 += n * packn; + sb += 2 * packn; + } + } + for (; t < n; t++) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vse32_v_f32m1(sb, _tmp0, vl); + tm1 += n * packn; + sb += 1 * packn; + } + } +} + +void shl_rvv_reorder_input_z12_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = 
vsetvl_e16m1(packn); + + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vfloat16m1_t _tmp8 = vle16_v_f16m1(tm1 + packn * 8, vl); + vfloat16m1_t _tmp9 = vle16_v_f16m1(tm1 + packn * 9, vl); + vfloat16m1_t _tmp10 = vle16_v_f16m1(tm1 + packn * 10, vl); + vfloat16m1_t _tmp11 = vle16_v_f16m1(tm1 + packn * 11, vl); + + vsse16_v_f16m1(sb, 12 * sizeof(__fp16), _tmp0, vl); + vsse16_v_f16m1(sb + 1, 12 * sizeof(__fp16), _tmp1, vl); + vsse16_v_f16m1(sb + 2, 12 * sizeof(__fp16), _tmp2, vl); + vsse16_v_f16m1(sb + 3, 12 * sizeof(__fp16), _tmp3, vl); + vsse16_v_f16m1(sb + 4, 12 * sizeof(__fp16), _tmp4, vl); + vsse16_v_f16m1(sb + 5, 12 * sizeof(__fp16), _tmp5, vl); + vsse16_v_f16m1(sb + 6, 12 * sizeof(__fp16), _tmp6, vl); + vsse16_v_f16m1(sb + 7, 12 * sizeof(__fp16), _tmp7, vl); + vsse16_v_f16m1(sb + 8, 12 * sizeof(__fp16), _tmp8, vl); + vsse16_v_f16m1(sb + 9, 12 * sizeof(__fp16), _tmp9, vl); + vsse16_v_f16m1(sb + 10, 12 * sizeof(__fp16), _tmp10, vl); + vsse16_v_f16m1(sb + 11, 12 * sizeof(__fp16), _tmp11, vl); + tm1 += n * packn; + sb += 12 * packn; + } + } + for (; t + 7 < n; t += 8) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t 
_tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vsseg8e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, vl); + tm1 += n * packn; + sb += 8 * packn; + } + } + for (; t + 3 < n; t += 4) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vsseg4e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += n * packn; + sb += 4 * packn; + } + } + for (; t + 1 < n; t += 2) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vsseg2e16_v_f16m1(sb, _tmp0, _tmp1, vl); + tm1 += n * packn; + sb += 2 * packn; + } + } + for (; t < n; t++) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vse16_v_f16m1(sb, _tmp0, vl); + tm1 += n * packn; + sb += 1 * packn; + } + } +} + +void shl_rvv_reorder_input_z12_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int32_t *dst = (int32_t *)sb; + + int t = 0; + int avl = packn / 4; + for (; t + 11 < n; t += 12) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 12 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 12 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 12 * sizeof(int32_t), 
_col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 12 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 12 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 12 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 12 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 12 * sizeof(int32_t), _col7, avl); + vint32mf2_t _col8 = vle32_v_i32mf2(tm1 + avl * 8, avl); + vsse32_v_i32mf2(dst + 8, 12 * sizeof(int32_t), _col8, avl); + vint32mf2_t _col9 = vle32_v_i32mf2(tm1 + avl * 9, avl); + vsse32_v_i32mf2(dst + 9, 12 * sizeof(int32_t), _col9, avl); + vint32mf2_t _cola = vle32_v_i32mf2(tm1 + avl * 10, avl); + vsse32_v_i32mf2(dst + 10, 12 * sizeof(int32_t), _cola, avl); + vint32mf2_t _colb = vle32_v_i32mf2(tm1 + avl * 11, avl); + vsse32_v_i32mf2(dst + 11, 12 * sizeof(int32_t), _colb, avl); + + dst += 12 * avl; + tm1 += n * avl; + } + } + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = 
vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + + dst += 2 * avl; + tm1 += n * avl; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + + dst += 1 * avl; + tm1 += n * avl; + } + } +#endif +} + +/************************************************************** + * input—matrix: [k, n] + * src: b [k/packn/2, n, packn/2] + * dst: sb [n/12, k, 12] + * Data arrangement: Z12 Z8 Z4 Z2 Z1 + **************************************************************/ +void shl_rvv_reorder_input_z12_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2 / 2; + const int 
vl = vsetvl_e8mf4(packn); + int32_t *dst = (int32_t *)sb; + + int t = 0; + int avl = packn / 4; + for (; t + 11 < n; t += 12) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 12 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 12 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 12 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 12 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 12 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 12 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 12 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 12 * sizeof(int32_t), _col7, avl); + vint32mf2_t _col8 = vle32_v_i32mf2(tm1 + avl * 8, avl); + vsse32_v_i32mf2(dst + 8, 12 * sizeof(int32_t), _col8, avl); + vint32mf2_t _col9 = vle32_v_i32mf2(tm1 + avl * 9, avl); + vsse32_v_i32mf2(dst + 9, 12 * sizeof(int32_t), _col9, avl); + vint32mf2_t _cola = vle32_v_i32mf2(tm1 + avl * 10, avl); + vsse32_v_i32mf2(dst + 10, 12 * sizeof(int32_t), _cola, avl); + vint32mf2_t _colb = vle32_v_i32mf2(tm1 + avl * 11, avl); + vsse32_v_i32mf2(dst + 11, 12 * sizeof(int32_t), _colb, avl); + + dst += 12 * avl; + tm1 += n * avl; + } + } + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + 
vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + + dst += 2 * avl; + tm1 += n * avl; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = 
vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + + dst += 1 * avl; + tm1 += n * avl; + } + } +#endif +} diff --git a/source/thead_rvv/setup.c b/source/thead_rvv/setup.c index 28f15b70..f75220c1 100644 --- a/source/thead_rvv/setup.c +++ b/source/thead_rvv/setup.c @@ -16,377 +16,137 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -void *csi_init_map_rvv(int op, int dtype) -{ - if (op == CSINN_OP_CONV2D || op == CSINN_OP_GROUP_CONV2D) { - return csi_nn_rvv_conv2d_init; - } else if (op == CSINN_OP_DEPTHWISE_CONV2D) { - return csi_nn_rvv_depthwise_conv2d_init; - } else if (op == CSINN_OP_MAXPOOL2D) { - return csi_nn_rvv_maxpool2d_init; - } else if (op == CSINN_OP_AVGPOOL2D) { - return csi_nn_rvv_avgpool2d_init; - } else if (op == CSINN_OP_FULLYCONNECTED) { - return csi_nn_rvv_fullyconnected_init; - } else if (op == CSINN_OP_CONV2D_RELU) { - if (dtype == CSINN_DTYPE_INT8 || dtype == CSINN_DTYPE_INT4) { - return csi_nn_rvv_conv2d_init; - } - } else if (op == CSINN_OP_DEPTHWISE_CONV2D_RELU) { - if (dtype == CSINN_DTYPE_INT8 || dtype == CSINN_DTYPE_INT4) { - return csi_nn_rvv_depthwise_conv2d_init; - } - } - return NULL; -} +#define RVV_OP_PATTERN_MAX 80 +static struct csinn_callback __rvv_cb_table[RVV_OP_PATTERN_MAX]; +static int __rvv_cb_key[RVV_OP_PATTERN_MAX]; -static void *setup_bc_map() +void shl_rvv_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, void *exec, + void *est) { - static void *bc_map[CSINN_OP_AND_UTILS_SIZE][4]; - - bc_map[CSINN_OP_ABS][3] = csi_ref_abs_f32; - bc_map[CSINN_OP_ACOS][3] = csi_ref_acos_f32; - bc_map[CSINN_OP_ACOSH][3] = csi_ref_acosh_f32; - bc_map[CSINN_OP_ADD][3] = csi_nn_rvv_add_fp32; - bc_map[CSINN_OP_ARANGE][3] = csi_ref_arange_f32; - bc_map[CSINN_OP_ARGMAX][3] = csi_ref_argmax_stride_i32_f32; - bc_map[CSINN_OP_ARGMIN][3] = csi_ref_argmin_stride_i32_f32; - bc_map[CSINN_OP_ASIN][3] = 
csi_ref_asin_f32; - bc_map[CSINN_OP_ASINH][3] = csi_ref_asinh_f32; - bc_map[CSINN_OP_ATAN][3] = csi_ref_atan_f32; - bc_map[CSINN_OP_ATANH][3] = csi_ref_atanh_f32; - bc_map[CSINN_OP_AVGPOOL2D][3] = csi_ref_avgpool2d_f32; - bc_map[CSINN_OP_AVGPOOL3D][3] = csi_ref_avgpool3d_f32; - bc_map[CSINN_OP_BN][3] = csi_ref_batch_normalization_f32; - bc_map[CSINN_OP_BATCH_TO_SPACE][3] = csi_ref_batch_to_space_f32; - bc_map[CSINN_OP_BROADCOST][3] = csi_ref_broadcast_to_f32; - bc_map[CSINN_OP_CEIL][3] = csi_ref_ceil_f32; - bc_map[CSINN_OP_CLIP][3] = csi_ref_clip_f32; - bc_map[CSINN_OP_COL2IM][3] = csi_ref_col2im_f32; - bc_map[CSINN_OP_CONCAT][3] = csi_nn_rvv_concat_fp32; - bc_map[CSINN_OP_CONV2D][3] = csi_ref_conv2d_f32; - bc_map[CSINN_OP_CONV2D_RELU][3] = csi_ref_conv2d_relu_f32; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][3] = csi_ref_depthwise_conv2d_f32; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][3] = csi_ref_depthwise_conv2d_relu_f32; - bc_map[CSINN_OP_GROUP_CONV2D][3] = csi_ref_group_conv2d_f32; - bc_map[CSINN_OP_CONV3D][3] = csi_ref_conv3d_f32; - bc_map[CSINN_OP_DECONV2D][3] = csi_ref_deconv2d_f32; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D][3] = csi_ref_depthwise_deconv2d_f32; - bc_map[CSINN_OP_DECONV3D][3] = csi_ref_deconv3d_f32; - bc_map[CSINN_OP_COS][3] = csi_ref_cos_f32; - bc_map[CSINN_OP_COSH][3] = csi_ref_cosh_f32; - bc_map[CSINN_OP_CUMPROD][3] = csi_ref_cumprod_f32; - bc_map[CSINN_OP_CUMSUM][3] = csi_ref_cumsum_f32; - bc_map[CSINN_OP_DEPTH_TO_SPACE][3] = csi_ref_depth_to_space_f32; - bc_map[CSINN_OP_DIV][3] = csi_ref_div_f32; - bc_map[CSINN_OP_ELU][3] = csi_ref_elu_f32; - bc_map[CSINN_OP_EQUANL][3] = csi_ref_equal_f32; - bc_map[CSINN_OP_ERF][3] = csi_ref_erf_f32; - bc_map[CSINN_OP_EXP][3] = csi_ref_exp_f32; - bc_map[CSINN_OP_EXPAND_DIMS][3] = csi_ref_expand_dims_f32; - bc_map[CSINN_OP_EXPM1][3] = csi_ref_expm1_f32; - bc_map[CSINN_OP_FLATTEN][3] = csi_ref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE][3] = csi_ref_floor_divide_f32; - bc_map[CSINN_OP_FLOOR_MOD][3] = csi_ref_floor_mod_f32; 
- bc_map[CSINN_OP_FLOOR][3] = csi_ref_floor_f32; - bc_map[CSINN_OP_FSMN][3] = csi_ref_fsmn_f32; - bc_map[CSINN_OP_FULLYCONNECTED][3] = csi_ref_fullyconnected_f32; - bc_map[CSINN_OP_GATHER_ND][3] = csi_ref_gather_nd_f32; - bc_map[CSINN_OP_GATHER][3] = csi_ref_gather_f32; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][3] = csi_nn_rvv_global_avgpool2d_fp32; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D][3] = csi_ref_global_maxpool2d_f32; - bc_map[CSINN_OP_GREATHER_EQUAL][3] = csi_ref_greater_equal_f32; - bc_map[CSINN_OP_GREATHER][3] = csi_ref_greater_f32; - bc_map[CSINN_OP_HARD_SIGMOID][3] = csi_ref_hard_sigmoid_f32; - bc_map[CSINN_OP_IM2COL][3] = csi_ref_im2col_f32; - bc_map[CSINN_OP_ISNAN][3] = csi_ref_isnan_bool_f32; - bc_map[CSINN_OP_L2N][3] = csi_ref_l2_normalization_f32; - bc_map[CSINN_OP_L2POOL2D][3] = csi_ref_l2pool_f32; - bc_map[CSINN_OP_LEAKY_RELU][3] = csi_nn_rvv_leaky_relu_fp32; - bc_map[CSINN_OP_LESS_EQUAL][3] = csi_ref_less_equal_f32; - bc_map[CSINN_OP_LESS][3] = csi_ref_less_f32; - bc_map[CSINN_OP_LOG_SOFTMAX][3] = csi_ref_log_softmax_f32; - bc_map[CSINN_OP_LOG][3] = csi_ref_log_f32; - bc_map[CSINN_OP_LOG1P][3] = csi_ref_log1p_f32; - bc_map[CSINN_OP_LOGICAL_AND][3] = csi_ref_logical_and_f32; - bc_map[CSINN_OP_LOGICAL_NOT][3] = csi_ref_logical_not_f32; - bc_map[CSINN_OP_LOGICAL_OR][3] = csi_ref_logical_or_f32; - bc_map[CSINN_OP_LOGICAL_XOR][3] = csi_ref_logical_xor_f32; - bc_map[CSINN_OP_LRN][3] = csi_ref_lrn_f32; - bc_map[CSINN_OP_MATMUL][3] = csi_ref_matmul_f32; - bc_map[CSINN_OP_MAX][3] = csi_ref_max_stride_f32; - bc_map[CSINN_OP_MAXIMUM][3] = csi_ref_maximum_f32; - bc_map[CSINN_OP_MAXPOOL2D][3] = csi_ref_maxpool2d_f32; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT][3] = csi_ref_maxpool2d_locat_f32; - bc_map[CSINN_OP_MAXPOOL3D][3] = csi_ref_maxpool3d_f32; - bc_map[CSINN_OP_MEAN][3] = csi_ref_mean_stride_f32; - bc_map[CSINN_OP_MEAN_STRIDE][3] = csi_ref_mean_stride_f32; - bc_map[CSINN_OP_MINIMUM][3] = csi_ref_minimum_f32; - bc_map[CSINN_OP_MOD][3] = csi_ref_mod_f32; - 
bc_map[CSINN_OP_MUL][3] = csi_ref_mul_f32; - bc_map[CSINN_OP_NDARRAY_SIZE][3] = csi_ref_ndarray_size_f32; - bc_map[CSINN_OP_NEGATIIVE][3] = csi_ref_negative_f32; - bc_map[CSINN_OP_NOT_EQUAL][3] = csi_ref_not_equal_f32; - bc_map[CSINN_OP_PAD][3] = csi_ref_pad_f32; - bc_map[CSINN_OP_POWER][3] = csi_ref_power_f32; - bc_map[CSINN_OP_PRELU][3] = csi_ref_prelu_f32; - bc_map[CSINN_OP_PROD][3] = csi_ref_prod_stride_f32; - bc_map[CSINN_OP_PROPOSAL][3] = csi_ref_proposal_f32; - bc_map[CSINN_OP_PSROIPOOLING][3] = csi_ref_psroipooling_f32; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP][3] = csi_ref_reduce_logsumexp_f32; - bc_map[CSINN_OP_REDUCE_MAX][3] = csi_ref_reduce_max_f32; - bc_map[CSINN_OP_REDUCE_MEAN][3] = csi_ref_reduce_mean_f32; - bc_map[CSINN_OP_REDUCE_MIN][3] = csi_ref_reduce_min_f32; - bc_map[CSINN_OP_REDUCE_PROD][3] = csi_ref_reduce_prod_f32; - bc_map[CSINN_OP_REDUCE_SUM][3] = csi_ref_reduce_sum_f32; - bc_map[CSINN_OP_RELU][3] = csi_nn_rvv_relu_fp32; - bc_map[CSINN_OP_RELU1][3] = csi_ref_relu1_f32; - bc_map[CSINN_OP_RELU6][3] = csi_ref_relu6_f32; - bc_map[CSINN_OP_RELUN][3] = csi_ref_relun_f32; - bc_map[CSINN_OP_RESHAPE][3] = csi_ref_reshape; - bc_map[CSINN_OP_RESIZE][3] = csi_ref_resize_f32; - bc_map[CSINN_OP_REVERSE][3] = csi_ref_reverse_f32; - bc_map[CSINN_OP_ROIALIGN][3] = csi_ref_roi_align_f32; - bc_map[CSINN_OP_ROIPOOL][3] = csi_ref_roipool_f32; - bc_map[CSINN_OP_ROUND][3] = csi_ref_round_f32; - bc_map[CSINN_OP_RSQRT][3] = csi_ref_rsqrt_f32; - bc_map[CSINN_OP_SCATTER_ND][3] = csi_ref_scatter_nd_f32; - bc_map[CSINN_OP_SEGMENT_MAX][3] = csi_ref_segment_max_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX][3] = csi_ref_unsorted_segment_max_f32; - bc_map[CSINN_OP_SEGMENT_MEAN][3] = csi_ref_segment_mean_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][3] = csi_ref_unsorted_segment_mean_f32; - bc_map[CSINN_OP_SEGMENT_MIN][3] = csi_ref_segment_min_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN][3] = csi_ref_unsorted_segment_min_f32; - bc_map[CSINN_OP_SEGMENT_PROD][3] = 
csi_ref_segment_prod_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD][3] = csi_ref_unsorted_segment_prod_f32; - bc_map[CSINN_OP_SEGMENT_SUM][3] = csi_ref_segment_sum_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM][3] = csi_ref_unsorted_segment_sum_f32; - bc_map[CSINN_OP_SELECT][3] = csi_ref_select_f32; - bc_map[CSINN_OP_SHUFFLE_CHANNEL][3] = csi_ref_shuffle_channel_f32; - bc_map[CSINN_OP_SIGMOID][3] = csi_ref_sigmoid_f32; - bc_map[CSINN_OP_SIGN][3] = csi_ref_sign_f32; - bc_map[CSINN_OP_SIN][3] = csi_ref_sin_f32; - bc_map[CSINN_OP_SINH][3] = csi_ref_sinh_f32; - bc_map[CSINN_OP_SLICE][3] = csi_ref_slice_f32; - bc_map[CSINN_OP_SOFTMAX][3] = csi_ref_softmax_f32; - bc_map[CSINN_OP_SOFTPLUS][3] = csi_ref_softplus_f32; - bc_map[CSINN_OP_SOFTRELU][3] = csi_ref_softrelu_f32; - bc_map[CSINN_OP_SOFTSIGN][3] = csi_ref_softsign_f32; - bc_map[CSINN_OP_SPACE_TO_BATCH][3] = csi_ref_space_to_batch_f32; - bc_map[CSINN_OP_SPACE_TO_DEPTH][3] = csi_ref_space_to_depth_f32; - bc_map[CSINN_OP_SPLIT][3] = csi_ref_split_f32; - bc_map[CSINN_OP_SQRT][3] = csi_ref_sqrt_f32; - bc_map[CSINN_OP_SQUARE][3] = csi_ref_square_f32; - bc_map[CSINN_OP_SQUEEZE][3] = csi_ref_squeeze; - bc_map[CSINN_OP_STACK][3] = csi_ref_stack_f32; - bc_map[CSINN_OP_STRIDED_SLICE][3] = csi_ref_strided_slice_f32; - bc_map[CSINN_OP_SUB][3] = csi_ref_sub_f32; - bc_map[CSINN_OP_SUM][3] = csi_ref_sum_stride_f32; - bc_map[CSINN_OP_TAN][3] = csi_ref_tan_f32; - bc_map[CSINN_OP_TANH][3] = csi_ref_tanh_f32; - bc_map[CSINN_OP_THRESHOLD_RELU][3] = csi_ref_threshold_relu_f32; - bc_map[CSINN_OP_TILE][3] = csi_ref_tile_f32; - bc_map[CSINN_OP_TOPK][3] = csi_ref_topk_f32; - bc_map[CSINN_OP_TRUNC][3] = csi_ref_trunc_f32; - bc_map[CSINN_OP_TRANSPOSE][3] = csi_ref_transpose; - bc_map[CSINN_OP_TRUNC][3] = csi_ref_trunc_f32; - bc_map[CSINN_OP_UNPOOLING][3] = csi_ref_unpooling_f32; - bc_map[CSINN_OP_UNSTACK][3] = csi_ref_unstack_f32; - bc_map[CSINN_OP_YUV_RGB_SCALE][3] = csi_ref_yuv_rgb_scale_f32; - - for (int i = 0; i < 3; i++) { - 
bc_map[CSINN_OP_ABS][i] = csi_ref_abs_quant; - bc_map[CSINN_OP_ACOS][i] = csi_ref_acos_quant; - bc_map[CSINN_OP_ACOSH][i] = csi_ref_acosh_quant; - bc_map[CSINN_OP_ADD][i] = csi_ref_add_quant; - bc_map[CSINN_OP_ARANGE][i] = csi_ref_arange_quant; - bc_map[CSINN_OP_ARGMAX][i] = csi_ref_argmax_stride_quant; - bc_map[CSINN_OP_ARGMIN][i] = csi_ref_argmin_stride_quant; - bc_map[CSINN_OP_ASIN][i] = csi_ref_asin_quant; - bc_map[CSINN_OP_ASINH][i] = csi_ref_asinh_quant; - bc_map[CSINN_OP_ATAN][i] = csi_ref_atan_quant; - bc_map[CSINN_OP_ATANH][i] = csi_ref_atanh_quant; - bc_map[CSINN_OP_AVGPOOL2D][i] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_AVGPOOL3D][i] = csi_ref_avgpool3d_quant; - bc_map[CSINN_OP_BN][i] = csi_ref_batch_normalization_quant; - bc_map[CSINN_OP_BATCH_TO_SPACE][i] = csi_ref_batch_to_space_quant; - bc_map[CSINN_OP_BROADCOST][i] = csi_ref_broadcast_to_quant; - bc_map[CSINN_OP_CEIL][i] = csi_ref_ceil_quant; - bc_map[CSINN_OP_CLIP][i] = csi_ref_clip_quant; - bc_map[CSINN_OP_CONCAT][i] = csi_ref_concat_quant; - bc_map[CSINN_OP_CONV2D][i] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_CONV2D_RELU][i] = csi_ref_conv2d_relu_quant; - bc_map[CSINN_OP_CONV2D_RELU6][i] = csi_ref_conv2d_relu6_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][i] = csi_ref_depthwise_conv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i] = csi_ref_depthwise_conv2d_relu_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i] = csi_ref_depthwise_conv2d_relu6_quant; - bc_map[CSINN_OP_GROUP_CONV2D][i] = csi_ref_group_conv2d_quant; - bc_map[CSINN_OP_CONV3D][i] = csi_ref_conv3d_quant; - bc_map[CSINN_OP_DECONV2D][i] = csi_ref_deconv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D][i] = csi_ref_depthwise_deconv2d_quant; - bc_map[CSINN_OP_DECONV3D][i] = csi_ref_deconv3d_quant; - bc_map[CSINN_OP_COS][i] = csi_ref_cos_quant; - bc_map[CSINN_OP_COSH][i] = csi_ref_cosh_quant; - bc_map[CSINN_OP_CUMPROD][i] = csi_ref_cumprod_quant; - bc_map[CSINN_OP_CUMSUM][i] = csi_ref_cumsum_quant; - bc_map[CSINN_OP_DEPTH_TO_SPACE][i] 
= csi_ref_depth_to_space_quant; - bc_map[CSINN_OP_DIV][i] = csi_ref_div_quant; - bc_map[CSINN_OP_ELU][i] = csi_ref_elu_quant; - bc_map[CSINN_OP_EQUANL][i] = csi_ref_equal_quant; - bc_map[CSINN_OP_ERF][i] = csi_ref_erf_quant; - bc_map[CSINN_OP_EXP][i] = csi_ref_exp_quant; - bc_map[CSINN_OP_EXPAND_DIMS][i] = csi_ref_expand_dims_quant; - bc_map[CSINN_OP_EXPM1][i] = csi_ref_expm1_quant; - bc_map[CSINN_OP_FLATTEN][i] = csi_ref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE][i] = csi_ref_floor_divide_quant; - bc_map[CSINN_OP_FLOOR_MOD][i] = csi_ref_floor_mod_quant; - bc_map[CSINN_OP_FLOOR][i] = csi_ref_floor_quant; - bc_map[CSINN_OP_FSMN][i] = csi_ref_fsmn_quant; - bc_map[CSINN_OP_FULLYCONNECTED][i] = csi_ref_fullyconnected_quant; - bc_map[CSINN_OP_GATHER_ND][i] = csi_ref_gather_nd_quant; - bc_map[CSINN_OP_GATHER][i] = csi_ref_gather_quant; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][i] = csi_ref_global_avgpool2d_quant; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D][i] = csi_ref_global_maxpool2d_quant; - bc_map[CSINN_OP_GREATHER_EQUAL][i] = csi_ref_greater_equal_quant; - bc_map[CSINN_OP_GREATHER][i] = csi_ref_greater_quant; - bc_map[CSINN_OP_HARD_SIGMOID][i] = csi_ref_hard_sigmoid_quant; - bc_map[CSINN_OP_IM2COL][i] = csi_ref_im2col_quant; - bc_map[CSINN_OP_L2N][i] = csi_ref_l2_normalization_quant; - bc_map[CSINN_OP_LEAKY_RELU][i] = csi_ref_leaky_relu_quant; - bc_map[CSINN_OP_LESS_EQUAL][i] = csi_ref_less_equal_quant; - bc_map[CSINN_OP_LESS][i] = csi_ref_less_quant; - bc_map[CSINN_OP_LOG_SOFTMAX][i] = csi_ref_log_softmax_quant; - bc_map[CSINN_OP_LOG][i] = csi_ref_log_quant; - bc_map[CSINN_OP_LOG1P][i] = csi_ref_log1p_quant; - bc_map[CSINN_OP_LOGICAL_AND][i] = csi_ref_logical_and_quant; - bc_map[CSINN_OP_LOGICAL_NOT][i] = csi_ref_logical_not_quant; - bc_map[CSINN_OP_LOGICAL_OR][i] = csi_ref_logical_or_quant; - bc_map[CSINN_OP_LOGICAL_XOR][i] = csi_ref_logical_xor_quant; - bc_map[CSINN_OP_LRN][i] = csi_ref_lrn_quant; - bc_map[CSINN_OP_MATMUL][i] = csi_ref_matmul_quant; - bc_map[CSINN_OP_MAX][i] = 
csi_ref_max_stride_quant; - bc_map[CSINN_OP_MAXIMUM][i] = csi_ref_maximum_quant; - bc_map[CSINN_OP_MAXPOOL2D][i] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT][i] = csi_ref_maxpool2d_locat_quant; - bc_map[CSINN_OP_MAXPOOL3D][i] = csi_ref_maxpool3d_quant; - bc_map[CSINN_OP_MEAN][i] = csi_ref_mean_stride_quant; - bc_map[CSINN_OP_MEAN_STRIDE][i] = csi_ref_mean_stride_quant; - bc_map[CSINN_OP_MIN][i] = csi_ref_min_stride_quant; - bc_map[CSINN_OP_MINIMUM][i] = csi_ref_minimum_quant; - bc_map[CSINN_OP_MOD][i] = csi_ref_mod_quant; - bc_map[CSINN_OP_MUL][i] = csi_ref_mul_quant; - bc_map[CSINN_OP_NEGATIIVE][i] = csi_ref_negative_quant; - bc_map[CSINN_OP_NOT_EQUAL][i] = csi_ref_not_equal_quant; - bc_map[CSINN_OP_PAD][i] = csi_ref_pad_quant; - bc_map[CSINN_OP_POWER][i] = csi_ref_power_quant; - bc_map[CSINN_OP_PRELU][i] = csi_ref_prelu_quant; - bc_map[CSINN_OP_PROD][i] = csi_ref_prod_stride_quant; - bc_map[CSINN_OP_PROPOSAL][i] = csi_ref_proposal_quant; - bc_map[CSINN_OP_PSROIPOOLING][i] = csi_ref_psroipooling_quant; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP][i] = csi_ref_reduce_logsumexp_quant; - bc_map[CSINN_OP_REDUCE_MAX][i] = csi_ref_reduce_max_quant; - bc_map[CSINN_OP_REDUCE_MEAN][i] = csi_ref_reduce_mean_quant; - bc_map[CSINN_OP_REDUCE_MIN][i] = csi_ref_reduce_min_quant; - bc_map[CSINN_OP_REDUCE_PROD][i] = csi_ref_reduce_prod_quant; - bc_map[CSINN_OP_REDUCE_SUM][i] = csi_ref_reduce_sum_quant; - bc_map[CSINN_OP_RELU][i] = csi_ref_relu_quant; - bc_map[CSINN_OP_RELU1][i] = csi_ref_relu1_quant; - bc_map[CSINN_OP_RELU6][i] = csi_ref_relu6_quant; - bc_map[CSINN_OP_RELUN][i] = csi_ref_relun_quant; - bc_map[CSINN_OP_RESHAPE][i] = csi_ref_reshape_quant; - bc_map[CSINN_OP_RESIZE][i] = csi_ref_resize_quant; - bc_map[CSINN_OP_REVERSE][i] = csi_ref_reverse_quant; - bc_map[CSINN_OP_ROIPOOL][i] = csi_ref_roipool_quant; - bc_map[CSINN_OP_ROUND][i] = csi_ref_round_quant; - bc_map[CSINN_OP_RSQRT][i] = csi_ref_rsqrt_quant; - bc_map[CSINN_OP_SCATTER_ND][i] = 
csi_ref_scatter_nd_quant; - bc_map[CSINN_OP_SEGMENT_MAX][i] = csi_ref_segment_max_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i] = csi_ref_unsorted_segment_max_quant; - bc_map[CSINN_OP_SEGMENT_MEAN][i] = csi_ref_segment_mean_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i] = csi_ref_unsorted_segment_mean_quant; - bc_map[CSINN_OP_SEGMENT_MIN][i] = csi_ref_segment_min_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i] = csi_ref_unsorted_segment_min_quant; - bc_map[CSINN_OP_SEGMENT_PROD][i] = csi_ref_segment_prod_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i] = csi_ref_unsorted_segment_prod_quant; - bc_map[CSINN_OP_SEGMENT_SUM][i] = csi_ref_segment_sum_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i] = csi_ref_unsorted_segment_sum_quant; - bc_map[CSINN_OP_SHUFFLE_CHANNEL][i] = csi_ref_shuffle_channel_quant; - bc_map[CSINN_OP_SIGMOID][i] = csi_ref_sigmoid_quant; - bc_map[CSINN_OP_SIGN][i] = csi_ref_sign_quant; - bc_map[CSINN_OP_SIN][i] = csi_ref_sin_quant; - bc_map[CSINN_OP_SINH][i] = csi_ref_sinh_quant; - bc_map[CSINN_OP_SLICE][i] = csi_ref_slice_quant; - bc_map[CSINN_OP_SOFTMAX][i] = csi_ref_softmax_quant; - bc_map[CSINN_OP_SOFTPLUS][i] = csi_ref_softplus_quant; - bc_map[CSINN_OP_SOFTRELU][i] = csi_ref_softrelu_quant; - bc_map[CSINN_OP_SOFTSIGN][i] = csi_ref_softsign_quant; - bc_map[CSINN_OP_SPACE_TO_BATCH][i] = csi_ref_space_to_batch_quant; - bc_map[CSINN_OP_SPACE_TO_DEPTH][i] = csi_ref_space_to_depth_quant; - bc_map[CSINN_OP_SPLIT][i] = csi_ref_split_quant; - bc_map[CSINN_OP_SQRT][i] = csi_ref_sqrt_quant; - bc_map[CSINN_OP_STACK][i] = csi_ref_stack_quant; - bc_map[CSINN_OP_STRIDED_SLICE][i] = csi_ref_strided_slice_quant; - bc_map[CSINN_OP_SUB][i] = csi_ref_sub_quant; - bc_map[CSINN_OP_SUM][i] = csi_ref_sum_stride_quant; - bc_map[CSINN_OP_TAN][i] = csi_ref_tan_quant; - bc_map[CSINN_OP_TANH][i] = csi_ref_tanh_quant; - bc_map[CSINN_OP_THRESHOLD_RELU][i] = csi_ref_threshold_relu_quant; - bc_map[CSINN_OP_TILE][i] = csi_ref_tile_quant; - bc_map[CSINN_OP_TOPK][i] 
= csi_ref_topk_quant; - bc_map[CSINN_OP_TRUNC][i] = csi_ref_trunc_quant; - bc_map[CSINN_OP_TRANSPOSE][i] = csi_ref_transpose_quant; - bc_map[CSINN_OP_TRUNC][i] = csi_ref_trunc_quant; - bc_map[CSINN_OP_UNPOOLING][i] = csi_ref_unpooling_quant; - bc_map[CSINN_OP_UNSTACK][i] = csi_ref_unstack_qunat; - bc_map[CSINN_OP_YUV_RGB_SCALE][i] = csi_ref_yuv_rgb_scale_quant; - } - // fp16 opt interface - bc_map[CSINN_OP_ADD][2] = csi_nn_rvv_add_fp16; - bc_map[CSINN_OP_CONCAT][2] = csi_nn_rvv_concat_fp16; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][2] = csi_nn_rvv_global_avgpool2d_fp16; - bc_map[CSINN_OP_LEAKY_RELU][2] = csi_nn_rvv_leaky_relu_fp16; - bc_map[CSINN_OP_RELU][2] = csi_nn_rvv_relu_fp16; - // int8 opt interface - bc_map[CSINN_OP_ADD][1] = csi_nn_rvv_add_int8; - bc_map[CSINN_OP_CONCAT][1] = csi_nn_rvv_concat_int8; - bc_map[CSINN_OP_LEAKY_RELU][1] = csi_nn_rvv_leaky_relu_int8; - bc_map[CSINN_OP_RELU][1] = csi_nn_rvv_relu_int8; - // int4 opt interface - - return bc_map; + static int i = 0; + __rvv_cb_key[i] = op_name * CSINN_DTYPE_SIZE + dtype; + __rvv_cb_table[i].init = init; + __rvv_cb_table[i].exec = exec; + __rvv_cb_table[i].est = est; + i++; } -static int get_bc_map_index(int op, int dtype) +struct csinn_callback *shl_cb_map_ref(int op, int dtype); +struct csinn_callback *shl_cb_map_rvv(int op, int dtype) { - switch (dtype) { - case CSINN_DTYPE_INT4: - return op * 4; + struct csinn_callback *cb = NULL; + for (int i = 0; i < RVV_OP_PATTERN_MAX; i++) { + if (__rvv_cb_key[i] == (op * CSINN_DTYPE_SIZE + dtype)) { + cb = &__rvv_cb_table[i]; break; - case CSINN_DTYPE_INT8: - return op * 4 + 1; - break; - case CSINN_DTYPE_FLOAT16: - return op * 4 + 2; - break; - case CSINN_DTYPE_FLOAT32: - return op * 4 + 3; - break; - default: - return CSINN_UNSUPPORT_DTYPE; + } + } + if ((cb == NULL) || (cb->est == NULL && (cb->init == NULL || cb->exec == NULL))) { + cb = shl_cb_map_ref(op, dtype); } + return cb; } -void *csi_bc_map_rvv(int op, int dtype) +void shl_target_init_rvv() { - static 
int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_rvv_conv2d_init_fp32, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_rvv_conv2d_init_fp16, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D, shl_rvv_conv2d_init_int8, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D, shl_rvv_conv2d_init_int4, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_fp32, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_fp16, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_int8, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_int4, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, + shl_rvv_depthwise_conv2d_init_fp32, NULL, shl_gref_depthwise_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, + shl_rvv_depthwise_conv2d_init_fp16, NULL, shl_gref_depthwise_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D, shl_rvv_depthwise_conv2d_init_int8, + NULL, shl_gref_depthwise_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D, shl_rvv_depthwise_conv2d_init_int4, + NULL, shl_gref_depthwise_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_fp32, NULL, + shl_gref_maxpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_fp16, NULL, + shl_gref_maxpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_int8, NULL, + shl_gref_maxpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_MAXPOOL2D, 
shl_rvv_maxpool2d_init_int4, NULL, + shl_gref_maxpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_fp32, NULL, + shl_gref_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_fp16, NULL, + shl_gref_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_int8, NULL, + shl_gref_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_int4, NULL, + shl_gref_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init, NULL, + shl_gref_fullyconnected); + + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D_RELU, shl_rvv_conv2d_init_int8, NULL, + shl_gref_conv2d_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D_RELU, shl_rvv_conv2d_init_int4, NULL, + shl_gref_conv2d_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D_RELU, + shl_rvv_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D_RELU, + shl_rvv_depthwise_conv2d_init_int4, NULL, shl_gref_depthwise_conv2d_relu); + + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, NULL, shl_rvv_add_fp32, shl_gref_add); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, NULL, shl_rvv_add_fp16, shl_gref_add); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_ADD, NULL, shl_rvv_add_int8, shl_gref_add); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, NULL, shl_rvv_mul_fp32, shl_gref_mul); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, NULL, shl_rvv_mul_fp16, shl_gref_mul); + 
shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MUL, NULL, shl_rvv_mul_int8, shl_gref_mul); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, NULL, shl_rvv_concat_fp32, + shl_gref_concat); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, NULL, shl_rvv_concat_fp16, + shl_gref_concat); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONCAT, NULL, shl_rvv_concat_int8, shl_gref_concat); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, NULL, shl_rvv_leaky_relu_fp32, + shl_gref_leaky_relu); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, NULL, shl_rvv_leaky_relu_fp16, + shl_gref_leaky_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_LEAKY_RELU, NULL, shl_rvv_leaky_relu_int8, + shl_gref_leaky_relu); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, NULL, shl_rvv_relu_fp32, shl_gref_relu); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, NULL, shl_rvv_relu_fp16, shl_gref_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_RELU, NULL, shl_rvv_relu_int8, shl_gref_relu); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, NULL, shl_rvv_relu6_fp32, shl_gref_relu6); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, NULL, shl_rvv_relu6_fp16, shl_gref_relu6); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_RELU6, NULL, shl_rvv_relu6_int8, shl_gref_relu6); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init, + NULL, shl_gref_global_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init, + NULL, shl_gref_global_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init, NULL, + shl_gref_global_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIGMOID, NULL, shl_rvv_sigmoid_fp16, + shl_gref_sigmoid); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTMAX, NULL, shl_rvv_softmax_fp16, + shl_gref_softmax); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SUM, NULL, shl_rvv_sum_stride_int8, shl_gref_sum); + + 
shl_register_runtime_callback(CSINN_RVV, NULL); + shl_register_op_callback(CSINN_RVV, shl_cb_map_rvv); + shl_register_runtime_callback(CSINN_RVV, shl_gref_runtime_callback); } diff --git a/source/thead_rvv/sigmoid.c b/source/thead_rvv/sigmoid.c index 5cb8575d..503eecf8 100644 --- a/source/thead_rvv/sigmoid.c +++ b/source/thead_rvv/sigmoid.c @@ -16,26 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" #include "rvv_mathfun.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_sigmoid_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_rvv_sigmoid_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - __fp16 *input_data = input->data; - __fp16 *output_data = output->data; - int size = 1; - for (int i = 0; i < input->dim_count; i++) { - size = size * input->dim[i]; - } + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + int size = csinn_tensor_size(input); while (size > 0) { size_t vl = vsetvl_e16m2(size); vfloat16m2_t _val = vle16_v_f16m2(input_data, vl); // val - _val = vfmul_vf_f16m2(_val, -1.0f, 16); + _val = vfmul_vf_f16m2(_val, -1.0f, vl); vfloat16m2_t _output_data = exp_ps_vfloat16m2(_val, vl); _output_data = vfadd_vf_f16m2(_output_data, 1.0f, vl); _output_data = vfrdiv_vf_f16m2(_output_data, 1.0f, vl); diff --git a/source/thead_rvv/softmax.c b/source/thead_rvv/softmax.c index 3d860e90..bb4a316c 100644 --- a/source/thead_rvv/softmax.c +++ b/source/thead_rvv/softmax.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" #include "rvv_mathfun.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_softmax_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_rvv_softmax_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/sum.c b/source/thead_rvv/sum.c index a113c536..d0f61124 100644 --- a/source/thead_rvv/sum.c +++ b/source/thead_rvv/sum.c @@ -16,19 +16,19 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_sum_stride_int8(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_rvv_sum_stride_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; // TODO: move to init api float real_scale = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); + shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); if (*(params->axis) == -1) { int size = 1; diff --git a/source/thead_rvv/utils.c b/source/thead_rvv/utils.c index b4b3e790..788dbdb5 100644 --- a/source/thead_rvv/utils.c +++ b/source/thead_rvv/utils.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" int csrr_vl() { @@ -34,180 +34,9 @@ int csrr_vlenb() return a; } -/* params: - input: origin input data - input_padded: input data after pad - inc: origin input channel - inh: origin input height - inw: origin input width - padded_h: input height after pad - padded_w: input width after pad - pad_top: origin pad top - pad_left: origin pad left -*/ -void csi_nn_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left) -{ - int padded_hw = padded_h * padded_w; - - float *pad_ptr = input_padded; - float *inp_ptr = (float *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) - int size; - int vl = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); - vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < inc; c++) { - pad_ptr = input_padded + c * padded_hw; - // pad h_top - size = padded_w * pad_top; - while (size > 0) { - vl = vsetvl_e32m1(size); - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += vl; - size -= vl; - } - // pad h_mid - for (int h = 0; h < inh; h++) { - // pad w_left - memset(pad_ptr, 0, pad_left * sizeof(float)); - pad_ptr += pad_left; - // pad w_mid - size = inw; - while (size > 0) { - vl = vsetvl_e32m1(size); - vfloat32m1_t _input = vle32_v_f32m1(inp_ptr, vl); - inp_ptr += vl; - vse32_v_f32m1(pad_ptr, _input, vl); - pad_ptr += vl; - size -= vl; - } - // pad w_end - memset(pad_ptr, 0, resi_w * sizeof(float)); - pad_ptr += resi_w; - } - // pad h_bottom - size = padded_w * resi_h; - while (size > 0) { - vl = vsetvl_e32m1(size); - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += vl; - size -= vl; - } - } -} - -void csi_nn_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int 
pad_top, int pad_left) -{ - int padded_hw = padded_h * padded_w; - - __fp16 *pad_ptr = input_padded; - __fp16 *inp_ptr = (__fp16 *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) - int size; - int vl = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); - vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < inc; c++) { - pad_ptr = input_padded + c * padded_hw; - // pad h_top - size = padded_w * pad_top; - while (size > 0) { - vl = vsetvl_e16m1(size); - vse16_v_f16m1(pad_ptr, _zero, vl); - pad_ptr += vl; - size -= vl; - } - // pad h_mid - for (int h = 0; h < inh; h++) { - // pad w_left - memset(pad_ptr, 0, pad_left * sizeof(__fp16)); - pad_ptr += pad_left; - // pad w_mid - size = inw; - while (size > 0) { - vl = vsetvl_e16m1(size); - vfloat16m1_t _input = vle16_v_f16m1(inp_ptr, vl); - inp_ptr += vl; - vse16_v_f16m1(pad_ptr, _input, vl); - pad_ptr += vl; - size -= vl; - } - // pad w_end - memset(pad_ptr, 0, resi_w * sizeof(__fp16)); - pad_ptr += resi_w; - } - // pad h_bottom - size = padded_w * resi_h; - while (size > 0) { - vl = vsetvl_e16m1(size); - vse16_v_f16m1(pad_ptr, _zero, vl); - pad_ptr += vl; - size -= vl; - } - } -} - -void csi_nn_rvv_pad_input_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left, - int8_t pad_value) -{ - int padded_hw = padded_h * padded_w; - - int8_t *pad_ptr = input_padded; - int8_t *inp_ptr = (int8_t *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) - int size; - int vl = vsetvl_e8m1(csrr_vlenb() / sizeof(int8_t)); - vint8m1_t _pad_zero = vmv_v_x_i8m1(pad_value, vl); // float 0.0 -> input->zero_point - - for (int c = 0; c < inc; c++) { - pad_ptr = input_padded + c * padded_hw; - // pad h_top - size = padded_w * pad_top; - while (size 
> 0) { - vl = vsetvl_e8m1(size); - vse8_v_i8m1(pad_ptr, _pad_zero, vl); - pad_ptr += vl; - size -= vl; - } - // pad h_mid - for (int h = 0; h < inh; h++) { - // pad w_left - memset(pad_ptr, pad_value, pad_left * sizeof(int8_t)); - pad_ptr += pad_left; - // pad w_mid - size = inw; - while (size > 0) { - vl = vsetvl_e8m1(size); - vint8m1_t _input = vle8_v_i8m1(inp_ptr, vl); - inp_ptr += vl; - vse8_v_i8m1(pad_ptr, _input, vl); - pad_ptr += vl; - size -= vl; - } - // pad w_end - memset(pad_ptr, pad_value, resi_w * sizeof(int8_t)); - pad_ptr += resi_w; - } - // pad h_bottom - size = padded_w * resi_h; - while (size > 0) { - vl = vsetvl_e8m1(size); - vse8_v_i8m1(pad_ptr, _pad_zero, vl); - pad_ptr += vl; - size -= vl; - } - } -} - /********************* for int8 quantization *********************/ // add output_zeropint -void csi_nn_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size) +void shl_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size) { while (size > 0) { int vl = vsetvl_e32m2(size); @@ -226,7 +55,7 @@ void csi_nn_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int si // 再量化 int32 -> int8 // (val * multiplier)/(2 ^ shift) -void csi_nn_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size) +void shl_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size) { while (channel_size > 0) { int vl = vsetvl_e32m4(channel_size); @@ -246,15 +75,12 @@ void csi_nn_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int } // 反量化 int32 -> float32 int8 -> float32 -void csi_nn_rvv_dequantize() -{ - ; -} +void shl_rvv_dequantize() { ; } /********************* int4 easter eggs *********************/ -void csi_nn_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_padded, int inc, - int inh, int inw, int padded_h, int padded_w, int pad_top, - int pad_left, int8_t pad_value) +void shl_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t 
*input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left, int8_t pad_value) { int padded_hw = padded_h * padded_w; @@ -281,7 +107,7 @@ void csi_nn_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_pad memset(pad_ptr, pad_value, size * sizeof(int8_t)); pad_ptr += size; // pad w_mid - csi_nn_rvv_int4_trans_int8(inp_ptr, pad_ptr, inw * inc); + shl_rvv_int4_trans_int8(inp_ptr, pad_ptr, inw * inc); inp_ptr += inw * inc / 2; pad_ptr += inw * inc; // pad w_right @@ -301,7 +127,7 @@ void csi_nn_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_pad // size: int4 number // TODO: 这里是不是需要增加一条指令 -void csi_nn_rvv_int4_to_int8(int8_t *src, int8_t *dst, int size) +void shl_rvv_int4_to_int8(int8_t *src, int8_t *dst, int size) { int j = size / 2; while (j > 0) { @@ -325,7 +151,7 @@ void csi_nn_rvv_int4_to_int8(int8_t *src, int8_t *dst, int size) // size: int4 number // todo: replace with vpnclip_wx inst -void csi_nn_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size) +void shl_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size) { int j = size / 2; while (j > 0) { @@ -349,7 +175,7 @@ void csi_nn_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size) // size: int4 number // TODO: replace with vpwadd.vx inst -void csi_nn_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size) +void shl_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size) { int j = size / 2; while (j > 0) { @@ -371,3 +197,23 @@ void csi_nn_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size) *dst = *src > 7 ? 
(*src - 16) : (*src); } } + +void shl_rvv_saturated_int4(int32_t *src, int8_t *dst, int32_t out_zp, int size) +{ +#ifdef XTHEADV + while (size > 0) { + int vl = vsetvl_e32m8(size); + vint32m8_t _tmp = vle32_v_i32m8(src, vl); + _tmp = vadd_vx_i32m8(_tmp, out_zp, vl); + + vint16m4_t _tmp1 = vnclip_wx_i16m4(_tmp, 0, vl); // narrow 32->16 + vint8m2_t _tmp2 = vnclip_wx_i8m2(_tmp1, 0, vl); // narrow 16->8 + vint8m1_t _res = vpnclip_wx_i8m1(vreinterpret_v_i8m2_i16m2(_tmp2), 0, vl / 2); + + vse8_v_i8m1(dst, _res, vl / 2); + src += vl; + dst += vl / 2; + size -= vl; + } +#endif +} diff --git a/source/utils/atat_malloc.c b/source/utils/atat_malloc.c index 8d90a7ab..3ef51a67 100644 --- a/source/utils/atat_malloc.c +++ b/source/utils/atat_malloc.c @@ -26,39 +26,39 @@ use or performance of this software. #include #include -#ifdef CSI_BUILD_RTOS +#ifdef SHL_BUILD_RTOS #define SBGULP 0x800000 #else #define SBGULP 0x8000000 #endif -typedef struct csi_atat_mem { - struct csi_atat_mem *next; +typedef struct shl_atat_mem { + struct shl_atat_mem *next; size_t len; -} csi_atat_mem; +} shl_atat_mem; -#define MINBLK (2 * sizeof(struct csi_atat_mem) + 16) +#define MINBLK (2 * sizeof(struct shl_atat_mem) + 16) -csi_atat_mem *F; +shl_atat_mem *F; static char *sbrk_wrapper(int size) { -#ifdef CSI_BUILD_RTOS +#ifdef SHL_BUILD_RTOS return (char *)0x60000000; #else return sbrk(size); #endif } -void *csi_atat_malloc(register size_t size) +void *shl_atat_malloc(register size_t size) { - register csi_atat_mem *p, *q, *r, *s; + register shl_atat_mem *p, *q, *r, *s; unsigned register k, m; // extern void *sbrk(Int); char *top, *top1; size = (size + 7) & ~7; - r = (csi_atat_mem *)&F; + r = (shl_atat_mem *)&F; for (p = F, q = 0; p; r = p, p = p->next) { if ((k = p->len) >= size && (!q || m > k)) { m = k; @@ -68,9 +68,9 @@ void *csi_atat_malloc(register size_t size) } if (q) { if (q->len - size >= MINBLK) { /* split block */ - p = (csi_atat_mem *)(((char *)(q + 1)) + size); + p = (shl_atat_mem 
*)(((char *)(q + 1)) + size); p->next = q->next; - p->len = q->len - size - sizeof(csi_atat_mem); + p->len = q->len - size - sizeof(shl_atat_mem); s->next = p; q->len = size; } else { @@ -82,14 +82,14 @@ void *csi_atat_malloc(register size_t size) q = F; F = F->next; } else { - q = (csi_atat_mem *)top; + q = (shl_atat_mem *)top; } top1 = (char *)(q + 1) + size; if (sbrk_wrapper((int)(top1 - top + SBGULP)) == (void *)-1) { return 0; } - r = (csi_atat_mem *)top1; - r->len = SBGULP - sizeof(csi_atat_mem); + r = (shl_atat_mem *)top1; + r->len = SBGULP - sizeof(shl_atat_mem); r->next = F; F = r; q->len = size; @@ -97,22 +97,22 @@ void *csi_atat_malloc(register size_t size) return (void *)(q + 1); } -void csi_atat_free(void *f) +void shl_atat_free(void *f) { - csi_atat_mem *p, *q, *r; + shl_atat_mem *p, *q, *r; char *pn, *qn; if (!f) return; - q = (csi_atat_mem *)((char *)f - sizeof(csi_atat_mem)); + q = (shl_atat_mem *)((char *)f - sizeof(shl_atat_mem)); qn = (char *)f + q->len; - for (p = F, r = (csi_atat_mem *)&F;; r = p, p = p->next) { + for (p = F, r = (shl_atat_mem *)&F;; r = p, p = p->next) { if (qn == (void *)p) { - q->len += p->len + sizeof(csi_atat_mem); + q->len += p->len + sizeof(shl_atat_mem); p = p->next; } pn = p ? ((char *)(p + 1)) + p->len : 0; if (pn == (void *)q) { - p->len += sizeof(csi_atat_mem) + q->len; + p->len += sizeof(shl_atat_mem) + q->len; q->len = 0; q->next = p; r->next = p; @@ -126,10 +126,10 @@ void csi_atat_free(void *f) } } -void *csi_atat_calloc(size_t n, size_t m) +void *shl_atat_calloc(size_t n, size_t m) { void *rv; - rv = csi_atat_malloc(n *= m); + rv = shl_atat_malloc(n *= m); if (n && rv) { memset(rv, 0, n); } diff --git a/source/utils/debug.c b/source/utils/debug.c index 2302a10f..1db000ad 100644 --- a/source/utils/debug.c +++ b/source/utils/debug.c @@ -16,1019 +16,891 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include #include -#include "csi_nn.h" -#include "csi_node.h" -int csi_debug_level = CSI_DEBUG_LEVEL_WARNING; +#include "shl_debug.h" -int csi_debug_get_level() -{ - return csi_debug_level; -} +int shl_debug_level = SHL_DEBUG_LEVEL_WARNING; -void csi_debug_set_level(int level) -{ - csi_debug_level = level; -} -#ifdef CSI_DEBUG -void csi_debug_debug(const char *format, ...) +int shl_debug_get_level() { return shl_debug_level; } + +void shl_debug_set_level(int level) { shl_debug_level = level; } +#ifdef SHL_DEBUG +void shl_debug_debug(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_DEBUG) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_DEBUG) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -void csi_debug_info(const char *format, ...) +void shl_debug_info(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_INFO) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_INFO) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -void csi_debug_warning(const char *format, ...) +void shl_debug_warning(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_WARNING) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_WARNING) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -void csi_debug_error(const char *format, ...) +void shl_debug_error(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_ERROR) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_ERROR) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -void csi_debug_fatal(const char *format, ...) 
+void shl_debug_fatal(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_FATAL) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_FATAL) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -static int csi_debug_print_list_int(int32_t *list, int len, char *name) +static int shl_debug_print_list_int(int32_t *list, int len, char *name) { - csi_debug_info("%s", name); + shl_debug_info("%s", name); for (int i = 0; i < len; i++) { if (i == 0) { - csi_debug_info("["); + shl_debug_info("["); } - csi_debug_info("%4d", list[i]); + shl_debug_info("%4d", list[i]); if (i == (len - 1)) { - csi_debug_info("]"); + shl_debug_info("]"); } else { - csi_debug_info(","); + shl_debug_info(","); } } return CSINN_TRUE; } -static int csi_debug_print_list_float(float *list, int len, char *name) +static int shl_debug_print_list_float(float *list, int len, char *name) { - csi_debug_info("%s", name); + shl_debug_info("%s", name); for (int i = 0; i < len; i++) { if (i == 0) { - csi_debug_info("["); + shl_debug_info("["); } - csi_debug_info("%f", list[i]); + shl_debug_info("%f", list[i]); if (i == (len - 1)) { - csi_debug_info("]"); + shl_debug_info("]"); } else { - csi_debug_info(","); + shl_debug_info(","); } } return CSINN_TRUE; } -int csi_debug_print_tensor(struct csi_tensor *t) +int shl_debug_print_tensor(struct csinn_tensor *t) { - csi_debug_info("%s(", t->name); - csi_debug_print_list_int(t->dim, t->dim_count, ""); - csi_debug_info(", "); + shl_debug_info("%s(", t->name); + shl_debug_print_list_int(t->dim, t->dim_count, ""); + shl_debug_info(", "); /* FIX ME : channel quantize for input and output tensor ??? 
*/ if (t->quant_channel != 0) { - csi_debug_info("max=%f, min=%f", t->qinfo->max, t->qinfo->min); + shl_debug_info("max=%f, min=%f,", t->qinfo->max, t->qinfo->min); + shl_debug_info("scale=%f, zp=%d", t->qinfo->scale, t->qinfo->zero_point); } - csi_debug_info("), "); + shl_debug_info("), "); return CSINN_TRUE; } -int csi_debug_print_params_base(struct csi_params_base *base) +int shl_debug_print_params_base(struct csinn_params_base *base) { - csi_debug_info("%s(", base->name); + shl_debug_info("%s(", base->name); if (base->layout == CSINN_LAYOUT_NCHW) { - csi_debug_info("NCHW, "); + shl_debug_info("NCHW, "); } else if (base->layout == CSINN_LAYOUT_NHWC) { - csi_debug_info("NHWC, "); + shl_debug_info("NHWC, "); } /* TODO : params.base.API ? */ return CSINN_TRUE; } -int csi_debug_print_siso_base(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_params_base *base, - const char *name) +int shl_debug_print_siso_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_params_base *base, const char *name) { - csi_debug_info("%s = %s(", output->name, name); - csi_debug_print_tensor(input); - csi_debug_print_params_base(base); + shl_debug_info("%s = %s(", output->name, name); + shl_debug_print_tensor(input); + shl_debug_print_params_base(base); return CSINN_TRUE; } -int csi_debug_print_diso_base(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct csi_params_base *base, +int shl_debug_print_diso_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_params_base *base, const char *name) { - csi_debug_info("%s = %s(", output->name, name); - csi_debug_print_tensor(input0); - csi_debug_print_tensor(input1); - csi_debug_print_params_base(base); + shl_debug_info("%s = %s(", output->name, name); + shl_debug_print_tensor(input0); + shl_debug_print_tensor(input1); + shl_debug_print_params_base(base); return CSINN_TRUE; } -int 
csi_debug_print_sidcso_base(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct csi_params_base *base, - const char *name) +int shl_debug_print_sidcso_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_params_base *base, const char *name) { - csi_debug_info("%s = %s(", output->name, name); - csi_debug_print_tensor(input); - csi_debug_print_tensor(kernel); - csi_debug_print_tensor(bias); - csi_debug_print_params_base(base); + shl_debug_info("%s = %s(", output->name, name); + shl_debug_print_tensor(input); + shl_debug_print_tensor(kernel); + shl_debug_print_tensor(bias); + shl_debug_print_params_base(base); return CSINN_TRUE; } -int csi_siso_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params, - const char *name) +int shl_siso_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_diso_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params, +int shl_diso_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_conv1d_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv1d_params *params, - const char *name) +int 
shl_conv1d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, const char *name) { - csi_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); + shl_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); return CSINN_TRUE; } -int csi_conv2d_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params, - const char *name) +int shl_conv2d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, const char *name) { - csi_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); - csi_debug_info("pad=[%d,%d,%d,%d], stride=[%d,%d], dilation=[%d,%d])", - params->pad_top, params->pad_down, params->pad_left, params->pad_right, - params->stride_height, params->stride_width, - params->dilation_height, params->dilation_width); - csi_debug_info(")\n"); + shl_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); + shl_debug_info("pad=[%d,%d,%d,%d], stride=[%d,%d], dilation=[%d,%d])", params->pad_top, + params->pad_down, params->pad_left, params->pad_right, params->stride_height, + params->stride_width, params->dilation_height, params->dilation_width); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_fullyconnected_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params, - const char *name) +int shl_fullyconnected_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, const char *name) { - csi_debug_print_sidcso_base(input, output, weights, bias, &(params->base), name); - 
csi_debug_info("units=%d", params->units); - csi_debug_info(")\n"); + shl_debug_print_sidcso_base(input, output, weights, bias, &(params->base), name); + shl_debug_info("units=%d", params->units); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_layer_norm_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct layer_norm_params *params, - const char *name) +int shl_layer_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_siso_base(input, output, &(params->base), name); return CSINN_TRUE; } -int csi_relu_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params, - const char *name) +int shl_relu_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("clip_min=0.0, clip_max=%f", params->n); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("clip_min=0.0, clip_max=%f", params->n); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_conv3d_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params, - const char *name) +int shl_conv3d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params, const char *name) { - csi_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); - csi_debug_info("pad=[%d,%d,%d,%d,%d,%d], stride=[%d,%d,%d], dilation=[%d,%d,%d]", - params->pad_front, params->pad_back, 
params->pad_top, params->pad_down, params->pad_left, params->pad_right, - params->stride_depth, params->stride_height, params->stride_width, - params->dilation_depth, params->dilation_height, params->dilation_width); - csi_debug_info(")\n"); + shl_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); + shl_debug_info("pad=[%d,%d,%d,%d,%d,%d], stride=[%d,%d,%d], dilation=[%d,%d,%d]", + params->pad_front, params->pad_back, params->pad_top, params->pad_down, + params->pad_left, params->pad_right, params->stride_depth, params->stride_height, + params->stride_width, params->dilation_depth, params->dilation_height, + params->dilation_width); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_arange_debug_info(struct csi_tensor *output, - struct arange_params *params, +int shl_arange_debug_info(struct csinn_tensor *output, struct csinn_arange_params *params, const char *name) { - csi_debug_info("%s = %s()\n", output->name, name); - csi_debug_info("start=%f, stop=%f, step=%f",params->start, params->stop, params->step); - csi_debug_info(")\n"); + shl_debug_info("%s = %s()\n", output->name, name); + shl_debug_info("start=%f, stop=%f, step=%f", params->start, params->stop, params->step); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_pool_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params, - const char *name) +int shl_pool_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("pad=[%d,%d,%d,%d,%d,%d], stride=[%d,%d,%d], filter=[%d,%d,%d]", - params->pad_front, params->pad_back, params->pad_top, params->pad_down, params->pad_left, params->pad_right, - params->stride_depth, params->stride_height, params->stride_width, - params->filter_depth, params->filter_height, params->filter_width); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, 
output, &(params->base), name); + shl_debug_info("pad=[%d,%d,%d,%d,%d,%d], stride=[%d,%d,%d], filter=[%d,%d,%d]", + params->pad_front, params->pad_back, params->pad_top, params->pad_down, + params->pad_left, params->pad_right, params->stride_depth, params->stride_height, + params->stride_width, params->filter_depth, params->filter_height, + params->filter_width); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_pad_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params, - const char *name) +int shl_pad_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("pad_value=%f, pad_mode=%d, ", params->pad_value, params->pad_mode); - csi_debug_print_list_int(params->pad_before, params->pad_num, "pad_before="); - csi_debug_info(", "); - csi_debug_print_list_int(params->pad_after, params->pad_num, "pad_after="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("pad_value=%f, pad_mode=%d, ", params->pad_value, params->pad_mode); + shl_debug_print_list_int(params->pad_before, params->pad_num, "pad_before="); + shl_debug_info(", "); + shl_debug_print_list_int(params->pad_after, params->pad_num, "pad_after="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_crop_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct crop_params *params, - const char *name) +int shl_crop_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d, ", params->axis); - csi_debug_print_list_int(params->offset, input->dim_count - params->axis, "offset="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + 
shl_debug_info("axis=%d, ", params->axis); + shl_debug_print_list_int(params->offset, input->dim_count - params->axis, "offset="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_roi_pool_debug_info(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params, +int shl_roi_pool_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params, const char *name) { - csi_debug_print_siso_base(data, output, &(params->base), name); - csi_debug_info("pooled_h=%d, pooled_w=%d, spatial_scale=%f", - params->pooled_size_h, params->pooled_size_w, params->spatial_scale); - csi_debug_info(")\n"); + shl_debug_print_siso_base(data, output, &(params->base), name); + shl_debug_info("pooled_h=%d, pooled_w=%d, spatial_scale=%f", params->pooled_size_h, + params->pooled_size_w, params->spatial_scale); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_bn_debug_info(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params, - const char *name) +int shl_bn_debug_info(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("epsilon=%f", params->epsilon); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("epsilon=%f", params->epsilon); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_batch_to_space_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params, - const char *name) +int shl_batch_to_space_debug_info(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_batch_to_space_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("block_size=%d, crop=[%d,%d,%d,%d]", params->block_size, - params->crop_top, params->crop_bottom, params->crop_left, params->crop_right); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("block_size=%d, crop=[%d,%d,%d,%d]", params->block_size, params->crop_top, + params->crop_bottom, params->crop_left, params->crop_right); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_batch_to_space_nd_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_nd_params *params, +int shl_batch_to_space_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->block_shape, params->spatial_dim_cnt, "block_shape="); - csi_debug_print_list_int(params->crops, 2 * params->spatial_dim_cnt, "crops="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->block_shape, params->spatial_dim_cnt, "block_shape="); + shl_debug_print_list_int(params->crops, 2 * params->spatial_dim_cnt, "crops="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_depth_to_space_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params, - const char *name) +int shl_depth_to_space_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("block_size=%d\n", params->block_size); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("block_size=%d\n", 
params->block_size); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_space_to_depth_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params, - const char *name) +int shl_space_to_depth_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("block_size=%d", params->block_size); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("block_size=%d", params->block_size); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_space_to_batch_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params, - const char *name) +int shl_space_to_batch_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("block_size=%d, pad=[%d,%d,%d,%d]", params->block_size, - params->pad_top, params->pad_bottom, params->pad_left, params->pad_right); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("block_size=%d, pad=[%d,%d,%d,%d]", params->block_size, params->pad_top, + params->pad_bottom, params->pad_left, params->pad_right); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_space_to_batch_nd_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_nd_params *params, +int shl_space_to_batch_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->block_shape, params->spatial_dim_cnt, "block_shape="); - 
csi_debug_print_list_int(params->paddings, 2 * params->spatial_dim_cnt, "paddings="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->block_shape, params->spatial_dim_cnt, "block_shape="); + shl_debug_print_list_int(params->paddings, 2 * params->spatial_dim_cnt, "paddings="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_broadcast_to_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params, - const char *name) +int shl_broadcast_to_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->shape, params->shape_count, "shape="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->shape, params->shape_count, "shape="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_reduce_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params, - const char *name) +int shl_reduce_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("keepdim=%d, ", params->keepdims); - csi_debug_print_list_int(params->axis, params->axis_count, "axis="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("keepdim=%d, ", params->keepdims); + shl_debug_print_list_int(params->axis, params->axis_count, "axis="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_cache_matmul_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params, const char *name) +int 
shl_cache_matmul_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_siso_base(input, output, &(params->base), name); return CSINN_TRUE; } -int csi_cache_conv1d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params, const char *name) +int shl_cache_conv1d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_siso_base(input, output, &(params->base), name); return CSINN_TRUE; } -int csi_clip_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params, - const char *name) +int shl_clip_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("min_value=%f, max_value=%f", params->min_value, params->max_value); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("min_value=%f, max_value=%f", params->min_value, params->max_value); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_col2im_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct col2im_params *params, - const char *name) +int shl_col2im_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_col2im_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("pad_h=%d, pad_w=%d, stride_h=%d, stride_w=%d", - params->pad_h, 
params->pad_w, params->stride_h, params->stride_w); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("pad_h=%d, pad_w=%d, stride_h=%d, stride_w=%d", params->pad_h, params->pad_w, + params->stride_h, params->stride_w); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_concat_debug_info(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params, - const char *name) +int shl_concat_debug_info(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, const char *name) { - csi_debug_info("%s = %s(", output->name, name); + shl_debug_info("%s = %s(", output->name, name); for (int i = 0; i < params->inputs_count; i++) { - csi_debug_print_tensor(input[i]); + shl_debug_print_tensor(input[i]); } - csi_debug_print_params_base(&(params->base)); - csi_debug_info("input_count=%d, axis=%d", params->inputs_count, params->axis); - csi_debug_info(")\n"); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("input_count=%d, axis=%d", params->inputs_count, params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_cumprod_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params, - const char *name) +int shl_cumprod_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d, exclusive=%d", params->axis, params->exclusive); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d, exclusive=%d", params->axis, params->exclusive); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_cumsum_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params, - const char *name) +int shl_cumsum_debug_info(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_cumsum_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d, exclusive=%d", params->axis, params->exclusive); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d, exclusive=%d", params->axis, params->exclusive); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_expand_dims_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params, - const char *name) +int shl_expand_dims_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d", params->axis); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d", params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_flatten_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params, - const char *name) +int shl_flatten_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_fsmn_debug_info(struct csi_tensor *frame, - struct csi_tensor *l_filter, - struct csi_tensor *r_filter, - struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, - struct csi_tensor *output, - struct fsmn_params *params, - const char *name) +int shl_fsmn_debug_info(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct 
csinn_fsmn_params *params, const char *name) { - csi_debug_info("%s = %s(", output->name, name); - csi_debug_print_tensor(frame); - csi_debug_print_tensor(l_filter); - csi_debug_print_tensor(r_filter); - csi_debug_print_tensor(frame_sequence); - csi_debug_print_tensor(frame_counter); - csi_debug_print_params_base(&(params->base)); - csi_debug_info("l_order=%d, r_order=%d, l_stride=%d, r_stride=%d, unavailable_frames=%d)", - params->l_order, params->r_order, params->l_stride, params->r_stride, - params->unavailable_frames); - csi_debug_info(")\n"); + shl_debug_info("%s = %s(", output->name, name); + shl_debug_print_tensor(frame); + shl_debug_print_tensor(l_filter); + shl_debug_print_tensor(r_filter); + shl_debug_print_tensor(frame_sequence); + shl_debug_print_tensor(frame_counter); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("l_order=%d, r_order=%d, l_stride=%d, r_stride=%d, unavailable_frames=%d)", + params->l_order, params->r_order, params->l_stride, params->r_stride, + params->unavailable_frames); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_gather_nd_debug_info(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params, +int shl_gather_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params, const char *name) { - csi_debug_print_diso_base(input, indices, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input, indices, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_gather_debug_info(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params, +int shl_gather_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params, const char *name) { - csi_debug_print_diso_base(input, indices, 
output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input, indices, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_hard_sigmoid_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params, - const char *name) +int shl_hard_sigmoid_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_im2col_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct im2col_params *params, - const char *name) +int shl_im2col_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("pad=[%d,%d,%d,%d], stride=[%d,%d], kernel_size=[%d,%d]", - params->pad_top, params->pad_down, params->pad_left, params->pad_right, - params->stride_h, params->stride_w, params->kernel_h, params->kernel_w); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("pad=[%d,%d,%d,%d], stride=[%d,%d], kernel_size=[%d,%d]", params->pad_top, + params->pad_down, params->pad_left, params->pad_right, params->stride_h, + params->stride_w, params->kernel_h, params->kernel_w); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_l2n_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params, - const char *name) +int shl_l2n_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("spsilon=%f", params->epsilon); - 
csi_debug_print_list_int(params->axis, params->n, "axis="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("spsilon=%f", params->epsilon); + shl_debug_print_list_int(params->axis, params->n, "axis="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_softmax_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params, - const char *name) +int shl_softmax_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d", params->axis); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d", params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_lrn_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct lrn_params *params, - const char *name) +int shl_lrn_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("range=%d, bias=%f, alpha=%f, beta=%f", params->range, params->bias, params->alpha, params->beta); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("range=%d, bias=%f, alpha=%f, beta=%f", params->range, params->bias, + params->alpha, params->beta); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_matmul_debug_info(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params, +int shl_matmul_debug_info(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, const char *name) { - csi_debug_print_diso_base(mat0, mat1, output, &(params->base), name); - csi_debug_info("trans_a=%d, 
trans_b=%d", params->trans_a, params->trans_b); - csi_debug_info(")\n"); + shl_debug_print_diso_base(mat0, mat1, output, &(params->base), name); + shl_debug_info("trans_a=%d, trans_b=%d", params->trans_a, params->trans_b); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_ndarray_size_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params, - const char *name) +int shl_ndarray_size_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_nms_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params, +int shl_nms_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_non_max_suppression_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("max_output_size=%d, iou_threshold=%f", params->max_output_size, params->iou_threshold); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("max_output_size=%d, iou_threshold=%f", params->max_output_size, + params->iou_threshold); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_one_hot_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct one_hot_params *params, - const char *name) +int shl_one_hot_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("on_value=%f, off_value=%f, depth=%d, axis=%d", params->f_on_value, params->f_off_value, 
params->depth, params->axis); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("on_value=%f, off_value=%f, depth=%d, axis=%d", params->f_on_value, + params->f_off_value, params->depth, params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_prelu_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct prelu_params *params, +int shl_prelu_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("axis=%d", params->axis); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("axis=%d", params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_proposal_debug_info(struct csi_tensor *cls_prob, - struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, - struct csi_tensor *output, - struct proposal_params *params, - const char *name) +int shl_proposal_debug_info(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params, const char *name) { - csi_debug_print_siso_base(cls_prob, output, &(params->base), name); - csi_debug_print_list_float(params->scales, params->scales_num, "scales="); - csi_debug_info(", "); - csi_debug_print_list_float(params->ratios, params->ratios_num, "ratios="); - csi_debug_info(", feature_stride=%d, threshold=%f, rpn_pre_nms_top_n=%d, rpn_post_nms_top_n=%d, rpn_min_size=%d, iou_loss=%d", - params->feature_stride, params->threshold, params->rpn_pre_nms_top_n, params->rpn_post_nms_top_n, params->rpn_min_size, params->iou_loss); - csi_debug_info(")\n"); + shl_debug_print_siso_base(cls_prob, output, &(params->base), name); + 
shl_debug_print_list_float(params->scales, params->scales_num, "scales="); + shl_debug_info(", "); + shl_debug_print_list_float(params->ratios, params->ratios_num, "ratios="); + shl_debug_info( + ", feature_stride=%d, threshold=%f, rpn_pre_nms_top_n=%d, rpn_post_nms_top_n=%d, " + "rpn_min_size=%d, iou_loss=%d", + params->feature_stride, params->threshold, params->rpn_pre_nms_top_n, + params->rpn_post_nms_top_n, params->rpn_min_size, params->iou_loss); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_psroipooling_debug_info(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params, - const char *name) +int shl_psroipooling_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, + struct csinn_psroipooling_params *params, const char *name) { - csi_debug_print_siso_base(data, output, &(params->base), name); - csi_debug_info("output_dim=%d, group_size=%d, spatial_scale=%f", - params->output_dim, params->group_size, params->spatial_scale); - csi_debug_info(")\n"); + shl_debug_print_siso_base(data, output, &(params->base), name); + shl_debug_info("output_dim=%d, group_size=%d, spatial_scale=%f", params->output_dim, + params->group_size, params->spatial_scale); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_reorg_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct reorg_params *params, - const char *name) +int shl_reorg_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("stride=%d", params->stride); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("stride=%d", params->stride); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_reshape_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct 
reshape_params *params, - const char *name) +int shl_reshape_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->shape, params->shape_num, "shape="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->shape, params->shape_num, "shape="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_resize_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params, - const char *name) +int shl_resize_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("resize_mode=%d, align_corners=%d", params->resize_mode, params->align_corners); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("resize_mode=%d, align_corners=%d", params->resize_mode, params->align_corners); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_reverse_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct reverse_params *params, - const char *name) +int shl_reverse_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d", params->axis); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d", params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_roi_align_debug_info(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_align_params *params, +int shl_roi_align_debug_info(struct csinn_tensor *data, struct 
csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params, const char *name) { - csi_debug_print_siso_base(data, output, &(params->base), name); - csi_debug_info("pooled_h=%d, pool_w=%d, spatial_scale=%f, sample_ratio=%d", - params->pooled_size_h, params->pooled_size_w, params->spatial_scale, params->sample_ratio); - csi_debug_info(")\n"); + shl_debug_print_siso_base(data, output, &(params->base), name); + shl_debug_info("pooled_h=%d, pool_w=%d, spatial_scale=%f, sample_ratio=%d", + params->pooled_size_h, params->pooled_size_w, params->spatial_scale, + params->sample_ratio); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_scatter_nd_debug_info(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *updates, - struct csi_tensor *output, - struct scatter_nd_params *params, - const char *name) +int shl_scatter_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_segment_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params, +int shl_segment_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("segment_nums=%d, unsorted=%d", params->num_segments, params->unsorted); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("segment_nums=%d, unsorted=%d", params->num_segments, params->unsorted); + shl_debug_info(")\n"); return CSINN_TRUE; 
} -int csi_select_debug_info(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params, - const char *name) +int shl_select_debug_info(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_sequence_mask_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct sequence_mask_params *params, - const char *name) +int shl_sequence_mask_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_sequence_mask_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("mask_value=%f, axis=%d", params->mask_value, params->axis); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("mask_value=%f, axis=%d", params->mask_value, params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_shape_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params, - const char *name) +int shl_shape_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_shuffle_channel_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct shuffle_channel_params *params, - const char 
*name) +int shl_shuffle_channel_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("group=%d", params->group); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("group=%d", params->group); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_sigmoid_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params, - const char *name) +int shl_sigmoid_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_slice_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params, - const char *name) +int shl_slice_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->begin, params->slice_num, "begin="); - csi_debug_info(", "); - csi_debug_print_list_int(params->end, params->slice_num, "end="); - csi_debug_info(", "); - csi_debug_print_list_int(params->strides, params->slice_num, "strides="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->begin, params->slice_num, "begin="); + shl_debug_info(", "); + shl_debug_print_list_int(params->end, params->slice_num, "end="); + shl_debug_info(", "); + shl_debug_print_list_int(params->strides, params->slice_num, "strides="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_split_debug_info(struct csi_tensor 
*input, - struct csi_tensor **output, - struct split_params *params, - const char *name) +int shl_split_debug_info(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, const char *name) { - csi_debug_info("%s-%s = %s(", output[0]->name, output[params->output_num - 1]->name, name); - csi_debug_print_tensor(input); - csi_debug_print_params_base(&(params->base)); - csi_debug_info("axis=%d, ", params->axis); - csi_debug_print_list_int(params->split_index, params->output_num, "split_index="); - csi_debug_info(")\n"); + shl_debug_info("%s-%s = %s(", output[0]->name, output[params->output_num - 1]->name, name); + shl_debug_print_tensor(input); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("axis=%d, ", params->axis); + shl_debug_print_list_int(params->split_index, params->output_num, "split_index="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_squeeze_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params *params, - const char *name) +int shl_squeeze_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->axis, params->axis_num, "axis="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->axis, params->axis_num, "axis="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_stack_debug_info(struct csi_tensor **input, - struct csi_tensor *output, - struct stack_params *params, - const char *name) +int shl_stack_debug_info(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params, const char *name) { - csi_debug_info("%s = %s(", output->name, name); + shl_debug_info("%s = %s(", output->name, name); for (int i = 0; i < params->inputs_count; i++) { - csi_debug_print_tensor(input[i]); + 
shl_debug_print_tensor(input[i]); } - csi_debug_print_params_base(&(params->base)); - csi_debug_info("input_count=%d, axis=%d", params->inputs_count, params->axis); - csi_debug_info(")\n"); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("input_count=%d, axis=%d", params->inputs_count, params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_strided_slice_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params, - const char *name) +int shl_strided_slice_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->begin, params->slice_count, "begin="); - csi_debug_info(", "); - csi_debug_print_list_int(params->end, params->slice_count, "end="); - csi_debug_info(", "); - csi_debug_print_list_int(params->stride, params->slice_count, "stride="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->begin, params->slice_count, "begin="); + shl_debug_info(", "); + shl_debug_print_list_int(params->end, params->slice_count, "end="); + shl_debug_info(", "); + shl_debug_print_list_int(params->stride, params->slice_count, "stride="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_tile_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params, - const char *name) +int shl_tile_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->reps, params->reps_num, "reps="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->reps, params->reps_num, "reps="); + shl_debug_info(")\n"); 
return CSINN_TRUE; } -int csi_topk_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct topk_params *params, +int shl_topk_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_topk_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("k=%d", params->k); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("k=%d", params->k); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_transpose_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params, - const char *name) +int shl_transpose_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->permute, params->permute_num, "permute="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->permute, params->permute_num, "permute="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_unpooling_debug_info(struct csi_tensor *input, - struct csi_tensor *mask, - struct csi_tensor *output, - struct unpooling_params *params, +int shl_unpooling_debug_info(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("scale_h=%d, scale_w=%d, pad_out_h=%d, pad_out_w=%d", - params->scale_height, params->scale_width, params->pad_out_height , params->pad_out_width); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("scale_h=%d, scale_w=%d, pad_out_h=%d, pad_out_w=%d", 
params->scale_height, + params->scale_width, params->pad_out_height, params->pad_out_width); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_unstack_debug_info(struct csi_tensor *input, - struct csi_tensor **output, - struct unstack_params *params, - const char *name) +int shl_unstack_debug_info(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params, const char *name) { - csi_debug_info("%s-%s = %s(", output[0]->name, output[params->outputs_count - 1]->name, name); - csi_debug_print_tensor(input); - csi_debug_print_params_base(&(params->base)); - csi_debug_info("outputs_count=%d, axis=%d", params->outputs_count, params->axis); + shl_debug_info("%s-%s = %s(", output[0]->name, output[params->outputs_count - 1]->name, name); + shl_debug_print_tensor(input); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("outputs_count=%d, axis=%d", params->outputs_count, params->axis); return CSINN_TRUE; } -int csi_where_debug_info(struct csi_tensor *condition, - struct csi_tensor *x, - struct csi_tensor *y, - struct csi_tensor *output, - struct where_params *params, - const char *name) +int shl_where_debug_info(struct csinn_tensor *condition, struct csinn_tensor *x, + struct csinn_tensor *y, struct csinn_tensor *output, + struct csinn_where_params *params, const char *name) { - csi_debug_print_siso_base(x, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(x, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_debug_callback_unset(char *func_name) +int shl_debug_callback_unset(char *func_name) { - csi_debug_info("callback function unset: %s\n", func_name); + shl_debug_info("callback function unset: %s\n", func_name); return CSINN_CALLBACK_UNSET; } -int csi_debug_dump_data(struct csi_tensor *input, char *filename) +int shl_debug_dump_data(struct csinn_tensor *input, char *filename) { float *data = input->data; - int size = csi_tensor_size(input); + int 
size = csinn_tensor_size(input); int i = 0; FILE *fp = fopen(filename, "w+"); for (i = 0; i < size; i++) { @@ -1046,10 +918,12 @@ int csi_debug_dump_data(struct csi_tensor *input, char *filename) char *op_strings[] = { [CSINN_OP_ABS] = "abs", [CSINN_OP_ADD] = "add", + [CSINN_OP_MUL] = "mul", [CSINN_OP_AVGPOOL2D] = "avgpool2d", [CSINN_OP_CONCAT] = "concat", [CSINN_OP_CONV2D] = "conv2d", [CSINN_OP_CONV2D_RELU] = "conv2d_relu", + [CSINN_OP_DATA_CONVERT] = "data_convert", [CSINN_OP_DEPTHWISE_CONV2D] = "dwconv2d", [CSINN_OP_DEPTHWISE_CONV2D_RELU] = "dwconv2d_relu", [CSINN_OP_FULLYCONNECTED] = "fullyconnected", @@ -1057,40 +931,42 @@ char *op_strings[] = { [CSINN_OP_LEAKY_RELU] = "leaky_relu", [CSINN_OP_MAXPOOL2D] = "maxpool2d", [CSINN_OP_RELU] = "relu", + [CSINN_OP_RELU6] = "relu6", [CSINN_OP_RESHAPE] = "reshape", + [CSINN_OP_TRANSPOSE] = "transpose", [CSINN_OP_SOFTMAX] = "softmax", [CSINN_OP_YUV_RGB_SCALE] = "yuv_rgb_scale", }; -#define FREQ 30 // FPGA: 30MHz +#define FREQ 50 // FPGA: 30MHz // TODO: support NHWC layout too -int csi_benchmark_layer(struct csi_node *node, uint64_t start_time, uint64_t end_time, +int shl_benchmark_layer(struct shl_node *node, uint64_t start_time, uint64_t end_time, int layer_idx) { char *op_name = op_strings[node->type]; - csi_debug_info("[%3d]: %-18s %6.2lfms ^*^ feature_map:", layer_idx, op_name, - (end_time - start_time) / 1000000.0f); + shl_debug_info("[%3d]: %-18s %6.2lfms ^*^ feature_map:", layer_idx, op_name, + (end_time - start_time) * FREQ / 1000.0f / 1000000.0f); - struct csi_tensor *in0 = (struct csi_tensor *)node->in[0]->data; - struct csi_tensor *out0 = (struct csi_tensor *)node->out[0]->data; + struct csinn_tensor *in0 = (struct csinn_tensor *)node->in[0]->data; + struct csinn_tensor *out0 = (struct csinn_tensor *)node->out[0]->data; // print first input node and first output node dim - csi_debug_print_list_int(in0->dim, in0->dim_count, ""); - csi_debug_info(" ==> "); - csi_debug_print_list_int(out0->dim, out0->dim_count, 
""); + shl_debug_print_list_int(in0->dim, in0->dim_count, ""); + shl_debug_info(" ==> "); + shl_debug_print_list_int(out0->dim, out0->dim_count, ""); // print kernel dim if (node->type >= CSINN_OP_CONV1D && node->type <= CSINN_OP_CONV3D) { - struct csi_tensor *in1 = (struct csi_tensor *)node->in[1]->data; + struct csinn_tensor *in1 = (struct csinn_tensor *)node->in[1]->data; int64_t cacls = out0->dim[1] * out0->dim[2] * out0->dim[3] * in0->dim[1] * in1->dim[2] * in1->dim[3] * 2; if (node->type >= CSINN_OP_DEPTHWISE_CONV2D && node->type <= CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6) { cacls = out0->dim[1] * out0->dim[2] * out0->dim[3] * in1->dim[2] * in1->dim[3] * 2; } - csi_debug_info(" (%2.4lfGOPS)", cacls / ((end_time - start_time) * 30 / 1000.0f)); - csi_debug_info(" kernel:"); - csi_debug_print_list_int(in1->dim, in1->dim_count, ""); + shl_debug_info(" (%2.4lfGOPS)", cacls / ((end_time - start_time) * (FREQ) / 1000.0f)); + shl_debug_info(" kernel:"); + shl_debug_print_list_int(in1->dim, in1->dim_count, ""); } - csi_debug_info("\n"); + shl_debug_info("\n"); return CSINN_TRUE; } diff --git a/source/utils/memory.c b/source/utils/memory.c index dc7870c2..14dce3d0 100644 --- a/source/utils/memory.c +++ b/source/utils/memory.c @@ -16,58 +16,58 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include #include "csi_nn.h" -// #define CSI_MEM_DEBUG -// #define CSI_MEM_DEBUG_VALID_WRITE -// #define CSI_USE_ATAT_MALLOC -struct csi_mem_alloc_debug_element_ { +// #define SHL_MEM_DEBUG +// #define SHL_MEM_DEBUG_VALID_WRITE +// #define SHL_USE_ATAT_MALLOC +struct shl_mem_alloc_debug_element_ { void *ptr; int64_t size; int is_free; }; -struct csi_mem_alloc_debug_map_ { - struct csi_mem_alloc_debug_element_ *element; +struct shl_mem_alloc_debug_map_ { + struct shl_mem_alloc_debug_element_ *element; int element_number; int index; int64_t total_size; }; -static struct csi_mem_alloc_debug_map_ csi_mem_alloc_debug_map; +static struct shl_mem_alloc_debug_map_ shl_mem_alloc_debug_map; -void csi_mem_print_map() +void shl_mem_print_map() { - printf("total size = %ld\n", csi_mem_alloc_debug_map.total_size); - for (int i = 0; i <= csi_mem_alloc_debug_map.index; i++) { - struct csi_mem_alloc_debug_element_ *e = csi_mem_alloc_debug_map.element + i; + printf("total size = %ld\n", shl_mem_alloc_debug_map.total_size); + for (int i = 0; i <= shl_mem_alloc_debug_map.index; i++) { + struct shl_mem_alloc_debug_element_ *e = shl_mem_alloc_debug_map.element + i; printf("element %d: ptr = %p, size = %ld, is_free = %d\n", i, e->ptr, e->size, e->is_free); } } -static int csi_mem_map_insert(void *ptr, uint64_t size) +static int shl_mem_map_insert(void *ptr, uint64_t size) { - int element_number = csi_mem_alloc_debug_map.element_number; - int index = csi_mem_alloc_debug_map.index; + int element_number = shl_mem_alloc_debug_map.element_number; + int index = shl_mem_alloc_debug_map.index; if (element_number == 0 || index == element_number - 1) { - csi_mem_alloc_debug_map.element_number += 512; - csi_mem_alloc_debug_map.element = realloc(csi_mem_alloc_debug_map.element, - csi_mem_alloc_debug_map.element_number * - sizeof(struct csi_mem_alloc_debug_element_)); + shl_mem_alloc_debug_map.element_number += 512; + 
shl_mem_alloc_debug_map.element = realloc( + shl_mem_alloc_debug_map.element, + shl_mem_alloc_debug_map.element_number * sizeof(struct shl_mem_alloc_debug_element_)); } - csi_mem_alloc_debug_map.element[index].ptr = ptr; - csi_mem_alloc_debug_map.element[index].size = size; - csi_mem_alloc_debug_map.element[index].is_free = 0; - csi_mem_alloc_debug_map.index++; + shl_mem_alloc_debug_map.element[index].ptr = ptr; + shl_mem_alloc_debug_map.element[index].size = size; + shl_mem_alloc_debug_map.element[index].is_free = 0; + shl_mem_alloc_debug_map.index++; } -void *csi_mem_alloc(int64_t size) +void *shl_mem_alloc(int64_t size) { void *ret; -#ifdef CSI_MEM_DEBUG_VALID_WRITE +#ifdef SHL_MEM_DEBUG_VALID_WRITE ret = calloc(1, size + 8); int8_t *check_ptr = ret + size; /* magic number */ @@ -80,67 +80,72 @@ void *csi_mem_alloc(int64_t size) check_ptr[6] = 0x67; check_ptr[7] = 0xff; #else -#ifdef CSI_USE_ATAT_MALLOC - void *csi_atat_calloc(size_t n, size_t m); - ret = csi_atat_calloc(1, size); +#ifdef SHL_USE_ATAT_MALLOC + void *shl_atat_calloc(size_t n, size_t m); + ret = shl_atat_calloc(1, size); #else ret = calloc(1, size); #endif #endif if (ret == NULL) { - csi_debug_error("cannot alloc memory\n"); + shl_debug_error("cannot alloc memory\n"); } -#ifdef CSI_MEM_DEBUG - csi_mem_map_insert(ret, size); - csi_mem_alloc_debug_map.total_size += size; - printf("csi_mem_alloc: total size = %ld\n", csi_mem_alloc_debug_map.total_size); +#ifdef SHL_MEM_DEBUG + shl_mem_map_insert(ret, size); + shl_mem_alloc_debug_map.total_size += size; + printf("shl_mem_alloc: total size = %ld\n", shl_mem_alloc_debug_map.total_size); #endif return ret; } -void *csi_mem_calloc(size_t nmemb, size_t size) { return csi_mem_alloc(nmemb * size); } +void *shl_mem_calloc(size_t nmemb, size_t size) { return shl_mem_alloc(nmemb * size); } -void *csi_mem_realloc(void *ptr, size_t size) +void *shl_mem_realloc(void *ptr, size_t size) { - void *ret = csi_mem_alloc(size); + void *ret = shl_mem_alloc(size); if 
(!ptr) { return ret; } memcpy(ret, ptr, size); - csi_mem_free(ptr); + shl_mem_free(ptr); return ret; } -void *csi_mem_alloc_aligned(int64_t size, int aligned_bytes) +void *shl_mem_alloc_aligned(int64_t size, int aligned_bytes) { void *ptr = NULL; -#ifndef CSI_BUILD_RTOS +#ifdef SHL_BUILD_RTOS + size_t real_size = size + aligned_bytes; + void *tptr = shl_mem_alloc(real_size); + int mask = ~(aligned_bytes - 1); + int addr = ((int)tptr + aligned_bytes) & mask; + ptr = (void *)addr; +#else if (aligned_bytes == 0) { aligned_bytes = getpagesize(); } int ret = posix_memalign(&ptr, aligned_bytes, size); - if (ret || ptr == NULL) - csi_debug_error("cannot alloc aligned memory\n"); + if (ret || ptr == NULL) shl_debug_error("cannot alloc aligned memory\n"); #endif return ptr; } -void csi_mem_free(void *ptr) +void shl_mem_free(void *ptr) { -#ifdef CSI_MEM_DEBUG - for (int i = 0; i < csi_mem_alloc_debug_map.index; i++) { - struct csi_mem_alloc_debug_element_ *e = csi_mem_alloc_debug_map.element + i; +#ifdef SHL_MEM_DEBUG + for (int i = 0; i < shl_mem_alloc_debug_map.index; i++) { + struct shl_mem_alloc_debug_element_ *e = shl_mem_alloc_debug_map.element + i; if (e->ptr == ptr && e->is_free == 0) { e->is_free = 1; - csi_mem_alloc_debug_map.total_size -= e->size; - printf("csi_mem_free: total size = %ld\n", csi_mem_alloc_debug_map.total_size); -#ifdef CSI_MEM_DEBUG_VALID_WRITE + shl_mem_alloc_debug_map.total_size -= e->size; + printf("shl_mem_free: total size = %ld\n", shl_mem_alloc_debug_map.total_size); +#ifdef SHL_MEM_DEBUG_VALID_WRITE uint8_t *cptr = ptr + e->size; if ((cptr[0] == 0xff) && (cptr[1] == 0x23) && (cptr[2] == 0x33) && (cptr[3] == 0x44) && (cptr[4] == 0x45) && (cptr[5] == 0x55) && (cptr[6] == 0x67) && (cptr[7] == 0xff)) { break; } else { - printf("csi_mem_free: invalid write %p\n", ptr); + printf("shl_mem_free: invalid write %p\n", ptr); } #else break; @@ -148,9 +153,9 @@ void csi_mem_free(void *ptr) } } #endif -#ifdef CSI_USE_ATAT_MALLOC - void csi_atat_free(void 
*f); - csi_atat_free(ptr); +#ifdef SHL_USE_ATAT_MALLOC + void shl_atat_free(void *f); + shl_atat_free(ptr); #else free(ptr); #endif diff --git a/tests/Makefile b/tests/Makefile index 36dd9bab..97f8995c 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,6 +1,6 @@ TEST_ROOT := $(shell pwd) -all: test_ref +all: test_ref test_anole test_ref_x86: make -C validation_layer -f Makefile.ref_x86 @@ -14,6 +14,15 @@ test_c860: test_c906: make -C validation_layer -f Makefile.c906 +test_anole: + make -C validation_graph -f Makefile.anole + +test_pnna: + make -C validation_graph -f Makefile.pnna + +test_pnna_x86: + make -C validation_graph -f Makefile.pnna_x86 + test_i805: make -C validation_xt800 -f Makefile.i805 diff --git a/tests/autotest/conftest.py b/tests/autotest/conftest.py index 05c34561..8bf11be9 100644 --- a/tests/autotest/conftest.py +++ b/tests/autotest/conftest.py @@ -20,11 +20,14 @@ def pytest_addoption(parser): parser.addoption( - "--board", action="store", default="c860", help="board option: c860|c906|x86_ref" + "--board", action="store", default="c860", help="board option: c860|c906|c908|anole|x86_ref|c910" ) parser.addoption( "--accuracy", action="store", default="0.99", help="error measures accuracy" ) + parser.addoption( + "--vlen", action="store", default="8", help="8|16|32" + ) @pytest.fixture(scope='module') @@ -32,6 +35,7 @@ def cmdopt(request): config_param = {} config_param["board"] = request.config.getoption("--board") config_param["accuracy"] = request.config.getoption("--accuracy") + config_param["vlen"] = request.config.getoption("--vlen") return dict(config_param) diff --git a/tests/autotest/interface_test.py b/tests/autotest/interface_test.py index 6b94af17..c3298ebd 100644 --- a/tests/autotest/interface_test.py +++ b/tests/autotest/interface_test.py @@ -69,7 +69,8 @@ def run_base( cmd_execute, elf_data, python_data, - test_accuracy + test_accuracy, + python_cmd, ): hhb_cmd = ( f"{cmd_execute} " @@ -81,49 +82,68 @@ def run_base( print(hhb_cmd) 
ret = os.system(hhb_cmd) - assert ret == 0 + pytest.assume(ret == 0, f"{hhb_cmd}\n{python_cmd}") @pytest.fixture(scope='module') def compile_execute(cmdopt): board = cmdopt["board"] accuracy = cmdopt["accuracy"] + vlen = cmdopt["vlen"] if board == "c860": qemu = "qemu-cskyv2 -cpu ck860v" elif board == "c906": qemu = "qemu-riscv64 -cpu c906fdv" elif board == "c910": qemu = "qemu-riscv64 -cpu c910v" + elif board == "c908": + qemu = "qemu-riscv64 -cpu c908v" mkdir(valid_dir) - return qemu, accuracy + return qemu, accuracy, vlen -@conftest.custom_parametrize('elf_data', numberOffile(elf_path, "c")) -def test_inference(cmdopt, elf_data, compile_execute): - elf_data = elf_data.replace(".c", ".o.elf") - if "nchw" or "nhwc" in elf_data: - python_data = "_".join(elf_data.split("/")[-1].split("_")[:-1]) - else: - python_data = "_".join(elf_data.split("/")[-1].split("_")) - os.chdir(valid_dir) - cmd = "python " + python_path + "/" + python_data + ".py" - ret = os.system(cmd) - assert ret == 0 - run_base(compile_execute[0], elf_data, valid_dir + "/" + python_data + "_data_f32.bin", compile_execute[1]) - - -def get_testtype(op_type): - if "averagepool" in op_type or "maxpool" in op_type: - test_type = ["random","2x2s2","2x2s2_p1","3x3s2","3x3s2_p1","3x3s1_p1"] - elif op_type == "convolution": - test_type = ["random","gemm_conv1x1s1","conv3x3s1_im2col_sgemm","conv3x3s1_winograd64","conv3x3s1_winograd64","gemm_random"] +####TODO rm ########### +# def get_testtype(op_type): +# if "averagepool" in op_type or "maxpool" in op_type: +# test_type = ["random","2x2s2","2x2s2_p1","3x3s2","3x3s2_p1","3x3s1_p1"] +# elif op_type == "convolution": +# test_type = ["random","gemm_conv1x1s1","conv3x3s1_im2col_sgemm","conv3x3s1_winograd64","conv3x3s1_winograd64","conv3x3s1_winograd64_pack","gemm_random"] +# elif op_type == "depthwise_convolution": +# test_type = ["random","3x3s1","3x3s2"] +# elif op_type == "group_convolution": +# test_type = ["random", "conv3x3s1d1"] +# elif op_type == "relu": 
+# test_type = ["random", "16x3_8_4_2_1"] +# elif op_type == "add": +# test_type = ["", "vector", "size1", "flag0"] +# else: +# test_type =[] +# return test_type + +import itertools +def get_testvlen(op_type, vlen): + list_dtype = [int(vlen)] + list_vlen = [128, 256, 512] + if op_type == "convolution": + list_type = ["pack1_com", "pack1_gemm", "packnto1", "packnto1_conv1x1s1", "pack1ton", "pack1ton_conv1x1s1", "packn_com", "packn_conv1x1s1", "packn_conv3x3s1", "packn_conv3x3s1_linput"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) + elif op_type == "group_convolution": + list_type = ["pack1ton_conv1x1s1"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) elif op_type == "depthwise_convolution": - test_type = ["random","3x3s1","3x3s2"] + list_type = ["pack1_common", "pack1_conv3x3s2", "pack1_conv3x3s1", "packnto1", "pack1ton", "packn_com", "packn_conv3x3s2", "packn_conv3x3s1"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) + elif op_type == "global_avgpool" or op_type == "global_maxpool": + list_type = ["packn", "pack1"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) + elif op_type == "averagepool" or op_type == "maxpool": + list_type = ["packn_global", "global", "packn_2x2s2", "pack1_2x2s2", "packn_2x2s2p0", "pack1_2x2s2p0", "packn_2x2s2p1", "pack1_2x2s2p1", "packn_3x3s2", "pack1_3x3s2", "packn_3x3s2p0", "pack1_3x3s2p0", "packn_3x3s2p1", "pack1_3x3s2p1", "packn_3x3s1_p1", "pack1_3x3s1_p1"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) else: test_type =[] return test_type - + @pytest.mark.usefixtures("compile_execute") class TestCSINN: @@ -145,18 +165,18 @@ def test_layer(self,elf_data,compile_execute): if "roipool" in data: cmd = f'docker run --rm -v {valid_dir}:mnt tvm_caffe:rfcn sh -c "cd mnt && python3 {path}"' else: - cmd = f"python {path}" + cmd = f"python3 {path}" ret = os.system(cmd) assert ret == 0 if flag == 1: - 
run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_data_f32.bin", compile_execute[1], cmd) else: if "argmax" in data or "argmin" in data: - run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_stride_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_stride_data_f32.bin", compile_execute[1], cmd) else: - run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_nchw_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + - @pytest.mark.parametrize('elf_data', numberOffile(elf_path, "elf")) def test_rvv_layer(self,elf_data,compile_execute): flag = 0 @@ -165,30 +185,69 @@ def test_rvv_layer(self,elf_data,compile_execute): path = os.path.join(python_path, data + "_nchw.py") if not os.path.exists(path): path = os.path.join(python_path, data + ".py") - flag = 1 + flag = 1 if test_type != []: - for i in test_type: - cmd = f"python {path} {i}" + for i in test_type: + cmd = f"python3 {path} {i}" + print(cmd) ret = os.system(cmd) assert ret == 0 if flag == 1: - run_base(compile_execute[0], elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1], cmd) else: - run_base(compile_execute[0], elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1]) - else: - cmd = f"python {path}" + run_base(compile_execute[0], elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + else: + cmd = f"python3 {path}" ret = os.system(cmd) assert ret == 0 if flag == 1: - run_base(compile_execute[0], elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1], cmd) else: - run_base(compile_execute[0], 
elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + + + @pytest.mark.parametrize('elf_data', numberOffile(elf_path, "elf")) + def test_c908_layer(self,elf_data,compile_execute): + flag = 0 + data = elf_data.split("/")[-1].split(".")[0] + test_type = get_testvlen(data, compile_execute[2]) + compile_option = compile_execute[0] + path = os.path.join(python_path, data + "_nchw.py") + if not os.path.exists(path): + path = os.path.join(python_path, data + ".py") + flag = 1 + elif "convolution" in path or "averagepool" in path or "maxpool" in path: + path = os.path.join(python_path, data + "_vlen.py") + if test_type != []: + for i in test_type: + cmd = f"python3 {path} {i[0]} {i[1]} {i[2]}" + print(cmd) + ret = os.system(cmd) + pytest.assume(ret == 0) + if str(i[1]) == "256": + compile_option = "qemu-riscv64 -cpu rv64,x-v=true,vext_spec=v1.0,vlen=256,x-thead=true" + elif str(i[1]) == "512": + compile_option = "qemu-riscv64 -cpu rv64,x-v=true,vext_spec=v1.0,vlen=512,x-thead=true" + + if flag == 1: + run_base(compile_option, elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1], cmd) + else: + run_base(compile_option, elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + else: + cmd = f"python3 {path}" + ret = os.system(cmd) + pytest.assume(ret == 0) + if flag == 1: + run_base(compile_option, elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1], cmd) + else: + run_base(compile_option, elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + @pytest.mark.parametrize('unit_test_elf_data', numberOffile(unit_test_elf_path, "elf")) def test_opt_interface(self, unit_test_elf_data, compile_execute): - run_base(compile_execute[0], unit_test_elf_data, "", compile_execute[1]) + run_base(compile_execute[0], unit_test_elf_data, "", compile_execute[1], "") class TestHeterogeneous: diff --git 
a/tests/python_ref/add.py b/tests/python_ref/add.py index d70fef81..15acd60d 100755 --- a/tests/python_ref/add.py +++ b/tests/python_ref/add.py @@ -37,6 +37,20 @@ def add_f32(): size2 = in_channel src_out = np.add(src_in1, src_in2) + elif(sys.argv[1] == "size1"): + vector = 2 + src_in2 = np.random.normal(zero_point2, std2, 1) + src_in2 = src_in2.astype(np.float32) + size2 = 1 + src_out = np.add(src_in1, src_in2) + + elif(sys.argv[1] == "flag0"): + vector = 3 + src_in2 = np.random.normal(zero_point2, std2, (in_size_y, in_size_x, 1)) + src_in2 = src_in2.astype(np.float32) + size2 = in_size_y * in_size_x + src_out = np.add(src_in1, src_in2) + src_in_1 = src_in1.reshape(size_all) src_in_2 = src_in2.reshape(size2) diff --git a/tests/python_ref/averagepool_nchw.py b/tests/python_ref/averagepool_nchw.py index 4824432a..83d6e141 100644 --- a/tests/python_ref/averagepool_nchw.py +++ b/tests/python_ref/averagepool_nchw.py @@ -40,30 +40,34 @@ def avgpool2d_f32(test_type): stride_h = stride_w = 2 kernel_h = kernel_w = 2 pad_left = pad_top = 0 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 elif test_type == "2x2s2_p1": stride_h = stride_w = 2 kernel_h = kernel_w = 2 pad_left = pad_top = 1 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width elif test_type == "3x3s2": stride_h = stride_w = 2 kernel_h = kernel_w = 3 pad_left = pad_top = 0 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width elif test_type == "3x3s2_p1": stride_h = stride_w = 2 kernel_h = kernel_w = 3 pad_left = pad_top = 1 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = 
int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 elif test_type == "3x3s1_p1": stride_h = stride_w = 1 diff --git a/tests/python_ref/averagepool_vlen.py b/tests/python_ref/averagepool_vlen.py new file mode 100644 index 00000000..2deb8a90 --- /dev/null +++ b/tests/python_ref/averagepool_vlen.py @@ -0,0 +1,200 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import functional as fn +import math + + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def avgpool2d_f32(test_dtype, test_vlen, test_type): + para = [] + # init the input data and parameters + batch = int(np.random.randint(1, high=2, size=1)) + channel = int(np.random.randint(2, high=6, size=1)) + in_height = int(np.random.randint(32, high=64, size=1)) + in_width = int(np.random.randint(32, high=64, size=1)) + stride_h = int(np.random.randint(1, high=4, size=1)) + stride_w = int(np.random.randint(1, high=4, size=1)) + kernel_h = int(np.random.randint(stride_h, high=9, size=1)) + kernel_w = int(np.random.randint(stride_w, high=9, size=1)) + pad_left = int(np.random.randint(0, high=2, size=1)) + pad_right = int(np.random.randint(0, high=2, size=1)) + pad_top = int(np.random.randint(0, high=2, size=1)) + pad_down = int(np.random.randint(0, high=2, size=1)) + c_model = False + + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + + if "2x2s2" in test_type and test_type[-2] != "p": + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 0 + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + if test_type == "packn_2x2s2": + channel = int(n*packn) + elif test_type == "pack1_2x2s2": + channel = int(n*packn) + 1 + + elif "2x2s2p0" in test_type: 
+ stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 0 + pad_down = pad_right = 0 + in_height = 2 * in_height + in_width = 2 * in_width + c_model = True + if test_type == "packn_2x2s2p0": + channel = int(n*packn) + elif test_type == "pack1_2x2s2p0": + channel = int(n*packn) + 1 + + + elif "2x2s2p1" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 1 + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width + if test_type == "packn_2x2s2p1": + channel = int(n*packn) + elif test_type == "pack1_2x2s2p1": + channel = int(n*packn) + 1 + + + elif "3x3s2" in test_type and test_type[-2] != "p": + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 0 + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width + if test_type == "packn_3x3s2": + channel = int(n*packn) + elif test_type == "pack1_3x3s2": + channel = int(n*packn) + 1 + + elif "3x3s2p0" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 0 + pad_down = pad_right = 0 + in_height = 2 * in_height + in_width = 2 * in_width + c_model = False + if test_type == "packn_3x3s2p0": + channel = int(n*packn) + elif test_type == "pack1_3x3s2p0": + channel = int(n*packn) + 1 + + elif "3x3s2p1" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 1 + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + if test_type == "packn_3x3s2p1": + channel = int(n*packn) + elif test_type == "pack1_3x3s2p1": + channel = int(n*packn) + 1 + + elif "3x3s1_p1" in test_type: + stride_h = stride_w = 1 + kernel_h = kernel_w = 3 + pad_left = pad_right = pad_top = pad_down = 1 + if test_type == "packn_3x3s1_p1": + channel = int(n*packn) + elif test_type == "pack1_3x3s1_p1": + channel = int(n*packn) + 1 + + elif "global" in test_type: + if test_type == "packn_global": + channel = int(n*packn) + elif test_type == 
"global": + channel = int(n*packn) + 1 + in_height = kernel_h + in_width = kernel_w + pad_left = pad_right = pad_top = pad_down = 0 + + + include_pad = int(np.random.randint(1, high=2, size=1)) # 0: false 1: true + + + zero_point = int(np.random.randint(-8, high=8, size=1)) + std = int(np.random.randint(1, high=3, size=1)) + + src_in = np.random.normal(zero_point, std, (batch, channel, in_height, in_width)) + + t_src_in = tensor(src_in) + t_src_in1 = fn.pad(t_src_in, (pad_left, pad_right, pad_top, pad_down), 'constant', 0) + + t_src_out = fn.avg_pool2d(t_src_in1, kernel_size=(kernel_h, kernel_w), stride=(stride_h, stride_w), count_include_pad = True if include_pad else False, ceil_mode=c_model).numpy() + + + out_height = np.shape(t_src_out)[2] + out_width = np.shape(t_src_out)[3] + + + # nc1c0hw ==> nc1hwc0 + if "packn" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(channel/packn), packn, in_height, in_width]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(channel/packn), packn, out_height, out_width]).transpose([0, 1, 3, 4, 2]) + + c_model = 1 if c_model else 0 + src_in_1 = t_src_in.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + 16 + + para.append(total_size) + para.append(batch) + para.append(channel) + para.append(in_height) + para.append(in_width) + para.append(stride_h) + para.append(stride_w) + para.append(kernel_h) + para.append(kernel_w) + para.append(pad_left) + para.append(pad_right) + para.append(pad_top) + para.append(pad_down) + para.append(out_height) + para.append(out_width) + para.append(include_pad) + para.append(c_model) + print(para) + + with open("averagepool_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == 
'__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + avgpool2d_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/batch_norm_nchw.py b/tests/python_ref/batch_norm_nchw.py index 878e171b..4b37f220 100644 --- a/tests/python_ref/batch_norm_nchw.py +++ b/tests/python_ref/batch_norm_nchw.py @@ -17,7 +17,7 @@ def batch_norm_f32(): in_size = int(np.random.randint(16, high=32, size=1)) dim.append(in_size) - dim[0] = 1 + dim[0] = 1 # batch = 1 for anole zero_point1 = int(np.random.randint(-6, high=6, size=1)) std1 = int(np.random.randint(1, high=20, size=1)) diff --git a/tests/python_ref/convolution_nchw.py b/tests/python_ref/convolution_nchw.py index 9681e3d1..6d0ed1c9 100644 --- a/tests/python_ref/convolution_nchw.py +++ b/tests/python_ref/convolution_nchw.py @@ -30,18 +30,25 @@ def convolution_f32(test_type): kernel_y = 1 dilation_x = 1 dilation_y = 1 + out_channel = 8 + 4 + 2 + 1 + in_size_x = 7 + in_size_y = 9 - elif test_type == "conv3x3s1_im2col_sgemm" or test_type == "conv3x3s1_winograd64": + elif test_type == "conv3x3s1_im2col_sgemm" or test_type == "conv3x3s1_winograd64" or test_type == "conv3x3s1_winograd64_pack": stride_x = 1 stride_y = 1 kernel_x = 3 kernel_y = 3 dilation_x = 1 dilation_y = 1 - if test_type == "conv3x3s1_winograd64": + if "conv3x3s1_winograd64" in test_type: n = int(np.random.randint(1, high=4, size=1)) in_channel = 8 * n out_channel = 8 * n + if test_type == "conv3x3s1_winograd64_pack": + in_size_x = 20 + in_size_y = 32 + elif test_type == "gemm_random": stride_x = int(np.random.randint(2, high=3, size=1)) diff --git a/tests/python_ref/convolution_vlen.py b/tests/python_ref/convolution_vlen.py new file mode 100644 index 00000000..fd64b6f3 --- /dev/null +++ b/tests/python_ref/convolution_vlen.py @@ -0,0 +1,205 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import functional as fn +import 
math + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def convolution_f32(test_dtype, test_vlen, test_type): + + para = [] + batch = int(np.random.randint(1, high=2, size=1)) + in_size_x = int(np.random.randint(6, high=7, size=1)) #width + in_size_y = int(np.random.randint(6, high=7, size=1)) #height + stride_x = int(np.random.randint(2, high=3, size=1)) + stride_y = int(np.random.randint(2, high=3, size=1)) + kernel_x = int(np.random.randint(stride_x, high=7, size=1)) + kernel_y = int(np.random.randint(stride_y, high=7, size=1)) + dilation_x = int(np.random.randint(1, high=2, size=1)) + dilation_y = int(np.random.randint(1, high=2, size=1)) + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + print(packn) + + if "pack1_" in test_type: + in_channel = packn * n + 1 + out_channel = packn * n + 1 + if test_type == "pack1_conv1x1s1": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + dilation_x = 1 + dilation_y = 1 + elif test_type == "pack1_gemm": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + out_channel = 8 + 4 + 2 + 1 + in_size_x = 7 + in_size_y = 9 + + + elif "packnto1" in test_type: + in_channel = packn * n + out_channel = packn * n + 1 + if test_type == "packnto1_conv1x1s1": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + dilation_x = 1 + dilation_y = 1 + + elif "pack1ton" in test_type: + in_channel = packn * n + 1 + out_channel = packn * n + if test_type == "pack1ton_conv1x1s1": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + dilation_x = 1 + dilation_y = 1 + + elif "packn_" in test_type: + in_channel = packn * n + out_channel = packn * n + if test_type == "packn_conv1x1s1": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + dilation_x = 1 + dilation_y = 1 + elif "packn_conv3x3s1" in test_type: + stride_x = 1 + stride_y = 1 + kernel_x = 3 + 
kernel_y = 3 + dilation_x = 1 + dilation_y = 1 + + if test_type == "packn_conv3x3s1_linput": + in_size_x = int(np.random.randint(13, high=20, size=1)) #width + in_size_y = int(np.random.randint(13, high=20, size=1)) #height + + + + + kernel_x_t = kernel_x + (kernel_x - 1) * (dilation_x - 1) + kernel_y_t = kernel_y + (kernel_y - 1) * (dilation_y - 1) + pad_left = pad_right = pad_top = pad_down = 0 + + pad_x = (in_size_x - kernel_x_t) - int((in_size_x - kernel_x_t) / stride_x) * stride_x + if(pad_x !=0): + pad_x = int((in_size_x - kernel_x_t) / stride_x) * stride_x + stride_x - (in_size_x - kernel_x_t) + pad_left = int(np.random.randint(0, high=pad_x, size=1)) + pad_right = pad_x - pad_left + + pad_y = (in_size_y - kernel_y_t) - int((in_size_y - kernel_y_t) / stride_y) * stride_y + if(pad_y != 0): + pad_y = int((in_size_y - kernel_y_t) / stride_y) * stride_y + stride_y - (in_size_y - kernel_y_t) + pad_top = int(np.random.randint(0, high=pad_y, size=1)) + pad_down = pad_y - pad_top + + + zero_point1 = int(np.random.randint(-3, high=3, size=1)) + std1 = int(np.random.randint(1, high=3, size=1)) + zero_point2 = int(np.random.randint(-3, high=3, size=1)) + std2 = int(np.random.randint(1, high=3, size=1)) + zero_point3 = int(np.random.randint(-6, high=6, size=1)) + std3 = int(np.random.randint(1, high=10, size=1)) + + src_in = np.random.normal(zero_point1, std1, (batch, in_channel, in_size_y, in_size_x)) + weight = np.random.normal(zero_point2, std2, (out_channel, in_channel, kernel_y, kernel_x)) + bias = np.random.normal(zero_point3, std3, out_channel) + src_in = src_in.astype(np.float32) + weight = weight.astype(np.float32) + bias = bias.astype(np.float32) + + + t_src_in = tensor(src_in) + t_weight = tensor(weight) + t_bias = tensor(bias) + + t_src_in = fn.pad(t_src_in, (pad_left, pad_right, pad_top, pad_down), 'constant', 0) + t_src_out1 = fn.conv2d(t_src_in, t_weight, bias=t_bias, stride=(stride_y, stride_x), dilation=(dilation_y, dilation_x)).numpy() + + out_size_x = 
np.shape(t_src_out1)[3] + out_size_y = np.shape(t_src_out1)[2] + + + + # nc1c0hw ==> nc1hwc0 + if "packnto1" in test_type or "packn_" in test_type: + src_in = src_in.reshape([batch, math.ceil(in_channel/packn), packn, in_size_y, in_size_x]).transpose( [0, 1, 3, 4, 2]) + + if "pack1ton" in test_type or "packn_" in test_type: + t_src_out1 = t_src_out1.reshape([batch, math.ceil(out_channel/packn), packn, out_size_y, out_size_x]).transpose( [0, 1, 3, 4, 2]) + + + + src_in_1 = src_in.flatten() + weight_1 = weight.flatten() + src_out_1 = t_src_out1.flatten() + + + + total_size = (len(src_in_1) + len(src_out_1)) + len(weight_1) + len(bias) + 17 + + para.append(total_size) + para.append(batch) + para.append(in_channel) + para.append(in_size_y) #height + para.append(in_size_x) #width + para.append(stride_y) + para.append(stride_x) + para.append(kernel_y) + para.append(kernel_x) + para.append(pad_left) + para.append(pad_right) + para.append(pad_top) + para.append(pad_down) + para.append(out_channel) + para.append(dilation_x) + para.append(dilation_y) + para.append(out_size_x) #width + para.append(out_size_y) #height + print(para) + + + with open("convolution_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(weight_1)), *weight_1) + fp.write(data) + data = struct.pack(('%df' % len(bias)), *bias) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + convolution_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/depthwise_convolution_nchw.py b/tests/python_ref/depthwise_convolution_nchw.py index 3123a138..df434f87 100644 --- a/tests/python_ref/depthwise_convolution_nchw.py +++ 
b/tests/python_ref/depthwise_convolution_nchw.py @@ -29,6 +29,8 @@ def depthwise_convolution_f32(test_type): kernel_x = 3 kernel_y = 3 dilation_x = dilation_y = 1 + in_size_y = 35 + in_size_x = 33 elif test_type == "3x3s2": stride_x = 2 @@ -36,6 +38,7 @@ def depthwise_convolution_f32(test_type): kernel_x = 3 kernel_y = 3 dilation_x = dilation_y = 1 + in_size_x = 46 kernel_x_t = kernel_x + (kernel_x - 1) * (dilation_x - 1) kernel_y_t = kernel_y + (kernel_y - 1) * (dilation_y - 1) diff --git a/tests/python_ref/depthwise_convolution_vlen.py b/tests/python_ref/depthwise_convolution_vlen.py new file mode 100644 index 00000000..90117ae1 --- /dev/null +++ b/tests/python_ref/depthwise_convolution_vlen.py @@ -0,0 +1,165 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import functional as fn +import math + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + + +def depthwise_convolution_f32(test_dtype, test_vlen, test_type): + para = [] + batch = int(np.random.randint(1, high=2, size=1)) + in_size_x = int(np.random.randint(6, high=7, size=1)) #width + in_size_y = int(np.random.randint(6, high=7, size=1)) #height + stride_x = int(np.random.randint(2, high=3, size=1)) + stride_y = int(np.random.randint(2, high=3, size=1)) + kernel_x = int(np.random.randint(stride_x, high=7, size=1)) + kernel_y = int(np.random.randint(stride_y, high=7, size=1)) + dilation_x = int(np.random.randint(1, high=2, size=1)) + dilation_y = int(np.random.randint(1, high=2, size=1)) + + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + print(packn) + + if "pack1_" in test_type: + in_channel = packn * n + 1 + out_channel = packn * n + 1 + if test_type == "pack1_conv3x3s2": + stride_x = 2 + stride_y = 2 + kernel_x = 3 + kernel_y = 3 + elif test_type == "pack1_conv3x3s1": 
+ stride_x = 1 + stride_y = 1 + kernel_x = 3 + kernel_y = 3 + + elif test_type == "packnto1": + in_channel = packn * n + out_channel = packn * n + 1 + + elif "pack1ton" in test_type: + in_channel = packn * n + 1 + out_channel = packn * n + + elif "packn_" in test_type: + in_channel = packn * n + out_channel = packn * n + if test_type == "packn_conv3x3s2": + stride_x = 2 + stride_y = 2 + kernel_x = 3 + kernel_y = 3 + elif test_type == "packn_conv3x3s1": + stride_x = 1 + stride_y = 1 + kernel_x = 3 + kernel_y = 3 + + + + + kernel_x_t = kernel_x + (kernel_x - 1) * (dilation_x - 1) + kernel_y_t = kernel_y + (kernel_y - 1) * (dilation_y - 1) + pad_left = pad_right = pad_top = pad_down = 0 + + pad_x = (in_size_x - kernel_x_t) - int((in_size_x - kernel_x_t) / stride_x) * stride_x + if(pad_x !=0): + pad_left = int(np.random.randint(0, high=pad_x, size=1)) + pad_right = pad_x - pad_left + + pad_y = (in_size_y - kernel_y_t) - int((in_size_y - kernel_y_t) / stride_y) * stride_y + if(pad_y != 0): + pad_top = int(np.random.randint(0, high=pad_y, size=1)) + pad_down = pad_y - pad_top + zero_point1 = int(np.random.randint(-2, high=2, size=1)) + std1 = int(np.random.randint(1, high=3, size=1)) + zero_point2 = int(np.random.randint(-2, high=2, size=1)) + std2 = int(np.random.randint(1, high=3, size=1)) + zero_point3 = int(np.random.randint(-3, high=3, size=1)) + std3 = int(np.random.randint(1, high=20, size=1)) + + src_in = np.random.normal(zero_point1, std1, (batch, in_channel, in_size_y, in_size_x)) + weight = np.random.normal(zero_point2, std2, (in_channel, 1, kernel_y, kernel_x)) + bias = np.random.normal(zero_point3, std3, in_channel) + src_in = src_in.astype(np.float32) + weight = weight.astype(np.float32) + bias = bias.astype(np.float32) + + t_src_in = tensor(src_in) + t_weight = tensor(weight) + t_bias = tensor(bias) + t_src_in1 = fn.pad(t_src_in, (pad_left, pad_right, pad_top, pad_down), 'constant', 0) + t_src_out = fn.conv2d(t_src_in1, t_weight, bias=t_bias, 
stride=(stride_y, stride_x), padding=0, dilation=(dilation_y, dilation_x), groups=in_channel).numpy() + + out_size_x = np.shape(t_src_out)[3] + out_size_y = np.shape(t_src_out)[2] + out_channel = np.shape(t_src_out)[1] + + # nc1c0hw ==> nc1hwc0 + if "packn_" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(in_channel/packn), packn, in_size_y, in_size_x]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(out_channel/packn), packn, out_size_y, out_size_x]).transpose([0, 1, 3, 4, 2]) + + src_in_1 = t_src_in.flatten() + weight_1 = weight.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + len(weight_1) + len(bias) + 17 + + para.append(total_size) + para.append(batch) # 0 + para.append(in_channel) # 1 + para.append(in_size_y) # 2 + para.append(in_size_x) # 3 + para.append(stride_y) # 4 + para.append(stride_x) # 5 + para.append(kernel_y) # 6 + para.append(kernel_x) # 7 + para.append(pad_left) # 8 + para.append(pad_right) # 9 + para.append(pad_top) # 10 + para.append(pad_down) # 11 + para.append(out_channel)# 12 + para.append(dilation_y) # 13 + para.append(dilation_x) # 14 + para.append(out_size_y) # 15 + para.append(out_size_x) # 16 + print(para) + + + with open("depthwise_convolution_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(weight_1)), *weight_1) + fp.write(data) + data = struct.pack(('%df' % len(bias)), *bias) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + depthwise_convolution_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/global_avgpool_vlen.py b/tests/python_ref/global_avgpool_vlen.py new file mode 100644 
index 00000000..464e4855 --- /dev/null +++ b/tests/python_ref/global_avgpool_vlen.py @@ -0,0 +1,88 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import AdaptiveAvgPool2d +import math + + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def global_avgpool2d_f32(test_dtype, test_vlen, test_type): + para = [] + # init the input data and parameters + batch = int(np.random.randint(1, high=2, size=1)) + in_size_x = int(np.random.randint(64, high=128, size=1)) + in_size_y = int(np.random.randint(64, high=128, size=1)) + in_channel = int(np.random.randint(1, high=64, size=1)) + + out_height = int(np.random.randint(1, high=2, size=1)) + out_width = int(np.random.randint(1, high=2, size=1)) + + zero_point = int(np.random.randint(-600, high=600, size=1)) + std = int(np.random.randint(1, high=200, size=1)) + + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + if test_type == "packn": + in_channel = int(n*packn) + elif test_type == "pack1": + in_channel = int(n*packn) + 1 + + src_in = np.random.normal(zero_point, std, (batch, in_channel, in_size_y, in_size_x)) + + t_src_in = tensor(src_in) + gmp = AdaptiveAvgPool2d((out_height, out_width)) + t_src_out = gmp(t_src_in).numpy() + + + # nc1c0hw ==> nc1hwc0 + if "packn" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(in_channel/packn), packn, in_size_y, in_size_x]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(in_channel/packn), packn, out_height, out_width]).transpose([0, 1, 3, 4, 2]) + + + + src_in_1 = src_in.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + 6 + + para.append(total_size) + para.append(batch) + para.append(in_channel) + para.append(in_size_y) + para.append(in_size_x) + 
para.append(out_height) + para.append(out_width) + + print(para) + + + with open("global_avgpool_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + global_avgpool2d_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/global_maxpool_vlen.py b/tests/python_ref/global_maxpool_vlen.py new file mode 100644 index 00000000..efcce170 --- /dev/null +++ b/tests/python_ref/global_maxpool_vlen.py @@ -0,0 +1,85 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import AdaptiveMaxPool2d +import math + + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def global_maxpool2d_f32(test_dtype, test_vlen, test_type): + para = [] + # init the input data and parameters + batch = int(np.random.randint(1, high=2, size=1)) + in_size_x = int(np.random.randint(64, high=128, size=1)) + in_size_y = int(np.random.randint(64, high=128, size=1)) + in_channel = int(np.random.randint(1, high=64, size=1)) + + out_height = int(np.random.randint(1, high=2, size=1)) + out_width = int(np.random.randint(1, high=2, size=1)) + + zero_point = int(np.random.randint(-600, high=600, size=1)) + std = int(np.random.randint(1, high=20, size=1)) + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + if test_type == "packn": + in_channel = int(n*packn) + elif test_type == "pack1": + in_channel = int(n*packn) + 1 + + src_in = np.random.normal(zero_point, std, (batch, in_channel, in_size_y, in_size_x)) + + t_src_in = 
tensor(src_in) + gmp = AdaptiveMaxPool2d((out_height, out_width)) + t_src_out = gmp(t_src_in).numpy() + + # nc1c0hw ==> nc1hwc0 + if "packn" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(in_channel/packn), packn, in_size_y, in_size_x]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(in_channel/packn), packn, out_height, out_width]).transpose([0, 1, 3, 4, 2]) + + + src_in_1 = src_in.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + 6 + + para.append(total_size) + para.append(batch) + para.append(in_channel) + para.append(in_size_y) + para.append(in_size_x) + para.append(out_height) + para.append(out_width) + + print(para) + + + with open("global_maxpool_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + global_maxpool2d_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/group_convolution_nchw.py b/tests/python_ref/group_convolution_nchw.py index e374652b..fd1e93d3 100644 --- a/tests/python_ref/group_convolution_nchw.py +++ b/tests/python_ref/group_convolution_nchw.py @@ -7,20 +7,30 @@ from torch import tensor from torch.nn import functional as fn -def group_convolution_f32(): +def group_convolution_f32(test_type): para = [] # init the input data and parameters batch = int(np.random.randint(1, high=4, size=1)) in_size_x = int(np.random.randint(32, high=33, size=1)) in_size_y = int(np.random.randint(32, high=33, size=1)) in_channel = int(np.random.randint(8, high=16, size=1)) - stride_x = int(np.random.randint(1, high=3, size=1)) - stride_y = int(np.random.randint(1, high=3, size=1)) - kernel_x = 
int(np.random.randint(stride_x + 1, high=7, size=1)) - kernel_y = int(np.random.randint(stride_y + 1, high=7, size=1)) + # init the input data and parameters + if test_type == "random": + stride_x = int(np.random.randint(1, high=3, size=1)) + stride_y = int(np.random.randint(1, high=3, size=1)) + kernel_x = int(np.random.randint(stride_x + 1, high=7, size=1)) + kernel_y = int(np.random.randint(stride_y + 1, high=7, size=1)) + dilation_x = int(np.random.randint(1, high=5, size=1)) + dilation_y = int(np.random.randint(1, high=5, size=1)) + elif test_type == "conv3x3s1d1": + stride_x = 1 + stride_y = 1 + kernel_x = 3 + kernel_y = 3 + dilation_x = 1 + dilation_y = 1 + group = int(np.random.randint(2, high=7, size=1)) - dilation_x = int(np.random.randint(1, high=5, size=1)) - dilation_y = int(np.random.randint(1, high=5, size=1)) in_channel = int(in_channel / group) * group kernel_x_t = kernel_x + (kernel_x - 1) * (dilation_x - 1) kernel_y_t = kernel_y + (kernel_y - 1) * (dilation_y - 1) @@ -108,5 +118,6 @@ def group_convolution_f32(): if __name__ == '__main__': - group_convolution_f32() + test_type = sys.argv[1] + group_convolution_f32(test_type) print("end") diff --git a/tests/python_ref/l2_norm_anole.py b/tests/python_ref/l2_norm_anole.py new file mode 100644 index 00000000..04d2fa66 --- /dev/null +++ b/tests/python_ref/l2_norm_anole.py @@ -0,0 +1,66 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +import random +import tensorflow as tf + + +def l2_normalization_f32(): + para = [] + dim = [] + # init the input data and parameters + dim_count = int(np.random.randint(4, high=5, size=1)) + for i in range(0, dim_count): + in_size = int(np.random.randint(16, high=32, size=1)) + dim.append(in_size) + + # dim = [1,3,112,112] + + zero_point = int(np.random.randint(-6, high=6, size=1)) + std = int(np.random.randint(1, high=20, size=1)) + src_in = np.random.normal(zero_point, std, size=dim) + src_in = src_in.astype(np.float32) + + 
value = (1e-05, 1e-04, 1e-03) + epsi = random.sample(value, 1) + + # across_spatial = false --> axis = 2 (channel_axis) for anole version 1.1.15 + out_calcu = tf.nn.l2_normalize(tf.convert_to_tensor(src_in), epsilon=epsi, axis=(1)) + + sess = tf.Session() + + src_out = sess.run(out_calcu) + + src_in_1 = src_in.flatten() + src_out_1 = src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + len(dim) + 2 + + para.append(total_size) + para.append(len(dim)) + print(para) + print(epsi) + + + with open("l2_norm_anole_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(epsi)), *epsi) + fp.write(data) + data = struct.pack(('%di' % len(dim)), *dim) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + l2_normalization_f32() + print("end") diff --git a/tests/python_ref/maxpool_nchw.py b/tests/python_ref/maxpool_nchw.py index 81313ccd..da5ff440 100644 --- a/tests/python_ref/maxpool_nchw.py +++ b/tests/python_ref/maxpool_nchw.py @@ -12,8 +12,8 @@ def maxpool2d_f32(test_type): # init the input data and parameters batch = int(np.random.randint(1, high=4, size=1)) channel = int(np.random.randint(2, high=6, size=1)) - in_height = int(np.random.randint(32, high=64, size=1)) - in_width = int(np.random.randint(32, high=64, size=1)) + in_height = int(np.random.randint(16, high=32, size=1)) + in_width = int(np.random.randint(16, high=32, size=1)) if test_type == "random": stride_h = int(np.random.randint(1, high=4, size=1)) @@ -40,35 +40,40 @@ def maxpool2d_f32(test_type): stride_h = stride_w = 2 kernel_h = kernel_w = 2 pad_left = pad_top = 0 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * 
in_width + 1 + elif test_type == "2x2s2_p1": stride_h = stride_w = 2 kernel_h = kernel_w = 2 pad_left = pad_top = 1 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width elif test_type == "3x3s2": stride_h = stride_w = 2 kernel_h = kernel_w = 3 pad_left = pad_top = 0 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width elif test_type == "3x3s2_p1": stride_h = stride_w = 2 kernel_h = kernel_w = 3 pad_left = pad_top = 1 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 elif test_type == "3x3s1_p1": stride_h = stride_w = 1 kernel_h = kernel_w = 3 pad_left = pad_right = pad_top = pad_down = 1 - + zero_point = int(np.random.randint(-8, high=8, size=1)) diff --git a/tests/python_ref/maxpool_vlen.py b/tests/python_ref/maxpool_vlen.py new file mode 100644 index 00000000..97b5de7b --- /dev/null +++ b/tests/python_ref/maxpool_vlen.py @@ -0,0 +1,195 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import functional as fn +import math + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def maxpool2d_f32(test_dtype, test_vlen, test_type): + para = [] + # init the input data and parameters + batch = int(np.random.randint(1, high=2, size=1)) + channel = int(np.random.randint(2, high=6, size=1)) + in_height = int(np.random.randint(16, high=32, size=1)) + in_width = int(np.random.randint(16, high=32, size=1)) + stride_h = int(np.random.randint(1, high=4, size=1)) + 
stride_w = int(np.random.randint(1, high=4, size=1)) + kernel_h = int(np.random.randint(stride_h, high=9, size=1)) + kernel_w = int(np.random.randint(stride_w, high=9, size=1)) + pad_left = int(np.random.randint(0, high=2, size=1)) + pad_right = int(np.random.randint(0, high=2, size=1)) + pad_top = int(np.random.randint(0, high=2, size=1)) + pad_down = int(np.random.randint(0, high=2, size=1)) + c_model = False + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + if "2x2s2" in test_type and test_type[-2] != "p": + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 0 + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + if test_type == "packn_2x2s2": + channel = int(n*packn) + elif test_type == "pack1_2x2s2": + channel = int(n*packn) + 1 + + elif "2x2s2p0" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 0 + pad_down = pad_right = 0 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + c_model = True + if test_type == "packn_2x2s2p0": + channel = int(n*packn) + elif test_type == "pack1_2x2s2p0": + channel = int(n*packn) + 1 + + + elif "2x2s2p1" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 1 + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width + if test_type == "packn_2x2s2p1": + channel = int(n*packn) + elif test_type == "pack1_2x2s2p1": + channel = int(n*packn) + 1 + + + elif "3x3s2" in test_type and test_type[-2] != "p": + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 0 + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width + if test_type == "packn_3x3s2": + channel = int(n*packn) + elif test_type == "pack1_3x3s2": + channel = int(n*packn) + 1 + + elif "3x3s2p0" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 0 + pad_down = pad_right = 0 + 
in_height = 2 * in_height + in_width = 2 * in_width + c_model = True + if test_type == "packn_3x3s2p0": + channel = int(n*packn) + elif test_type == "pack1_3x3s2p0": + channel = int(n*packn) + 1 + + elif "3x3s2p1" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 1 + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + if test_type == "packn_3x3s2p1": + channel = int(n*packn) + elif test_type == "pack1_3x3s2p1": + channel = int(n*packn) + 1 + + elif "3x3s1_p1" in test_type: + stride_h = stride_w = 1 + kernel_h = kernel_w = 3 + pad_left = pad_right = pad_top = pad_down = 1 + if test_type == "packn_3x3s1_p1": + channel = int(n*packn) + elif test_type == "pack1_3x3s1_p1": + channel = int(n*packn) + 1 + + + elif "global" in test_type: + if test_type == "packn_global": + channel = int(n*packn) + elif test_type == "global": + channel = int(n*packn) + 1 + in_height = kernel_h + in_width = kernel_w + pad_left = pad_right = pad_top = pad_down = 0 + + + + + zero_point = int(np.random.randint(-8, high=8, size=1)) + std = int(np.random.randint(1, high=3, size=1)) + + src_in = np.random.normal(zero_point, std, (batch, channel, in_height, in_width)) + + t_src_in = tensor(src_in) + t_src_in1 = fn.pad(t_src_in, (pad_left, pad_right, pad_top, pad_down), 'constant', 0) + + t_src_out = fn.max_pool2d(t_src_in1, kernel_size=(kernel_h, kernel_w), stride=(stride_h, stride_w), ceil_mode=c_model).numpy() + + + out_height = np.shape(t_src_out)[2] + out_width = np.shape(t_src_out)[3] + + # nc1c0hw ==> nc1hwc0 + if "packn" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(channel/packn), packn, in_height, in_width]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(channel/packn), packn, out_height, out_width]).transpose([0, 1, 3, 4, 2]) + + c_model = 1 if c_model else 0 + src_in_1 = t_src_in.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + 
len(src_out_1)) + 15 + + para.append(total_size) + para.append(batch) + para.append(channel) + para.append(in_height) + para.append(in_width) + para.append(stride_h) + para.append(stride_w) + para.append(kernel_h) + para.append(kernel_w) + para.append(pad_left) + para.append(pad_right) + para.append(pad_top) + para.append(pad_down) + para.append(out_height) + para.append(out_width) + para.append(c_model) + print(para) + + with open("maxpool_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + maxpool2d_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/mean_graph.py b/tests/python_ref/mean_graph.py index e76ef8c6..5f98857c 100644 --- a/tests/python_ref/mean_graph.py +++ b/tests/python_ref/mean_graph.py @@ -19,7 +19,7 @@ def reduce_mean_f32(): zero_point = int(np.random.randint(-6, high=6, size=1)) std = int(np.random.randint(50, high=60, size=1)) - axis_count = int(np.random.randint(1, high=2, size=1)) + axis_count = int(np.random.randint(1, high=2, size=1)) # must be 1 for anole axis_dim = [2, 3] axis_shape = random.sample(axis_dim, axis_count) diff --git a/tests/python_ref/relu.py b/tests/python_ref/relu.py index 49ee6eff..3c783d52 100755 --- a/tests/python_ref/relu.py +++ b/tests/python_ref/relu.py @@ -7,13 +7,19 @@ import tensorflow as tf -def relu_f32(): +def relu_f32(test_type): para = [] # init the input data and parameters - batch = int(np.random.randint(1, high=4, size=1)) - in_size_x = int(np.random.randint(32, high=64, size=1)) - in_size_y = int(np.random.randint(32, high=64, size=1)) - in_channel = int(np.random.randint(1, high=64, size=1)) + if test_type == "random": + batch = 
int(np.random.randint(1, high=4, size=1)) + in_size_x = int(np.random.randint(32, high=64, size=1)) + in_size_y = int(np.random.randint(32, high=64, size=1)) + in_channel = int(np.random.randint(1, high=64, size=1)) + elif test_type == "16x3_8_4_2_1": + batch = 1 + in_size_x = 3 + in_size_y = 3 + in_channel = 7 zero_point = int(np.random.randint(-6, high=6, size=1)) std = int(np.random.randint(1, high=20, size=1)) @@ -51,5 +57,6 @@ def relu_f32(): if __name__ == '__main__': - relu_f32() + test_type = sys.argv[1] + relu_f32(test_type) print("end") diff --git a/tests/unit_test/Makefile.rvv b/tests/unit_test/Makefile.rvv index 8bbf1434..85201a1c 100644 --- a/tests/unit_test/Makefile.rvv +++ b/tests/unit_test/Makefile.rvv @@ -4,7 +4,7 @@ CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections CFLAGS += -DCSINN_API=15 -LIB_NAME = csi_nn2_rvv +LIB_NAME = shl_rvv CC = riscv64-unknown-linux-gnu-gcc diff --git a/tests/unit_test/add.c b/tests/unit_test/add.c index bb613960..83058258 100644 --- a/tests/unit_test/add.c +++ b/tests/unit_test/add.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/basic_math.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_add(void *input0_data, void *input1_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = 1; input0->dim[1] = in_c; input0->dim[2] = in_h; input0->dim[3] = in_w; input0->dim_count = 4; input0->name = "input0"; - int in0_size = csi_tensor_size(input0); + int in0_size = csinn_tensor_size(input0); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = 1; input1->dim[1] = in_c; input1->dim[2] = in_h; input1->dim[3] = in_w; input1->dim_count = 4; input1->name = "input1"; - int in1_size = csi_tensor_size(input1); + int in1_size = csinn_tensor_size(input1); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct diso_params params; - params.base.name = "params"; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; input0->data = input0_data; input1->data = input1_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input0, input1, output, ¶ms); + func(input0, input1, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input0); - csi_free_tensor(input1); - csi_mem_free(output->data); - csi_free_tensor(output); + 
csinn_free_tensor(input0); + csinn_free_tensor(input1); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of add for RVV.\n"); - verify_add(add_fp32_in0, add_fp32_in1, add_fp32_out, csi_nn_rvv_add_fp32, 2, 5, 11, + verify_add(add_fp32_in0, add_fp32_in1, add_fp32_out, shl_rvv_add_fp32, 2, 5, 11, CSINN_DTYPE_FLOAT32); - verify_add(add_fp16_in0, add_fp16_in1, add_fp16_out, csi_nn_rvv_add_fp16, 2, 5, 11, + verify_add(add_fp16_in0, add_fp16_in1, add_fp16_out, shl_rvv_add_fp16, 2, 5, 11, CSINN_DTYPE_FLOAT16); - // verify_add(add_int8_in0, add_int8_in1, add_int8_out, csi_nn_rvv_add_int8, 2, 5, 11, + // verify_add(add_int8_in0, add_int8_in1, add_int8_out, shl_rvv_add_int8, 2, 5, 11, // CSINN_DTYPE_INT8); return done_testing(); } diff --git a/tests/unit_test/avgpool.c b/tests/unit_test/avgpool.c index 767e6864..7e0933fb 100644 --- a/tests/unit_test/avgpool.c +++ b/tests/unit_test/avgpool.c @@ -16,101 +16,100 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/avgpool.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_avgpool2d(void *input_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, int out_c, int out_h, int out_w, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct pool_params params; - params.base.name = "params"; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = kernel_w; - params.pad_left = pad_w; - params.pad_right = pad_w; - params.pad_top = pad_h; - params.pad_down = pad_h; - params.count_include_pad = 1; + int out_size = csinn_tensor_size(output); + + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_w; + params->pad_right = pad_w; + params->pad_top = pad_h; + params->pad_down = pad_h; + params->count_include_pad = 1; input->data = input_data; - output->data = 
csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, ¶ms); + func(input, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of avgpool for RVV.\n"); - verify_avgpool2d(avgpool2x2s2_fp32_in, avgpool2x2s2_fp32_out, csi_nn_rvv_avgpool2x2s2_fp32, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); - verify_avgpool2d(avgpool2x2s2_fp16_in, avgpool2x2s2_fp16_out, csi_nn_rvv_avgpool2x2s2_fp16, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); + verify_avgpool2d(avgpool2x2s2_fp32_in, avgpool2x2s2_fp32_out, shl_rvv_avgpool2x2s2_fp32, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); + verify_avgpool2d(avgpool2x2s2_fp16_in, avgpool2x2s2_fp16_out, shl_rvv_avgpool2x2s2_fp16, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); verify_avgpool2d(avgpool2x2s2_p1_fp32_in, avgpool2x2s2_p1_fp32_out, - csi_nn_rvv_avgpool2x2s2_p1_fp32, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_avgpool2x2s2_p1_fp32, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_FLOAT32); verify_avgpool2d(avgpool2x2s2_p1_fp16_in, avgpool2x2s2_p1_fp16_out, - csi_nn_rvv_avgpool2x2s2_p1_fp16, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_avgpool2x2s2_p1_fp16, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_FLOAT16); - verify_avgpool2d(avgpool3x3s2_fp32_in, avgpool3x3s2_fp32_out, csi_nn_rvv_avgpool3x3s2_fp32, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); - verify_avgpool2d(avgpool3x3s2_fp16_in, avgpool3x3s2_fp16_out, csi_nn_rvv_avgpool3x3s2_fp16, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); + verify_avgpool2d(avgpool3x3s2_fp32_in, avgpool3x3s2_fp32_out, shl_rvv_avgpool3x3s2_fp32, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, 
CSINN_DTYPE_FLOAT32); + verify_avgpool2d(avgpool3x3s2_fp16_in, avgpool3x3s2_fp16_out, shl_rvv_avgpool3x3s2_fp16, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); verify_avgpool2d(avgpool3x3s2_p1_fp32_in, avgpool3x3s2_p1_fp32_out, - csi_nn_rvv_avgpool3x3s2_p1_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, + shl_rvv_avgpool3x3s2_p1_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_FLOAT32); verify_avgpool2d(avgpool3x3s2_p1_fp16_in, avgpool3x3s2_p1_fp16_out, - csi_nn_rvv_avgpool3x3s2_p1_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, + shl_rvv_avgpool3x3s2_p1_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_FLOAT16); verify_avgpool2d(avgpool3x3s1_p1_fp32_in, avgpool3x3s1_p1_fp32_out, - csi_nn_rvv_avgpool3x3s1_p1_fp32, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_avgpool3x3s1_p1_fp32, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_FLOAT32); verify_avgpool2d(avgpool3x3s1_p1_fp16_in, avgpool3x3s1_p1_fp16_out, - csi_nn_rvv_avgpool3x3s1_p1_fp16, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_avgpool3x3s1_p1_fp16, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_FLOAT16); - verify_avgpool2d(global_avgpool_fp32_in, global_avgpool_fp32_out, - csi_nn_rvv_global_avgpool2d_fp32, 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, - CSINN_DTYPE_FLOAT32); - verify_avgpool2d(global_avgpool_fp16_in, global_avgpool_fp16_out, - csi_nn_rvv_global_avgpool2d_fp16, 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, - CSINN_DTYPE_FLOAT16); + verify_avgpool2d(global_avgpool_fp32_in, global_avgpool_fp32_out, shl_rvv_global_avgpool2d_fp32, + 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, CSINN_DTYPE_FLOAT32); + verify_avgpool2d(global_avgpool_fp16_in, global_avgpool_fp16_out, shl_rvv_global_avgpool2d_fp16, + 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, CSINN_DTYPE_FLOAT16); return done_testing(); } \ No newline at end of file diff --git a/tests/unit_test/concat.c b/tests/unit_test/concat.c index 8b9067f6..2f58ead5 100644 --- a/tests/unit_test/concat.c +++ b/tests/unit_test/concat.c @@ -16,20 +16,21 @@ * limitations under 
the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/concat.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_concat(void *input0_data, void *input1_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, int axis, enum csinn_dtype_enum dtype) { - struct csi_tensor *input[2]; + struct csinn_tensor *input[2]; - input[0] = csi_alloc_tensor(NULL); + input[0] = csinn_alloc_tensor(NULL); input[0]->dim[0] = 1; input[0]->dim[1] = in_c; input[0]->dim[2] = in_h; @@ -37,7 +38,7 @@ void verify_concat(void *input0_data, void *input1_data, void *ref_data, int (*f input[0]->dim_count = 4; input[0]->name = "input0"; - input[1] = csi_alloc_tensor(NULL); + input[1] = csinn_alloc_tensor(NULL); input[1]->dim[0] = 1; input[1]->dim[1] = in_c; input[1]->dim[2] = in_h; @@ -45,42 +46,43 @@ void verify_concat(void *input0_data, void *input1_data, void *ref_data, int (*f input[1]->dim_count = 4; input[1]->name = "input1"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = 2 * in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct concat_params params; - params.base.name = "params"; - params.axis = axis; - params.inputs_count = 2; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + params->base.name = "params"; + params->axis = axis; + params->inputs_count = 2; input[0]->data = input0_data; input[1]->data = input1_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func((struct csi_tensor **)input, output, ¶ms); + func((struct csinn_tensor **)input, output, params); 
evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input[0]); - csi_free_tensor(input[1]); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input[0]); + csinn_free_tensor(input[1]); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of concat for RVV.\n"); - verify_concat(concat_fp32_in0, concat_fp32_in1, concat_fp32_out, csi_nn_rvv_concat_fp32, 2, 3, - 10, 2, CSINN_DTYPE_FLOAT32); - verify_concat(concat_fp16_in0, concat_fp16_in1, concat_fp16_out, csi_nn_rvv_concat_fp16, 2, 3, - 10, 2, CSINN_DTYPE_FLOAT16); - // verify_concat(concat_int8_in0, concat_int8_in1, concat_int8_out, csi_nn_rvv_concat_int8, 2, + verify_concat(concat_fp32_in0, concat_fp32_in1, concat_fp32_out, shl_rvv_concat_fp32, 2, 3, 10, + 2, CSINN_DTYPE_FLOAT32); + verify_concat(concat_fp16_in0, concat_fp16_in1, concat_fp16_out, shl_rvv_concat_fp16, 2, 3, 10, + 2, CSINN_DTYPE_FLOAT16); + // verify_concat(concat_int8_in0, concat_int8_in1, concat_int8_out, shl_rvv_concat_int8, 2, // 3, 10, 2, CSINN_DTYPE_FLOAT32); return done_testing(); } diff --git a/tests/unit_test/conv2d_1x1s1_gemm.c b/tests/unit_test/conv2d_1x1s1_gemm.c index b6e3151a..3dbbb693 100644 --- a/tests/unit_test/conv2d_1x1s1_gemm.c +++ b/tests/unit_test/conv2d_1x1s1_gemm.c @@ -16,42 +16,43 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/conv2d.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_conv2d_1x1s1_reorder(void *kernel_data, void *ref_kernel, void (*reorder)(), int out_ch, int in_ch, enum csinn_dtype_enum dtype) { - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_ch; kernel->dim[1] = in_ch; kernel->dim[2] = 1; kernel->dim[3] = 1; kernel->dim_count = 4; kernel->name = "kernel"; - int kernel_size = csi_tensor_size(kernel); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 0; - params.pad_right = 0; - params.pad_top = 0; - params.pad_down = 0; - params.group = 1; + int kernel_size = csinn_tensor_size(kernel); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 0; + params->pad_right = 0; + params->pad_top = 0; + params->pad_down = 0; + params->group = 1; kernel->data = kernel_data; - reorder(kernel, ¶ms); + reorder(kernel, params); evaluate_error(kernel->data, ref_kernel, kernel_size, dtype); - csi_free_tensor(kernel); + csinn_free_tensor(kernel); } void verify_conv2d_1x1s1_compute(void *input_data, void *kernel_data, void *bias_data, @@ -59,16 +60,16 @@ void verify_conv2d_1x1s1_compute(void *input_data, void *kernel_data, void *bias int in_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - 
struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; kernel->dim[1] = in_c; kernel->dim[2] = 1; @@ -76,43 +77,44 @@ void verify_conv2d_1x1s1_compute(void *input_data, void *kernel_data, void *bias kernel->dim_count = 4; kernel->name = "kernel"; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_c; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 0; - params.pad_right = 0; - params.pad_top = 0; - params.pad_down = 0; - params.group = 1; + int out_size = csinn_tensor_size(output); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 0; + params->pad_right = 0; + params->pad_top = 0; + params->pad_down = 0; + params->group = 1; input->data = input_data; kernel->data = kernel_data; bias->data = bias_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - compute(input, output, kernel, bias, ¶ms); + compute(input, output, kernel, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(kernel); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); + csinn_free_tensor(kernel); + csinn_free_tensor(bias); } 
int main(int argc, char **argv) @@ -120,17 +122,17 @@ int main(int argc, char **argv) init_testsuite("Test function of convolution 1x1s1 for RVV.\n"); verify_conv2d_1x1s1_reorder(conv2d1x1s1_fp32_ker, conv2d1x1s1_fp32_ker1, - csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32, 19, 16, + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32, 19, 16, CSINN_DTYPE_FLOAT32); verify_conv2d_1x1s1_compute(conv2d1x1s1_fp32_in, conv2d1x1s1_fp32_ker1, conv2d1x1s1_fp32_bias, - conv2d1x1s1_fp32_out, csi_nn_rvv_conv1x1s1_gemm_fp32, 19, 16, 4, 5, + conv2d1x1s1_fp32_out, shl_rvv_conv1x1s1_gemm_fp32, 19, 16, 4, 5, CSINN_DTYPE_FLOAT32); verify_conv2d_1x1s1_reorder(conv2d1x1s1_fp16_ker, conv2d1x1s1_fp16_ker1, - csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16, 19, 16, + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16, 19, 16, CSINN_DTYPE_FLOAT16); verify_conv2d_1x1s1_compute(conv2d1x1s1_fp16_in, conv2d1x1s1_fp16_ker1, conv2d1x1s1_fp16_bias, - conv2d1x1s1_fp16_out, csi_nn_rvv_conv1x1s1_gemm_fp16, 19, 16, 4, 5, + conv2d1x1s1_fp16_out, shl_rvv_conv1x1s1_gemm_fp16, 19, 16, 4, 5, CSINN_DTYPE_FLOAT16); return done_testing(); diff --git a/tests/unit_test/conv2d_im2col_gemm.c b/tests/unit_test/conv2d_im2col_gemm.c index f837fa89..6d0a627d 100644 --- a/tests/unit_test/conv2d_im2col_gemm.c +++ b/tests/unit_test/conv2d_im2col_gemm.c @@ -16,43 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/conv2d.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_conv2d_im2col_reorder(void *kernel_data, void *ref_kernel, void (*reorder)(), int out_ch, int in_ch, int k_h, int k_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_ch; kernel->dim[1] = in_ch; kernel->dim[2] = k_h; kernel->dim[3] = k_w; kernel->dim_count = 4; kernel->name = "kernel"; - int kernel_size = csi_tensor_size(kernel); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 1; - params.pad_right = 1; - params.pad_top = 1; - params.pad_down = 1; - params.group = 1; + int kernel_size = csinn_tensor_size(kernel); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 1; + params->pad_right = 1; + params->pad_top = 1; + params->pad_down = 1; + params->group = 1; kernel->data = kernel_data; - reorder(kernel, ¶ms); + reorder(kernel, params); evaluate_error(kernel->data, ref_kernel, kernel_size, dtype); - csi_free_tensor(kernel); + csinn_free_tensor(kernel); } void verify_conv2d_im2col_compute(void *input_data, void *kernel_data, void *bias_data, @@ -60,16 +61,16 @@ void verify_conv2d_im2col_compute(void *input_data, void *kernel_data, void *bia int out_c, int out_h, int out_w, int k_h, int k_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = 
csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; kernel->dim[1] = in_c; kernel->dim[2] = k_h; @@ -77,43 +78,44 @@ void verify_conv2d_im2col_compute(void *input_data, void *kernel_data, void *bia kernel->dim_count = 4; kernel->name = "kernel"; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_c; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 1; - params.pad_right = 1; - params.pad_top = 1; - params.pad_down = 1; - params.group = 1; + int out_size = csinn_tensor_size(output); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 1; + params->pad_right = 1; + params->pad_top = 1; + params->pad_down = 1; + params->group = 1; input->data = input_data; kernel->data = kernel_data; bias->data = bias_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - compute(input, output, kernel, bias, ¶ms); + compute(input, output, kernel, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(kernel); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + 
csinn_free_tensor(output); + csinn_free_tensor(kernel); + csinn_free_tensor(bias); } int main(int argc, char **argv) @@ -121,19 +123,19 @@ int main(int argc, char **argv) init_testsuite("Test function of convolution im2col_gemm for RVV.\n"); verify_conv2d_im2col_reorder(conv2d_im2col_fp32_ker, conv2d_im2col_fp32_ker1, - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32, 19, 3, 3, 3, + shl_rvv_conv_im2col_gemm_reorder_kernel_fp32, 19, 3, 3, 3, CSINN_DTYPE_FLOAT32); verify_conv2d_im2col_compute(conv2d_im2col_fp32_in, conv2d_im2col_fp32_ker1, conv2d_im2col_fp32_bias, conv2d_im2col_fp32_out, - csi_nn_rvv_conv_im2col_gemm_fp32, 3, 4, 5, 19, 4, 5, 3, 3, + shl_rvv_conv_im2col_gemm_fp32, 3, 4, 5, 19, 4, 5, 3, 3, CSINN_DTYPE_FLOAT32); verify_conv2d_im2col_reorder(conv2d_im2col_fp16_ker, conv2d_im2col_fp16_ker1, - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16, 19, 3, 3, 3, + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16, 19, 3, 3, 3, CSINN_DTYPE_FLOAT16); verify_conv2d_im2col_compute(conv2d_im2col_fp16_in, conv2d_im2col_fp16_ker1, conv2d_im2col_fp16_bias, conv2d_im2col_fp16_out, - csi_nn_rvv_conv_im2col_gemm_fp16, 3, 4, 5, 19, 4, 5, 3, 3, + shl_rvv_conv_im2col_gemm_fp16, 3, 4, 5, 19, 4, 5, 3, 3, CSINN_DTYPE_FLOAT16); return done_testing(); diff --git a/tests/unit_test/conv2d_winograd.c b/tests/unit_test/conv2d_winograd.c index bc0db0e5..01b45c86 100644 --- a/tests/unit_test/conv2d_winograd.c +++ b/tests/unit_test/conv2d_winograd.c @@ -16,38 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/conv2d.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_conv2d_winograd3x3s1_trans(void *kernel_data, void *ref_kernel, void (*reorder)(), int out_ch, int in_ch, int k_h, int k_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_ch; kernel->dim[1] = in_ch; kernel->dim[2] = k_h; kernel->dim[3] = k_w; kernel->dim_count = 4; kernel->name = "kernel"; - int kernel_size = csi_tensor_size(kernel); + int kernel_size = csinn_tensor_size(kernel); - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 1; - params.pad_right = 1; - params.pad_top = 1; - params.pad_down = 1; - params.group = 1; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); kernel->data = kernel_data; int ker_out_size = out_ch * in_ch * 8 * 8; // b6f3 @@ -55,8 +45,8 @@ void verify_conv2d_winograd3x3s1_trans(void *kernel_data, void *ref_kernel, void reorder(kernel, t_kernel); evaluate_error(t_kernel->data, ref_kernel, ker_out_size, dtype); - csi_free_tensor(kernel); - csi_free_tensor(t_kernel); + csinn_free_tensor(kernel); + csinn_free_tensor(t_kernel); } void verify_conv2d_winograd3x3s1_compute(void *input_data, void *kernel_data, void *bias_data, @@ -64,16 +54,16 @@ void verify_conv2d_winograd3x3s1_compute(void *input_data, void *kernel_data, vo int in_w, int out_c, int out_h, int out_w, int k_h, int k_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int 
in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; kernel->dim[1] = in_c; kernel->dim[2] = k_h; @@ -82,45 +72,46 @@ void verify_conv2d_winograd3x3s1_compute(void *input_data, void *kernel_data, vo kernel->name = "kernel"; int ker_out_size = out_c * in_c * 8 * 8; // b6f3 - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_c; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 1; - params.pad_right = 1; - params.pad_top = 1; - params.pad_down = 1; - params.group = 1; - params.conv_extra.kernel_tm = csi_alloc_tensor(NULL); + int out_size = csinn_tensor_size(output); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 1; + params->pad_right = 1; + params->pad_top = 1; + params->pad_down = 1; + params->group = 1; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); input->data = input_data; - params.conv_extra.kernel_tm->data = csi_mem_alloc(ker_out_size * sizeof(float)); - memcpy(params.conv_extra.kernel_tm->data, kernel_data, ker_out_size * sizeof(float)); + params->conv_extra.kernel_tm->data = shl_mem_alloc(ker_out_size * sizeof(float)); + memcpy(params->conv_extra.kernel_tm->data, kernel_data, ker_out_size * sizeof(float)); bias->data = bias_data; - 
output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - compute(input, output, kernel, bias, ¶ms); + compute(input, output, kernel, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(kernel); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); + csinn_free_tensor(kernel); + csinn_free_tensor(bias); } int main(int argc, char **argv) @@ -128,20 +119,20 @@ int main(int argc, char **argv) init_testsuite("Test function of convolution winograd3x3s1 for RVV.\n"); verify_conv2d_winograd3x3s1_trans(conv2d_winograd_fp32_ker, conv2d_winograd_fp32_ker1, - csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp32, - 16, 8, 3, 3, CSINN_DTYPE_FLOAT32); + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32, 16, 8, 3, 3, + CSINN_DTYPE_FLOAT32); verify_conv2d_winograd3x3s1_compute(conv2d_winograd_fp32_in, conv2d_winograd_fp32_ker1, conv2d_winograd_fp32_bias, conv2d_winograd_fp32_out, - csi_nn_rvv_conv3x3s1_winograd64_packn_fp32, 8, 14, 14, 16, - 14, 14, 3, 3, CSINN_DTYPE_FLOAT32); + shl_rvv_wg_b6f3s1_packn_fp32, 8, 14, 14, 16, 14, 14, 3, 3, + CSINN_DTYPE_FLOAT32); verify_conv2d_winograd3x3s1_trans(conv2d_winograd_fp16_ker, conv2d_winograd_fp16_ker1, - csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16, - 16, 8, 3, 3, CSINN_DTYPE_FLOAT16); + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16, 16, 8, 3, 3, + CSINN_DTYPE_FLOAT16); verify_conv2d_winograd3x3s1_compute(conv2d_winograd_fp16_in, conv2d_winograd_fp16_ker1, conv2d_winograd_fp16_bias, conv2d_winograd_fp16_out, - csi_nn_rvv_conv3x3s1_winograd64_packn_fp16, 8, 14, 14, 16, - 14, 14, 3, 3, CSINN_DTYPE_FLOAT16); + shl_rvv_wg_b6f3s1_packn_fp16, 8, 14, 14, 16, 14, 14, 3, 3, + CSINN_DTYPE_FLOAT16); return done_testing(); } diff --git a/tests/unit_test/dwconv2d.c b/tests/unit_test/dwconv2d.c index 
d4f3cd04..318374ef 100644 --- a/tests/unit_test/dwconv2d.c +++ b/tests/unit_test/dwconv2d.c @@ -16,12 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/dwconv2d.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_dwconv2d(void *input_data, void *kernel_data, void *bias_data, void *ref_data, @@ -29,16 +30,16 @@ void verify_dwconv2d(void *input_data, void *kernel_data, void *bias_data, void int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = in_c; kernel->dim[1] = 1; kernel->dim[2] = kernel_h; @@ -46,66 +47,67 @@ void verify_dwconv2d(void *input_data, void *kernel_data, void *bias_data, void kernel->dim_count = 4; kernel->name = "kernel"; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = in_c; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_w; 
- params.pad_right = pad_w; - params.pad_top = pad_h; - params.pad_down = pad_h; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_w; + params->pad_right = pad_w; + params->pad_top = pad_h; + params->pad_down = pad_h; input->data = input_data; kernel->data = kernel_data; bias->data = bias_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, kernel, bias, ¶ms); + func(input, output, kernel, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(kernel); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); + csinn_free_tensor(kernel); + csinn_free_tensor(bias); } int main(int argc, char **argv) { init_testsuite("Test function of depthwise_convolution for RVV.\n"); verify_dwconv2d(dwconv3x3s1_fp32_in, dwconv3x3s1_fp32_ker, dwconv3x3s1_fp32_bias, - dwconv3x3s1_fp32_out, csi_nn_rvv_dwconv3x3s1_fp32, 2, 4, 10, 2, 4, 10, 3, 3, 1, - 1, 1, 1, CSINN_DTYPE_FLOAT32); + dwconv3x3s1_fp32_out, shl_rvv_dwconv3x3s1_fp32, 2, 4, 10, 2, 4, 10, 3, 3, 1, 1, + 1, 1, CSINN_DTYPE_FLOAT32); verify_dwconv2d(dwconv3x3s1_fp16_in, dwconv3x3s1_fp16_ker, dwconv3x3s1_fp16_bias, - dwconv3x3s1_fp16_out, csi_nn_rvv_dwconv3x3s1_fp16, 2, 4, 10, 2, 4, 10, 3, 3, 1, - 1, 1, 1, CSINN_DTYPE_FLOAT16); + dwconv3x3s1_fp16_out, shl_rvv_dwconv3x3s1_fp16, 2, 4, 10, 2, 4, 10, 3, 3, 1, 1, + 1, 1, CSINN_DTYPE_FLOAT16); // verify_dwconv2d(dwconv3x3s1_int8_in, dwconv3x3s1_int8_ker, dwconv3x3s1_int8_bias, - // dwconv3x3s1_int8_out, csi_nn_rvv_dwconv3x3s1_int8, 2, 4, 10, 2, 4, 10, 3, 3, + // dwconv3x3s1_int8_out, shl_rvv_dwconv3x3s1_int8, 2, 4, 10, 2, 4, 10, 3, 3, // 1, 1, 1, 1, 
CSINN_DTYPE_INT8); verify_dwconv2d(dwconv3x3s2_fp32_in, dwconv3x3s2_fp32_ker, dwconv3x3s2_fp32_bias, - dwconv3x3s2_fp32_out, csi_nn_rvv_dwconv3x3s2_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, - 2, 1, 1, CSINN_DTYPE_FLOAT32); + dwconv3x3s2_fp32_out, shl_rvv_dwconv3x3s2_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, + 1, 1, CSINN_DTYPE_FLOAT32); verify_dwconv2d(dwconv3x3s2_fp16_in, dwconv3x3s2_fp16_ker, dwconv3x3s2_fp16_bias, - dwconv3x3s2_fp16_out, csi_nn_rvv_dwconv3x3s2_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, - 2, 1, 1, CSINN_DTYPE_FLOAT16); + dwconv3x3s2_fp16_out, shl_rvv_dwconv3x3s2_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, + 1, 1, CSINN_DTYPE_FLOAT16); // verify_dwconv2d(dwconv3x3s2_int8_in, dwconv3x3s2_int8_ker, dwconv3x3s2_int8_bias, - // dwconv3x3s2_int8_out, csi_nn_rvv_dwconv3x3s2_int8, 2, 6, 18, 2, 3, 9, 3, 3, + // dwconv3x3s2_int8_out, shl_rvv_dwconv3x3s2_int8, 2, 6, 18, 2, 3, 9, 3, 3, // 2, 2, 1, 1, CSINN_DTYPE_INT8); return done_testing(); diff --git a/tests/unit_test/fullyconnected.c b/tests/unit_test/fullyconnected.c index 581a7f72..33637ecb 100644 --- a/tests/unit_test/fullyconnected.c +++ b/tests/unit_test/fullyconnected.c @@ -16,98 +16,99 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/fullyconnected.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_fc_reorder(void *weight_data, void *ref_weight, void (*reorder)(), int in_nodes, int out_nodes, enum csinn_dtype_enum dtype) { - struct csi_tensor *weight = csi_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); weight->dim[0] = out_nodes; weight->dim[1] = in_nodes; weight->dim_count = 2; weight->name = "weight"; - int weight_size = csi_tensor_size(weight); + int weight_size = csinn_tensor_size(weight); weight->data = weight_data; reorder(weight); evaluate_error(weight->data, ref_weight, weight_size, dtype); - csi_free_tensor(weight); + csinn_free_tensor(weight); } void verify_fc_compute(void *input_data, void *weight_data, void *bias_data, void *ref_data, int (*compute)(), int in_nodes, int out_nodes, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_nodes; input->dim_count = 2; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *weight = csi_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); weight->dim[0] = out_nodes; weight->dim[1] = in_nodes; weight->dim_count = 2; weight->name = "weight"; - int weight_size = csi_tensor_size(weight); + int weight_size = csinn_tensor_size(weight); - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_nodes; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_nodes; output->dim_count = 2; output->name = "output"; - int out_size 
= csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct fc_params params; - params.base.name = "params"; + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.name = "params"; input->data = input_data; weight->data = weight_data; bias->data = bias_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - compute(input, output, weight, bias, ¶ms); + compute(input, output, weight, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(weight); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); + csinn_free_tensor(weight); + csinn_free_tensor(bias); } int main(int argc, char **argv) { init_testsuite("Test function of fullyconnected for RVV.\n"); - verify_fc_reorder(fc_fp32_weight, fc_fp32_weight_ref, csi_nn_rvv_fc_gemv_transform_weight_fp32, - 17, 31, CSINN_DTYPE_FLOAT32); + verify_fc_reorder(fc_fp32_weight, fc_fp32_weight_ref, shl_rvv_fc_gemv_transform_weight_fp32, 17, + 31, CSINN_DTYPE_FLOAT32); verify_fc_compute(fc_fp32_in, fc_fp32_weight_ref, fc_fp32_bias, fc_fp32_out, - csi_nn_rvv_fullyconnected_packn_fp32, 17, 31, CSINN_DTYPE_FLOAT32); + shl_rvv_fullyconnected_packn_fp32, 17, 31, CSINN_DTYPE_FLOAT32); - verify_fc_reorder(fc_fp16_weight, fc_fp16_weight_ref, csi_nn_rvv_fc_gemv_transform_weight_fp16, - 17, 31, CSINN_DTYPE_FLOAT16); + verify_fc_reorder(fc_fp16_weight, fc_fp16_weight_ref, shl_rvv_fc_gemv_transform_weight_fp16, 17, + 31, CSINN_DTYPE_FLOAT16); verify_fc_compute(fc_fp16_in, fc_fp16_weight_ref, fc_fp16_bias, fc_fp16_out, - csi_nn_rvv_fullyconnected_packn_fp16, 17, 31, CSINN_DTYPE_FLOAT16); + shl_rvv_fullyconnected_packn_fp16, 17, 31, CSINN_DTYPE_FLOAT16); // verify_fc_reorder(fc_int8_weight, fc_int8_weight_ref, - // 
csi_nn_rvv_fc_gemv_transform_weight_int8, + // shl_rvv_fc_gemv_transform_weight_int8, // 17, 31, CSINN_DTYPE_INT8); // verify_fc_compute(fc_int8_in, fc_int8_weight_ref, fc_int8_bias, fc_int8_out, - // csi_nn_rvv_fullyconnected_packn_int8, 17, 31, CSINN_DTYPE_INT8); + // shl_rvv_fullyconnected_packn_int8, 17, 31, CSINN_DTYPE_INT8); return done_testing(); } diff --git a/tests/unit_test/gemm.c b/tests/unit_test/gemm.c index c2bc8b63..6d97ca6b 100644 --- a/tests/unit_test/gemm.c +++ b/tests/unit_test/gemm.c @@ -16,59 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/gemm.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_gemm_reorderA(void *ma_data, void *ref_ma_data, void (*reorder)(), int m, int k, int ldx, enum csinn_dtype_enum dtype) { - void *out_data = csi_mem_alloc(m * k * sizeof(float)); + void *out_data = shl_mem_alloc(m * k * sizeof(float)); reorder(ma_data, out_data, m, k, ldx); evaluate_error(out_data, ref_ma_data, m * k, dtype); - csi_mem_free(out_data); + shl_mem_free(out_data); } void verify_gemm_reorderB(void *mb_data, void *ref_mb_data, void (*reorder)(), int k, int n, int ldx, enum csinn_dtype_enum dtype) { - void *out_data = csi_mem_alloc(k * n * sizeof(float)); + void *out_data = shl_mem_alloc(k * n * sizeof(float)); reorder(mb_data, out_data, k, n, ldx); evaluate_error(out_data, ref_mb_data, k * n, dtype); - csi_mem_free(out_data); + shl_mem_free(out_data); } void verify_gemm_compute(void *ma_data, void *mb_data, void *bias_data, void *ref_data, void (*compute)(), int m, int k, int n, int ldx, enum csinn_dtype_enum dtype) { - void *out_data = csi_mem_alloc(m * n * sizeof(float)); - compute(out_data, ma_data, mb_data, m, k, n, ldx, bias_data); + void *out_data = shl_mem_alloc(m * n * sizeof(float)); + compute(out_data, ma_data, mb_data, bias_data, m, k, n, ldx); 
evaluate_error(out_data, ref_data, m * n, dtype); - csi_mem_free(out_data); + shl_mem_free(out_data); } int main(int argc, char **argv) { init_testsuite("Test function of gemm for RVV.\n"); - verify_gemm_reorderA(gemm_fp32_a, gemm_fp32_a1, csi_nn_rvv_reorder_kernel_n8_fp32, 31, 16, 16, + verify_gemm_reorderA(gemm_fp32_a, gemm_fp32_a1, shl_rvv_reorder_kernel_n8_fp32, 31, 16, 16, CSINN_DTYPE_FLOAT32); - verify_gemm_reorderB(gemm_fp32_b, gemm_fp32_b1, csi_nn_rvv_reorder_input_z8_fp32, 16, 20, 20, + verify_gemm_reorderB(gemm_fp32_b, gemm_fp32_b1, shl_rvv_reorder_input_z8_fp32, 16, 20, 20, CSINN_DTYPE_FLOAT32); verify_gemm_compute(gemm_fp32_a1, gemm_fp32_b1, gemm_fp32_bias, gemm_fp32_c, - csi_nn_rvv_gemm_8x8_fp32, 31, 16, 20, 20, CSINN_DTYPE_FLOAT32); + shl_rvv_gemm_8x8_fp32, 31, 16, 20, 20, CSINN_DTYPE_FLOAT32); - verify_gemm_reorderA(gemm_fp16_a, gemm_fp16_a1, csi_nn_rvv_reorder_kernel_n8_fp16, 31, 16, 16, + verify_gemm_reorderA(gemm_fp16_a, gemm_fp16_a1, shl_rvv_reorder_kernel_n8_fp16, 31, 16, 16, CSINN_DTYPE_FLOAT16); - verify_gemm_reorderB(gemm_fp16_b, gemm_fp16_b1, csi_nn_rvv_reorder_input_z16_fp16, 16, 20, 20, + verify_gemm_reorderB(gemm_fp16_b, gemm_fp16_b1, shl_rvv_reorder_input_z16_fp16, 16, 20, 20, CSINN_DTYPE_FLOAT16); verify_gemm_compute(gemm_fp16_a1, gemm_fp16_b1, gemm_fp16_bias, gemm_fp16_c, - csi_nn_rvv_gemm_8x16_fp16, 31, 16, 20, 20, CSINN_DTYPE_FLOAT16); + shl_rvv_gemm_8x16_fp16, 31, 16, 20, 20, CSINN_DTYPE_FLOAT16); return done_testing(); } diff --git a/tests/unit_test/leaky_relu.c b/tests/unit_test/leaky_relu.c index b23e746b..80d0ae27 100644 --- a/tests/unit_test/leaky_relu.c +++ b/tests/unit_test/leaky_relu.c @@ -16,59 +16,59 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/activation.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_leaky_relu(void *input_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, float alpha, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct relu_params params; - params.base.name = "params"; - params.n = alpha; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->n = alpha; input->data = input_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, ¶ms); + func(input, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of leaky_relu for RVV.\n"); - verify_leaky_relu(leaky_relu_fp32_in, leaky_relu_fp32_out, csi_nn_rvv_leaky_relu_fp32, 2, 5, 11, + verify_leaky_relu(leaky_relu_fp32_in, leaky_relu_fp32_out, shl_rvv_leaky_relu_fp32, 2, 5, 11, 0.2, CSINN_DTYPE_FLOAT32); 
- verify_leaky_relu(leaky_relu_fp16_in, leaky_relu_fp16_out, csi_nn_rvv_leaky_relu_fp16, 2, 5, 11, + verify_leaky_relu(leaky_relu_fp16_in, leaky_relu_fp16_out, shl_rvv_leaky_relu_fp16, 2, 5, 11, 0.2, CSINN_DTYPE_FLOAT16); - // verify_leaky_relu(leaky_relu_int8_in, leaky_relu_int8_out, csi_nn_rvv_leaky_relu_int8, 2, 5, + // verify_leaky_relu(leaky_relu_int8_in, leaky_relu_int8_out, shl_rvv_leaky_relu_int8, 2, 5, // 11, 0.2, CSINN_DTYPE_INT8); return done_testing(); diff --git a/tests/unit_test/maxpool.c b/tests/unit_test/maxpool.c index e4da7765..eacde7a7 100644 --- a/tests/unit_test/maxpool.c +++ b/tests/unit_test/maxpool.c @@ -16,113 +16,112 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/maxpool.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_maxpool2d(void *input_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, int out_c, int out_h, int out_w, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct pool_params params; - params.base.name = "params"; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = 
kernel_w; - params.pad_left = pad_w; - params.pad_right = pad_w; - params.pad_top = pad_h; - params.pad_down = pad_h; + int out_size = csinn_tensor_size(output); + + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_w; + params->pad_right = pad_w; + params->pad_top = pad_h; + params->pad_down = pad_h; input->data = input_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, ¶ms); + func(input, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of maxpool for RVV.\n"); - verify_maxpool2d(maxpool2x2s2_fp32_in, maxpool2x2s2_fp32_out, csi_nn_rvv_maxpool2x2s2_fp32, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); - verify_maxpool2d(maxpool2x2s2_fp16_in, maxpool2x2s2_fp16_out, csi_nn_rvv_maxpool2x2s2_fp16, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); - verify_maxpool2d(maxpool2x2s2_int8_in, maxpool2x2s2_int8_out, csi_nn_rvv_maxpool2x2s2_int8, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_INT8); + verify_maxpool2d(maxpool2x2s2_fp32_in, maxpool2x2s2_fp32_out, shl_rvv_maxpool2x2s2_fp32, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); + verify_maxpool2d(maxpool2x2s2_fp16_in, maxpool2x2s2_fp16_out, shl_rvv_maxpool2x2s2_fp16, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); + verify_maxpool2d(maxpool2x2s2_int8_in, maxpool2x2s2_int8_out, shl_rvv_maxpool2x2s2_int8, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, 
CSINN_DTYPE_INT8); verify_maxpool2d(maxpool2x2s2_p1_fp32_in, maxpool2x2s2_p1_fp32_out, - csi_nn_rvv_maxpool2x2s2_p1_fp32, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_maxpool2x2s2_p1_fp32, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_FLOAT32); verify_maxpool2d(maxpool2x2s2_p1_fp16_in, maxpool2x2s2_p1_fp16_out, - csi_nn_rvv_maxpool2x2s2_p1_fp16, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_maxpool2x2s2_p1_fp16, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_FLOAT16); verify_maxpool2d(maxpool2x2s2_p1_int8_in, maxpool2x2s2_p1_int8_out, - csi_nn_rvv_maxpool2x2s2_p1_int8, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_maxpool2x2s2_p1_int8, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_INT8); - verify_maxpool2d(maxpool3x3s2_fp32_in, maxpool3x3s2_fp32_out, csi_nn_rvv_maxpool3x3s2_fp32, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); - verify_maxpool2d(maxpool3x3s2_fp16_in, maxpool3x3s2_fp16_out, csi_nn_rvv_maxpool3x3s2_fp16, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); - verify_maxpool2d(maxpool3x3s2_int8_in, maxpool3x3s2_int8_out, csi_nn_rvv_maxpool3x3s2_int8, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_INT8); + verify_maxpool2d(maxpool3x3s2_fp32_in, maxpool3x3s2_fp32_out, shl_rvv_maxpool3x3s2_fp32, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); + verify_maxpool2d(maxpool3x3s2_fp16_in, maxpool3x3s2_fp16_out, shl_rvv_maxpool3x3s2_fp16, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); + verify_maxpool2d(maxpool3x3s2_int8_in, maxpool3x3s2_int8_out, shl_rvv_maxpool3x3s2_int8, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_INT8); verify_maxpool2d(maxpool3x3s2_p1_fp32_in, maxpool3x3s2_p1_fp32_out, - csi_nn_rvv_maxpool3x3s2_p1_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, + shl_rvv_maxpool3x3s2_p1_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_FLOAT32); verify_maxpool2d(maxpool3x3s2_p1_fp16_in, maxpool3x3s2_p1_fp16_out, - csi_nn_rvv_maxpool3x3s2_p1_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 
1, + shl_rvv_maxpool3x3s2_p1_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_FLOAT16); verify_maxpool2d(maxpool3x3s2_p1_int8_in, maxpool3x3s2_p1_int8_out, - csi_nn_rvv_maxpool3x3s2_p1_int8, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, + shl_rvv_maxpool3x3s2_p1_int8, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_INT8); verify_maxpool2d(maxpool3x3s1_p1_fp32_in, maxpool3x3s1_p1_fp32_out, - csi_nn_rvv_maxpool3x3s1_p1_fp32, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_maxpool3x3s1_p1_fp32, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_FLOAT32); verify_maxpool2d(maxpool3x3s1_p1_fp16_in, maxpool3x3s1_p1_fp16_out, - csi_nn_rvv_maxpool3x3s1_p1_fp16, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_maxpool3x3s1_p1_fp16, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_FLOAT16); verify_maxpool2d(maxpool3x3s1_p1_int8_in, maxpool3x3s1_p1_int8_out, - csi_nn_rvv_maxpool3x3s1_p1_int8, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_maxpool3x3s1_p1_int8, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_INT8); - verify_maxpool2d(global_maxpool_fp32_in, global_maxpool_fp32_out, - csi_nn_rvv_global_maxpool2d_fp32, 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, - CSINN_DTYPE_FLOAT32); - verify_maxpool2d(global_maxpool_fp16_in, global_maxpool_fp16_out, - csi_nn_rvv_global_maxpool2d_fp16, 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, - CSINN_DTYPE_FLOAT16); + verify_maxpool2d(global_maxpool_fp32_in, global_maxpool_fp32_out, shl_rvv_global_maxpool2d_fp32, + 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, CSINN_DTYPE_FLOAT32); + verify_maxpool2d(global_maxpool_fp16_in, global_maxpool_fp16_out, shl_rvv_global_maxpool2d_fp16, + 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, CSINN_DTYPE_FLOAT16); return done_testing(); } \ No newline at end of file diff --git a/tests/unit_test/mul.c b/tests/unit_test/mul.c index f8f5f89e..e30f81e8 100644 --- a/tests/unit_test/mul.c +++ b/tests/unit_test/mul.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/basic_math.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_mul(void *input0_data, void *input1_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = 1; input0->dim[1] = in_c; input0->dim[2] = in_h; input0->dim[3] = in_w; input0->dim_count = 4; input0->name = "input0"; - int in0_size = csi_tensor_size(input0); + int in0_size = csinn_tensor_size(input0); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = 1; input1->dim[1] = in_c; input1->dim[2] = in_h; input1->dim[3] = in_w; input1->dim_count = 4; input1->name = "input1"; - int in1_size = csi_tensor_size(input1); + int in1_size = csinn_tensor_size(input1); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct diso_params params; - params.base.name = "params"; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; input0->data = input0_data; input1->data = input1_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input0, input1, output, ¶ms); + func(input0, input1, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input0); - csi_free_tensor(input1); - csi_mem_free(output->data); - csi_free_tensor(output); + 
csinn_free_tensor(input0); + csinn_free_tensor(input1); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of mul for RVV.\n"); - // verify_mul(mul_fp32_in0, mul_fp32_in1, mul_fp32_out, csi_nn_rvv_mul_fp32, 2, 5, 11, + // verify_mul(mul_fp32_in0, mul_fp32_in1, mul_fp32_out, shl_rvv_mul_fp32, 2, 5, 11, // CSINN_DTYPE_FLOAT32); - // verify_mul(mul_fp16_in0, mul_fp16_in1, mul_fp16_out, csi_nn_rvv_mul_fp16, 2, 5, 11, + // verify_mul(mul_fp16_in0, mul_fp16_in1, mul_fp16_out, shl_rvv_mul_fp16, 2, 5, 11, // CSINN_DTYPE_FLOAT16); - // verify_mul(mul_int8_in0, mul_int8_in1, mul_int8_out, csi_nn_rvv_mul_int8, 2, 5, 11, + // verify_mul(mul_int8_in0, mul_int8_in1, mul_int8_out, shl_rvv_mul_int8, 2, 5, 11, // CSINN_DTYPE_INT8); return done_testing(); diff --git a/tests/unit_test/pad.c b/tests/unit_test/pad.c index 8861dbdd..4c324773 100644 --- a/tests/unit_test/pad.c +++ b/tests/unit_test/pad.c @@ -16,12 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/pad.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_pad(void *input_data, void *ref_data, void (*func)(), int in_c, int in_h, int in_w, @@ -31,7 +32,7 @@ void verify_pad(void *input_data, void *ref_data, void (*func)(), int in_c, int int padded_w = in_w + pad_left + pad_right; int out_size = in_c * padded_h * padded_w; - float *out = csi_mem_alloc(out_size * sizeof(float)); + float *out = shl_mem_alloc(out_size * sizeof(float)); if (dtype == CSINN_DTYPE_INT8) { func(input_data, out, in_c, in_h, in_w, padded_h, padded_w, pad_top, pad_left, (int8_t)0); @@ -41,17 +42,17 @@ void verify_pad(void *input_data, void *ref_data, void (*func)(), int in_c, int evaluate_error(out, ref_data, out_size, dtype); - csi_mem_free(out); + shl_mem_free(out); } int main(int argc, char **argv) { init_testsuite("Test function of pad for RVV.\n"); - verify_pad(pad_fp32_in, pad_fp32_out, csi_nn_rvv_pad_input_fp32, 3, 4, 19, 1, 1, 1, 1, + verify_pad(pad_fp32_in, pad_fp32_out, shl_rvv_pad_input_fp32, 3, 4, 19, 1, 1, 1, 1, CSINN_DTYPE_FLOAT32); - verify_pad(pad_fp16_in, pad_fp16_out, csi_nn_rvv_pad_input_fp16, 3, 4, 19, 1, 1, 1, 1, + verify_pad(pad_fp16_in, pad_fp16_out, shl_rvv_pad_input_fp16, 3, 4, 19, 1, 1, 1, 1, CSINN_DTYPE_FLOAT16); - verify_pad(pad_int8_in, pad_int8_out, csi_nn_rvv_pad_input_int8, 3, 4, 19, 1, 1, 1, 1, + verify_pad(pad_int8_in, pad_int8_out, shl_rvv_pad_input_int8, 3, 4, 19, 1, 1, 1, 1, CSINN_DTYPE_INT8); return done_testing(); diff --git a/tests/unit_test/relu.c b/tests/unit_test/relu.c index 332a4aa6..7f444dab 100644 --- a/tests/unit_test/relu.c +++ b/tests/unit_test/relu.c @@ -16,56 +16,56 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/activation.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_relu(void *input_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct relu_params params; - params.base.name = "params"; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; input->data = input_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, ¶ms); + func(input, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of relu for RVV.\n"); - verify_relu(relu_fp32_in, relu_fp32_out, csi_nn_rvv_relu_fp32, 2, 5, 11, CSINN_DTYPE_FLOAT32); - verify_relu(relu_fp16_in, relu_fp16_out, csi_nn_rvv_relu_fp16, 2, 5, 11, CSINN_DTYPE_FLOAT16); - // verify_relu(relu_int8_in, relu_int8_out, csi_nn_rvv_relu_int8, 2, 5, 11, 
CSINN_DTYPE_INT8); + verify_relu(relu_fp32_in, relu_fp32_out, shl_rvv_relu_fp32, 2, 5, 11, CSINN_DTYPE_FLOAT32); + verify_relu(relu_fp16_in, relu_fp16_out, shl_rvv_relu_fp16, 2, 5, 11, CSINN_DTYPE_FLOAT16); + // verify_relu(relu_int8_in, relu_int8_out, shl_rvv_relu_int8, 2, 5, 11, CSINN_DTYPE_INT8); return done_testing(); } diff --git a/tests/unit_test/valid_data/conv2d.dat b/tests/unit_test/valid_data/conv2d.dat index 071fc448..9412ab70 100644 --- a/tests/unit_test/valid_data/conv2d.dat +++ b/tests/unit_test/valid_data/conv2d.dat @@ -1791,4210 +1791,4210 @@ unsigned char conv2d_winograd_fp32_ker[] = { 0x06, 0xcb, 0xbf, 0xbf, 0xd4, 0xee, 0x09, 0xc0, 0x51, 0x4c, 0x6d, 0xc0, 0x7f, 0x9b, 0x05, 0xc0}; unsigned char conv2d_winograd_fp32_ker1[] = { 0xd1, 0x12, 0xc5, 0xbf, 0x07, 0x12, 0xea, 0xbf, 0xae, 0x60, 0xc9, 0xbf, 0x51, 0xc9, 0x83, 0xc0, + 0xee, 0x16, 0xac, 0xbf, 0x7d, 0x67, 0xba, 0xbf, 0x7b, 0x05, 0xd6, 0xbe, 0x76, 0x7e, 0x75, 0xbe, 0x18, 0xa1, 0x9a, 0xbf, 0x5a, 0xf2, 0xab, 0xbf, 0x34, 0x42, 0xf5, 0xbf, 0x6c, 0xc2, 0x6c, 0xc0, + 0x7a, 0xc2, 0x53, 0xc0, 0x84, 0x5c, 0xd5, 0xbf, 0xb5, 0xc6, 0xc2, 0xbf, 0xbd, 0x5b, 0x7a, 0x3e, 0xb3, 0x48, 0x23, 0xc0, 0x1e, 0x78, 0xf7, 0xbd, 0x5f, 0x30, 0x29, 0xc0, 0xae, 0xbe, 0x53, 0xc0, + 0x81, 0x24, 0x23, 0xc0, 0x44, 0x9a, 0x35, 0xc0, 0x74, 0x7d, 0x51, 0xbf, 0xcd, 0x4c, 0xb7, 0xbe, 0xec, 0x61, 0x48, 0xc0, 0xc5, 0xbb, 0xb5, 0xbf, 0x55, 0x22, 0xfc, 0xbf, 0xfa, 0x50, 0xa9, 0xbe, + 0xf4, 0x46, 0xee, 0xbf, 0x64, 0x84, 0x34, 0xc0, 0x20, 0xc1, 0xbb, 0xbf, 0xf5, 0xd5, 0x1e, 0xc0, 0x63, 0x1a, 0x19, 0xc0, 0x2e, 0xe9, 0x07, 0xc0, 0x92, 0xf7, 0x4a, 0xbf, 0xc7, 0x17, 0x8d, 0xbf, + 0xe1, 0x5f, 0x00, 0xc0, 0xf8, 0x3a, 0x14, 0xc0, 0xa8, 0x24, 0xfe, 0xbf, 0xd0, 0x6a, 0xe9, 0xbf, 0xa9, 0x2d, 0x12, 0xc0, 0x1f, 0xf1, 0xbb, 0xbf, 0x7e, 0x8b, 0xdb, 0xbf, 0x46, 0x23, 0x42, 0xc0, + 0x78, 0x6d, 0xe6, 0xbf, 0x3f, 0x25, 0x3e, 0xc0, 0xdc, 0x09, 0xa7, 0xbf, 0xfd, 0x31, 0xae, 0xbf, 0x5d, 0x46, 0x28, 0xbf, 0x5f, 0x9a, 0x10, 0xc0, 0x87, 0x98, 0xba, 0xbf, 0x64, 
0x3f, 0x1d, 0xc0, + 0xc1, 0x09, 0x45, 0xc0, 0x80, 0xa3, 0x9c, 0xbf, 0x8f, 0x27, 0x9e, 0xbf, 0xf0, 0x4f, 0x3c, 0xc0, 0x5b, 0x0e, 0x7d, 0xbf, 0x26, 0x87, 0x70, 0xbf, 0x06, 0x95, 0xdc, 0xbf, 0x03, 0xee, 0x4b, 0xbf, + 0x7f, 0x79, 0x38, 0xc0, 0x6f, 0x66, 0x02, 0xc0, 0xa5, 0xeb, 0xdb, 0xbf, 0x27, 0xe7, 0x20, 0xc0, 0x5f, 0x46, 0x85, 0x3f, 0x2a, 0xf3, 0x42, 0x3f, 0xb6, 0xf5, 0x9d, 0x3f, 0x23, 0x36, 0xfb, 0x3f, + 0xb4, 0x6b, 0x9d, 0x3f, 0xfa, 0x6c, 0x98, 0x3f, 0x1f, 0xb7, 0x72, 0x3f, 0x45, 0xff, 0x8f, 0x3f, 0x94, 0xd1, 0xc5, 0x3f, 0xa1, 0xb6, 0x6f, 0x3f, 0x04, 0x12, 0xba, 0x3f, 0xd8, 0x0f, 0x67, 0x3f, + 0x8f, 0xcd, 0xb6, 0x3f, 0xe9, 0x87, 0x2f, 0x3f, 0x39, 0x3b, 0x5a, 0x3f, 0xb8, 0x10, 0x85, 0x3f, 0xb4, 0x4c, 0x9e, 0x3f, 0xd8, 0xcb, 0xf5, 0x3e, 0xab, 0xf9, 0xb0, 0x3f, 0x52, 0x72, 0x9f, 0x3f, + 0x6b, 0x3a, 0x8f, 0x3f, 0x99, 0x81, 0xf7, 0x3f, 0xa8, 0x8f, 0xa4, 0x3f, 0x3e, 0x75, 0x00, 0x3f, 0x4e, 0xa0, 0x05, 0x40, 0xa7, 0xe1, 0x62, 0x3f, 0x0f, 0x6f, 0xb9, 0x3f, 0xd8, 0x65, 0xa6, 0x3f, + 0x08, 0xa0, 0xbf, 0x3f, 0x32, 0xdc, 0x85, 0x3f, 0xef, 0x2c, 0x9d, 0x3f, 0xb0, 0x4b, 0xf2, 0x3f, 0xb7, 0x00, 0xda, 0x3f, 0x7e, 0xed, 0x94, 0x3f, 0xb8, 0xd9, 0x3b, 0x3e, 0x40, 0xaa, 0x9c, 0x3f, + 0xcb, 0x87, 0x85, 0x3f, 0xca, 0x64, 0xa2, 0x3f, 0x46, 0x70, 0xb0, 0x3f, 0x14, 0x19, 0xff, 0x3f, 0x00, 0x2d, 0xb4, 0x3f, 0x44, 0x1f, 0xe7, 0x3f, 0x8c, 0x47, 0x23, 0x3f, 0xf0, 0xd8, 0xe8, 0x3f, + 0x06, 0x10, 0xbb, 0x3f, 0xd7, 0x3a, 0x93, 0x3f, 0xfa, 0xe4, 0xe9, 0x3f, 0xad, 0xaf, 0x55, 0x3f, 0xe8, 0xd0, 0xa2, 0x3f, 0x37, 0xd4, 0xa1, 0x3f, 0x0e, 0x4b, 0x14, 0x3f, 0x6d, 0x33, 0x81, 0x3f, + 0xec, 0xfe, 0x08, 0x40, 0x43, 0x0a, 0xb1, 0x3f, 0xcc, 0xbf, 0xc6, 0x3f, 0x60, 0xdb, 0xb0, 0x3f, 0xae, 0x61, 0x90, 0x3f, 0x7c, 0x31, 0x95, 0x3f, 0x5f, 0xfd, 0xed, 0x3f, 0xce, 0x86, 0x08, 0x3f, + 0x01, 0x51, 0xcc, 0x3f, 0x22, 0xc7, 0xdc, 0x3f, 0xed, 0xc7, 0xed, 0x3f, 0x46, 0x4a, 0xdc, 0x3f, 0x5e, 0xa7, 0x1a, 0x3f, 0xb0, 0x13, 0x9f, 0x3e, 0xc9, 0x50, 0x3c, 0xbe, 0x6e, 0x2a, 0x48, 0x3f, + 0x0a, 0x58, 0x94, 0x3e, 0x10, 0xef, 
0xd4, 0xbc, 0x00, 0xe3, 0xd7, 0x3a, 0xd3, 0x84, 0x9b, 0x3e, 0x70, 0x59, 0x6f, 0x3e, 0xd0, 0xe9, 0x23, 0x3e, 0xd4, 0xdf, 0x48, 0x3f, 0x22, 0x8f, 0x39, 0x3f, + 0xd8, 0x2c, 0x10, 0x3f, 0x0e, 0xa9, 0xc6, 0x3e, 0x9b, 0x83, 0xdd, 0xbd, 0x48, 0x85, 0xaf, 0xbd, 0x35, 0xad, 0xc3, 0x3e, 0x14, 0x57, 0x12, 0xbd, 0xbf, 0x5a, 0x8e, 0xbd, 0xb4, 0x4d, 0xbf, 0x3e, + 0xe6, 0x30, 0x87, 0x3f, 0x0b, 0x2f, 0x89, 0x3e, 0x27, 0x42, 0x24, 0x3f, 0x77, 0xcd, 0x86, 0xbe, 0x72, 0x8e, 0xa7, 0x3f, 0xbc, 0xac, 0x13, 0x3f, 0x6a, 0x3d, 0x64, 0x3e, 0x4b, 0xf9, 0xb5, 0xbe, + 0xda, 0xb2, 0x25, 0x3e, 0x5e, 0x46, 0x99, 0x3f, 0xb9, 0x03, 0xa5, 0x3e, 0x3e, 0x8e, 0x09, 0x3f, 0xc4, 0xa4, 0xfa, 0x3d, 0x83, 0xe3, 0x34, 0xbe, 0x37, 0xff, 0x94, 0x3e, 0x3a, 0x1b, 0x5a, 0x3e, + 0x6f, 0xd3, 0x37, 0x3f, 0x08, 0x1b, 0x39, 0x3f, 0xec, 0xce, 0xbc, 0x3e, 0xa8, 0xdd, 0xde, 0x3e, 0x64, 0x3a, 0x6c, 0x3e, 0x5c, 0x76, 0x25, 0x3e, 0xa8, 0x77, 0x47, 0x3f, 0x70, 0x40, 0x48, 0x3f, + 0xc6, 0xfd, 0x1c, 0x3f, 0x71, 0xcb, 0x0f, 0x3f, 0x08, 0xce, 0xfe, 0x3d, 0x65, 0x5f, 0x68, 0x3e, 0xd4, 0xe8, 0x41, 0x3f, 0x46, 0xfa, 0x0b, 0x3f, 0x82, 0x76, 0x1d, 0x3e, 0x74, 0xe0, 0x0c, 0x3f, + 0xe8, 0x52, 0x9f, 0x3e, 0x2c, 0x3f, 0xc0, 0x3d, 0x42, 0xdb, 0x3f, 0x3f, 0xdf, 0xcc, 0xf5, 0x3e, 0x4c, 0xb5, 0xd0, 0x3d, 0xb0, 0xee, 0x4a, 0x3f, 0xf4, 0x4e, 0x53, 0x3e, 0xa6, 0xf8, 0x01, 0x3f, + 0x7c, 0x3b, 0x6a, 0x3f, 0xc6, 0x9b, 0x14, 0x3f, 0xe6, 0xa6, 0x73, 0x3f, 0xe1, 0x9d, 0xa3, 0x3e, 0xc4, 0x4e, 0x0a, 0xbe, 0xeb, 0xea, 0x8c, 0xbd, 0xdd, 0xde, 0xfc, 0xbd, 0x2e, 0xea, 0x48, 0xbe, + 0x45, 0xd9, 0x1d, 0xbe, 0x48, 0xdf, 0x03, 0xbe, 0x08, 0x72, 0x03, 0xbe, 0x53, 0x2a, 0x34, 0xbe, 0x8b, 0x1d, 0x50, 0xbe, 0xae, 0x57, 0xd4, 0xbd, 0x98, 0x0e, 0x46, 0xbe, 0xb5, 0xe8, 0x45, 0xbd, + 0x96, 0x42, 0x07, 0xbe, 0x8f, 0x93, 0x88, 0xbd, 0xd8, 0xeb, 0x92, 0xbd, 0xba, 0xa2, 0x23, 0xbe, 0x88, 0xe2, 0xf4, 0xbd, 0xfe, 0x91, 0x87, 0xbd, 0x9a, 0x27, 0xed, 0xbd, 0x42, 0x12, 0xc3, 0xbd, + 0xdc, 0xf0, 0x0a, 0xbe, 0x04, 0xdf, 0x55, 0xbe, 0x9f, 0x64, 0x4a, 0xbe, 0x84, 0x7f, 0x4d, 0xbd, 
0x68, 0x6d, 0x8c, 0xbe, 0x33, 0x67, 0xea, 0xbd, 0x62, 0xb2, 0x26, 0xbe, 0x38, 0x31, 0x2a, 0xbe, + 0xcb, 0xb1, 0x2e, 0xbe, 0x27, 0x55, 0xfb, 0xbd, 0x54, 0x0b, 0x1b, 0xbe, 0xb3, 0x8d, 0x69, 0xbe, 0xe6, 0x36, 0x3a, 0xbe, 0xb6, 0x5d, 0xc2, 0xbd, 0xb2, 0x1e, 0x80, 0xbc, 0x53, 0x47, 0x21, 0xbe, + 0x09, 0x89, 0x00, 0xbe, 0x5f, 0xd6, 0x18, 0xbe, 0xea, 0xd5, 0x22, 0xbe, 0x6b, 0x14, 0x85, 0xbe, 0x80, 0x0f, 0x16, 0xbe, 0x7a, 0x80, 0x6b, 0xbe, 0xb9, 0xa1, 0x9e, 0xbd, 0x55, 0xed, 0x57, 0xbe, + 0x10, 0x6d, 0x42, 0xbe, 0x33, 0x0c, 0xd0, 0xbd, 0xf6, 0x7f, 0x72, 0xbe, 0xb0, 0xc1, 0xba, 0xbd, 0x78, 0xb9, 0x53, 0xbe, 0x95, 0x11, 0x11, 0xbe, 0x54, 0x5c, 0x3c, 0xbd, 0xb7, 0xb3, 0xc6, 0xbd, + 0xa3, 0xa2, 0x6f, 0xbe, 0xe4, 0x7b, 0x2f, 0xbe, 0x20, 0xb2, 0x6a, 0xbe, 0x0b, 0x60, 0x08, 0xbe, 0x1a, 0xbc, 0x10, 0xbe, 0x4d, 0x8c, 0x3b, 0xbe, 0xcb, 0x54, 0x6d, 0xbe, 0x6c, 0x70, 0xa1, 0xbd, + 0xc6, 0xa3, 0x41, 0xbe, 0xc0, 0x1b, 0x61, 0xbe, 0x07, 0xb6, 0x89, 0xbe, 0x1a, 0xe5, 0x42, 0xbe, 0x3c, 0x19, 0xbb, 0xbd, 0xa7, 0x5a, 0xc2, 0xbc, 0xec, 0x1c, 0x96, 0x3c, 0x15, 0x06, 0xa0, 0xbd, + 0xdb, 0x52, 0x76, 0xbd, 0x88, 0xa3, 0xe8, 0xbb, 0xa2, 0x1c, 0x0a, 0xbd, 0x8c, 0x24, 0xc0, 0xbd, 0x3e, 0x97, 0x93, 0xbd, 0xf6, 0x6a, 0xd5, 0xbc, 0xea, 0x8c, 0x01, 0xbe, 0x85, 0x35, 0xfa, 0xbc, + 0x4e, 0xc0, 0x3a, 0xbd, 0x50, 0x3b, 0x17, 0xbd, 0xc6, 0x42, 0xc7, 0x3c, 0x22, 0xa1, 0x41, 0xbd, 0x8e, 0xc0, 0x0b, 0xbd, 0x6a, 0x7a, 0x6f, 0xbc, 0xf8, 0xf6, 0xf0, 0x3c, 0x5d, 0xd3, 0x03, 0xbc, + 0xd8, 0x82, 0x04, 0xbe, 0x2d, 0x36, 0x2d, 0xbd, 0x8f, 0x72, 0x08, 0xbe, 0xe5, 0xc0, 0xd7, 0x3c, 0x16, 0x19, 0x49, 0xbe, 0xaa, 0x09, 0xab, 0xbd, 0xea, 0xb1, 0x24, 0xbd, 0x80, 0xe2, 0x2d, 0xba, + 0x68, 0xdb, 0x17, 0xbd, 0xb7, 0x32, 0x0d, 0xbe, 0x1a, 0x3a, 0x79, 0xbd, 0x7e, 0x7a, 0xbd, 0xbd, 0x58, 0xc1, 0xc2, 0xbc, 0x2c, 0x31, 0x20, 0x3d, 0x10, 0x3c, 0xd8, 0xbc, 0x62, 0x07, 0x67, 0xbd, + 0x8c, 0x7b, 0xbe, 0xbd, 0xe7, 0xed, 0xc1, 0xbd, 0x1c, 0xc9, 0x6d, 0xbd, 0x04, 0x4f, 0xd5, 0xbd, 0x84, 0x57, 0xec, 0xbc, 0xfd, 0x4c, 0x86, 0xbd, 0x02, 0x95, 
0xbb, 0xbd, 0xea, 0x7f, 0xdb, 0xbd, + 0x1a, 0x25, 0xd7, 0xbd, 0x02, 0x08, 0x2f, 0xbd, 0xc3, 0x3f, 0x88, 0xbd, 0xea, 0x20, 0xf9, 0xbc, 0x14, 0x09, 0x1f, 0xbe, 0x70, 0x31, 0x8f, 0xbd, 0x6c, 0x3a, 0x61, 0xbb, 0x32, 0x5d, 0x51, 0xbd, + 0x2d, 0x3a, 0x51, 0xbd, 0x5e, 0xdb, 0x2d, 0xbd, 0x64, 0x70, 0x18, 0xbe, 0xda, 0x32, 0x30, 0xbd, 0xe3, 0xa8, 0x1e, 0xbd, 0xca, 0x5d, 0x15, 0xbe, 0xfa, 0x23, 0x88, 0xbd, 0xe5, 0x31, 0x9c, 0xbd, + 0x52, 0xc2, 0xf7, 0xbd, 0xe7, 0xdb, 0xd7, 0xbd, 0x79, 0xa8, 0x36, 0xbe, 0x6e, 0x8b, 0x4d, 0xbd, 0xe0, 0x1a, 0x6a, 0xbd, 0x57, 0xe6, 0x61, 0xbd, 0x3b, 0x2f, 0x99, 0xbd, 0x28, 0xa4, 0x07, 0xbe, + 0x01, 0xed, 0x84, 0xbd, 0x22, 0xd4, 0x8d, 0xbd, 0xc0, 0x14, 0x2e, 0xbd, 0x68, 0x9c, 0x2d, 0xbd, 0x23, 0xf1, 0x99, 0xbd, 0x58, 0x56, 0x63, 0xbd, 0x04, 0xf5, 0x9c, 0xbd, 0xcd, 0x08, 0xb1, 0xbd, + 0xe7, 0x33, 0xd0, 0xbd, 0xc9, 0x3e, 0x47, 0xbd, 0x78, 0x5b, 0x70, 0xbd, 0x37, 0x60, 0x13, 0xbd, 0xe0, 0x47, 0xac, 0xbd, 0x5b, 0xa2, 0xa7, 0xbc, 0xdd, 0x32, 0xc6, 0xbd, 0x89, 0x08, 0xc7, 0xbd, + 0x79, 0xe8, 0x91, 0xbd, 0xd1, 0x7f, 0xee, 0xbd, 0x58, 0x7c, 0x5c, 0xbd, 0x76, 0xfd, 0xe5, 0xbc, 0xf7, 0xb5, 0xe9, 0xbd, 0xb8, 0x84, 0x4b, 0xbd, 0x41, 0xca, 0xac, 0xbd, 0x46, 0x73, 0x70, 0xbd, + 0x89, 0x5f, 0xae, 0xbd, 0xda, 0xef, 0x91, 0xbd, 0x57, 0x1f, 0x88, 0xbd, 0x92, 0x55, 0xd8, 0xbd, 0xd7, 0x52, 0xd1, 0xbd, 0x59, 0x5d, 0xa6, 0xbd, 0x0b, 0xce, 0x86, 0xbc, 0xa0, 0x57, 0x7c, 0xbd, + 0xbc, 0x33, 0x82, 0xbd, 0x5b, 0xfb, 0x9d, 0xbd, 0xba, 0x31, 0xa4, 0xbd, 0xfa, 0x28, 0xcc, 0xbd, 0x5c, 0x29, 0xb4, 0xbd, 0x92, 0xa7, 0xb8, 0xbd, 0x32, 0x2a, 0x2f, 0xbd, 0x33, 0x39, 0xdf, 0xbd, + 0x5d, 0xfc, 0x9d, 0xbd, 0x8e, 0x5e, 0xaf, 0xbd, 0xfa, 0xb1, 0xb5, 0xbd, 0x4c, 0x7f, 0x51, 0xbd, 0x7c, 0x78, 0x49, 0xbd, 0xa3, 0x38, 0xa0, 0xbd, 0xda, 0x9e, 0x34, 0xbd, 0x5a, 0x4c, 0x93, 0xbd, + 0x04, 0xc2, 0x02, 0xbe, 0x5e, 0x97, 0x91, 0xbd, 0x09, 0x3d, 0x8e, 0xbd, 0xe2, 0x41, 0xc2, 0xbd, 0x94, 0x85, 0x6b, 0xbd, 0x16, 0x96, 0x4a, 0xbd, 0x42, 0x55, 0xc4, 0xbd, 0x1e, 0x9b, 0xdc, 0xbc, + 0x96, 0x8d, 0xc5, 
0xbd, 0x12, 0x27, 0xbb, 0xbd, 0xe8, 0xf2, 0xb0, 0xbd, 0xe3, 0xce, 0xd1, 0xbd, 0x94, 0x96, 0x10, 0xbd, 0x17, 0x92, 0x05, 0xbd, 0xc4, 0x85, 0x7f, 0xbb, 0x2c, 0x61, 0x96, 0xbd, + 0xca, 0xa1, 0x92, 0xbc, 0xaf, 0xcf, 0x09, 0xbc, 0xf6, 0x07, 0x9e, 0x3b, 0xb0, 0x89, 0xad, 0xba, 0xba, 0xf9, 0x1c, 0xbc, 0xcd, 0xb2, 0x88, 0xbc, 0xac, 0xe6, 0x30, 0xbd, 0x51, 0xd5, 0x9e, 0xbd, + 0xcb, 0x42, 0x6f, 0xbd, 0xe1, 0x48, 0x0a, 0xbd, 0xb8, 0x7b, 0x2e, 0xbc, 0x56, 0x29, 0xa6, 0x3c, 0x7f, 0x8d, 0x29, 0xbd, 0x18, 0x8c, 0xae, 0x3b, 0xc7, 0x00, 0xc6, 0xbc, 0x3e, 0x79, 0x5b, 0xbd, + 0x75, 0x7a, 0x8b, 0xbd, 0xb2, 0xdc, 0x07, 0xbd, 0x6f, 0x30, 0xb1, 0xbc, 0xff, 0xc4, 0x26, 0x3c, 0x3d, 0xf4, 0x99, 0xbd, 0x30, 0x27, 0x0c, 0xbd, 0x65, 0x11, 0xbd, 0xbc, 0xcc, 0x26, 0xc5, 0x3c, + 0x62, 0x92, 0x96, 0xbc, 0xfe, 0x77, 0xa1, 0xbd, 0x28, 0x8a, 0xad, 0xbc, 0x3a, 0x0a, 0x1b, 0xbd, 0x70, 0xd0, 0xbd, 0xbc, 0x98, 0x11, 0x69, 0xbc, 0xba, 0xdc, 0xb2, 0xbc, 0xaa, 0x32, 0x35, 0xbc, + 0xf2, 0xd0, 0x41, 0xbd, 0xde, 0x37, 0x4c, 0xbd, 0x5e, 0x38, 0xf3, 0xbc, 0x9a, 0x9e, 0xb2, 0xbc, 0x32, 0x93, 0xee, 0xbc, 0xb6, 0x6c, 0x02, 0xbc, 0x7c, 0x1d, 0x4c, 0xbd, 0xa6, 0x17, 0x6a, 0xbd, + 0xb4, 0x43, 0x0e, 0xbd, 0xec, 0x34, 0x66, 0xbd, 0x90, 0x3c, 0x6a, 0xbb, 0xac, 0x0b, 0xaa, 0xbc, 0x66, 0x2f, 0xc0, 0xbc, 0x8b, 0x7f, 0x2d, 0xbd, 0x07, 0xf5, 0xba, 0xbc, 0x96, 0x93, 0x48, 0xbd, + 0xe0, 0x5f, 0x14, 0xbd, 0x12, 0x25, 0xd9, 0xbb, 0x32, 0xed, 0xef, 0xbc, 0x1c, 0xdd, 0x4b, 0xbd, 0x90, 0x0e, 0xcb, 0xbb, 0x1f, 0x72, 0xfc, 0xbc, 0xa0, 0x93, 0x58, 0xbc, 0x10, 0x1e, 0xd2, 0xbc, + 0xf3, 0x95, 0x7f, 0xbd, 0x8a, 0xf2, 0x0b, 0xbd, 0xa6, 0x5e, 0x28, 0xbd, 0x48, 0x99, 0x04, 0xbd, 0xe7, 0x63, 0x0a, 0xc0, 0x35, 0x75, 0x15, 0xbf, 0xe4, 0x22, 0x4a, 0xbf, 0x23, 0xa2, 0x03, 0xc0, + 0x77, 0xc6, 0x04, 0xc0, 0x4d, 0x11, 0x95, 0xbf, 0x14, 0x06, 0xdc, 0xbf, 0xa1, 0x64, 0x3e, 0xc0, 0xce, 0xe3, 0x32, 0xc0, 0xdd, 0xd4, 0x8f, 0xbf, 0x11, 0xb1, 0x47, 0xc0, 0xcb, 0x3f, 0x1a, 0x3d, + 0x01, 0xfc, 0x95, 0xbf, 0xf2, 0xb6, 0x3f, 0xbf, 0x50, 0xab, 0x1c, 0xbe, 0x2c, 
0x01, 0x19, 0xc0, 0x9f, 0xac, 0x8b, 0xbf, 0x10, 0x02, 0x61, 0xbf, 0xfd, 0x30, 0x9f, 0xbe, 0x4b, 0x7e, 0xab, 0xbe, + 0x3a, 0x14, 0x16, 0xc0, 0xc3, 0x6c, 0x07, 0xc0, 0x75, 0x27, 0x61, 0xc0, 0xf8, 0xe8, 0x36, 0xbe, 0x81, 0x63, 0x90, 0xc0, 0x6a, 0xa4, 0xef, 0xbf, 0xd2, 0x48, 0xe5, 0xbf, 0xad, 0xb4, 0xe5, 0xbf, + 0x6b, 0x7b, 0xef, 0xbf, 0x7f, 0x82, 0x0e, 0xc0, 0x09, 0x5b, 0x01, 0xc0, 0x34, 0x1f, 0x3f, 0xc0, 0xfe, 0x8b, 0xdb, 0xbf, 0x3c, 0x40, 0xc6, 0xbd, 0x07, 0xa4, 0x8c, 0xbe, 0x79, 0x5f, 0x08, 0xc0, + 0xa3, 0x7f, 0xfa, 0xbf, 0xa0, 0x95, 0x0a, 0xc0, 0x5a, 0x0c, 0xf9, 0xbf, 0x24, 0xf5, 0x68, 0xc0, 0x58, 0x7a, 0xb3, 0xbf, 0x42, 0x4f, 0x3d, 0xc0, 0x9c, 0x8b, 0xbc, 0xbf, 0x07, 0x75, 0x34, 0xc0, + 0x0b, 0x8a, 0x37, 0xc0, 0x8a, 0x7d, 0x61, 0xbf, 0x28, 0x87, 0x45, 0xc0, 0x7a, 0x8e, 0x83, 0xbf, 0x64, 0x2c, 0x7a, 0xc0, 0x4d, 0x62, 0xe8, 0xbf, 0xe5, 0x29, 0x44, 0xbe, 0x6e, 0xb1, 0x86, 0xbf, + 0x24, 0x03, 0x1c, 0xc0, 0x3c, 0x5e, 0x06, 0xc0, 0x53, 0x6f, 0x7c, 0xc0, 0xf7, 0x90, 0x9f, 0xbf, 0x18, 0xae, 0xe3, 0xbf, 0x25, 0xdc, 0x5d, 0xc0, 0xa3, 0x29, 0x3b, 0xc0, 0x61, 0xd8, 0xc5, 0xbf, + 0x18, 0x23, 0x31, 0xc0, 0x36, 0x91, 0x49, 0xc0, 0x7d, 0x4c, 0x93, 0xc0, 0xcf, 0xf0, 0x04, 0xc0, 0xf9, 0x86, 0x92, 0x3f, 0x40, 0x6d, 0xa8, 0x3f, 0x18, 0x77, 0x7f, 0x3f, 0x96, 0x6b, 0x06, 0x40, + 0xf9, 0xc8, 0xb3, 0x3f, 0x24, 0x2f, 0xd5, 0x3f, 0xd3, 0x7f, 0x76, 0x3f, 0xd1, 0x13, 0x6e, 0x3f, 0xaa, 0xe2, 0x58, 0x3f, 0xd0, 0xa1, 0x85, 0x3f, 0x09, 0xf4, 0xb2, 0x3f, 0x65, 0xe8, 0xbd, 0x3f, + 0x51, 0xae, 0xca, 0x3f, 0x74, 0xa1, 0x9c, 0x3f, 0x10, 0x0f, 0x9e, 0x3f, 0x2e, 0xc9, 0x8b, 0x3f, 0xcb, 0xe4, 0xe5, 0x3f, 0xd7, 0xbe, 0x86, 0x3f, 0xd3, 0xd7, 0xcd, 0x3f, 0x52, 0x4c, 0xb7, 0x3f, + 0x9e, 0x1c, 0xe7, 0x3f, 0x06, 0x7e, 0xa9, 0x3f, 0xad, 0x58, 0x8f, 0x3f, 0xcc, 0x89, 0x2f, 0x3f, 0x84, 0x48, 0xed, 0x3f, 0xc0, 0xd0, 0x9a, 0x3f, 0x02, 0x71, 0xb4, 0x3f, 0x0c, 0xaa, 0x31, 0x3f, + 0xb3, 0xab, 0x76, 0x3f, 0x89, 0x1e, 0xd0, 0x3f, 0x66, 0xd0, 0x97, 0x3f, 0xa7, 0x3e, 0xa3, 0x3f, 0x54, 0x72, 0x9a, 0x3f, 0x78, 0x1b, 0x94, 
0x3f, 0x44, 0x9c, 0x57, 0x3f, 0x7d, 0x45, 0x9a, 0x3f, + 0x58, 0xf4, 0x9f, 0x3f, 0xd8, 0x73, 0xd1, 0x3f, 0xcc, 0x2b, 0xd9, 0x3f, 0x74, 0xf4, 0x9e, 0x3f, 0xee, 0x3d, 0xc8, 0x3f, 0xf5, 0xc7, 0xc4, 0x3f, 0x64, 0x7a, 0x82, 0x3f, 0x91, 0xa9, 0xd8, 0x3f, + 0xf3, 0x78, 0x8d, 0x3f, 0xd2, 0xf1, 0x82, 0x3f, 0x47, 0x2c, 0xba, 0x3f, 0xf3, 0xb0, 0x89, 0x3f, 0x22, 0x34, 0x85, 0x3f, 0x59, 0xe8, 0xb6, 0x3f, 0xa8, 0x50, 0x89, 0x3f, 0x3c, 0xeb, 0xd3, 0x3f, + 0x93, 0xc6, 0xc8, 0x3f, 0x0a, 0x3c, 0x00, 0x40, 0xb1, 0x8d, 0x6d, 0x3f, 0x9c, 0x6f, 0xad, 0x3f, 0x44, 0xd2, 0x88, 0x3f, 0xfa, 0x04, 0x98, 0x3f, 0x18, 0xe1, 0xc9, 0x3f, 0x15, 0xf6, 0x9e, 0x3f, + 0xaf, 0x1f, 0xb8, 0x3f, 0x39, 0x2e, 0x39, 0x3f, 0xb5, 0xef, 0x73, 0x3f, 0xec, 0x08, 0xb4, 0x3f, 0x9a, 0x2f, 0x51, 0xbf, 0x72, 0x90, 0x48, 0xbf, 0x70, 0x98, 0x45, 0xbf, 0xde, 0x02, 0x8a, 0xbf, + 0xa8, 0x20, 0x58, 0xbf, 0x2d, 0x3e, 0x6c, 0xbf, 0xab, 0xd5, 0x44, 0xbf, 0xd0, 0x8c, 0x62, 0xbf, 0x8e, 0x63, 0x68, 0xbf, 0x60, 0x2e, 0x60, 0xbf, 0x5c, 0x1c, 0x7b, 0xbf, 0xe9, 0xc8, 0x6b, 0xbf, + 0x5c, 0x0b, 0x56, 0xbf, 0x08, 0x9d, 0x46, 0xbf, 0xbc, 0xa1, 0x56, 0xbf, 0xd1, 0x72, 0x6f, 0xbf, 0xac, 0xdb, 0x51, 0xbf, 0x04, 0x0c, 0x42, 0xbf, 0x01, 0x92, 0x46, 0xbf, 0xec, 0x50, 0x51, 0xbf, + 0x10, 0xc5, 0x77, 0xbf, 0xe6, 0xf5, 0x90, 0xbf, 0xf8, 0x3c, 0x76, 0xbf, 0xc3, 0x3a, 0x3b, 0xbf, 0xcf, 0x1f, 0x8e, 0xbf, 0xc8, 0x59, 0x62, 0xbf, 0xf8, 0xb7, 0x5e, 0xbf, 0x86, 0x9a, 0x3f, 0xbf, + 0x8e, 0x4c, 0x75, 0xbf, 0xc0, 0x94, 0x71, 0xbf, 0x40, 0x7e, 0x46, 0xbf, 0x9a, 0x67, 0x86, 0xbf, 0xae, 0xad, 0x77, 0xbf, 0xea, 0x83, 0x46, 0xbf, 0xc0, 0x07, 0x2c, 0xbf, 0x45, 0xcb, 0x0b, 0xbf, + 0x32, 0xcd, 0x45, 0xbf, 0xf1, 0xe1, 0x64, 0xbf, 0x4d, 0xc3, 0x77, 0xbf, 0xa0, 0x77, 0x5b, 0xbf, 0xc4, 0xdc, 0xa0, 0xbf, 0xc0, 0xe5, 0x84, 0xbf, 0x6b, 0x8d, 0x27, 0xbf, 0x1c, 0x19, 0x89, 0xbf, + 0xff, 0x35, 0x87, 0xbf, 0x76, 0xfb, 0x19, 0xbf, 0x4c, 0x89, 0x84, 0xbf, 0xb2, 0x47, 0x4a, 0xbf, 0xf6, 0x7f, 0x6b, 0xbf, 0x9e, 0xe5, 0x70, 0xbf, 0xec, 0xf6, 0x44, 0xbf, 0xc5, 0x7b, 0x66, 0xbf, + 
0x91, 0x84, 0x73, 0xbf, 0x06, 0x90, 0x5c, 0xbf, 0x96, 0xf7, 0x64, 0xbf, 0xb8, 0x69, 0x8b, 0xbf, 0xef, 0x54, 0x5e, 0xbf, 0x96, 0x1a, 0x72, 0xbf, 0x28, 0x1e, 0x8c, 0xbf, 0x14, 0xea, 0x41, 0xbf, + 0x8f, 0x72, 0x89, 0xbf, 0xc6, 0x35, 0x2a, 0xbf, 0x4b, 0x9c, 0x4e, 0xbf, 0x52, 0x8a, 0x53, 0xbf, 0xc7, 0xfb, 0x8c, 0xbe, 0x2f, 0x6a, 0xc6, 0xbe, 0xaa, 0x1a, 0x90, 0xbe, 0x4a, 0x9f, 0xa1, 0xbe, + 0xc0, 0x12, 0xcd, 0xbe, 0x6c, 0x28, 0x9a, 0xbe, 0x20, 0x4d, 0xba, 0xbd, 0x18, 0x85, 0xe2, 0xbd, 0xee, 0x66, 0xa4, 0xbd, 0x8c, 0xe0, 0x28, 0xbe, 0x3e, 0xe2, 0xf4, 0xbe, 0x70, 0x3b, 0x4f, 0xbe, + 0x67, 0xd1, 0x73, 0xbe, 0x00, 0xc4, 0x9f, 0xbe, 0x7c, 0x70, 0x52, 0xbe, 0xa7, 0x6d, 0x50, 0xbe, 0x6c, 0xc4, 0xbc, 0xbe, 0xe1, 0x87, 0x50, 0xbe, 0x42, 0x8f, 0x43, 0xbe, 0xad, 0x1f, 0xa0, 0xbe, + 0xe2, 0xaf, 0xea, 0xbe, 0x3c, 0x82, 0x6c, 0xbe, 0xf1, 0x27, 0x6f, 0xbe, 0xa8, 0x4a, 0x40, 0x3d, 0x6a, 0xbf, 0x1e, 0xbf, 0x95, 0x15, 0x9a, 0xbe, 0x40, 0x19, 0xab, 0xbe, 0xc3, 0x60, 0x0d, 0xbe, + 0x94, 0x65, 0x8d, 0xbd, 0xee, 0xcc, 0xe9, 0xbe, 0xcb, 0xcf, 0x01, 0xbe, 0x54, 0x4e, 0xa9, 0xbe, 0x10, 0x8a, 0xd0, 0xbc, 0xdc, 0xdd, 0x7e, 0xbe, 0x60, 0x56, 0xf1, 0xbd, 0xc5, 0x1f, 0x26, 0xbe, + 0xb6, 0x9b, 0x8c, 0xbe, 0x00, 0x9d, 0xe6, 0xbe, 0x39, 0xa3, 0x1d, 0xbe, 0x2a, 0x00, 0xa3, 0xbd, 0x42, 0x75, 0x4a, 0xbe, 0x30, 0x9e, 0xd9, 0xbe, 0x6d, 0xe4, 0x9f, 0xbe, 0x65, 0x09, 0xdd, 0xbe, + 0xf2, 0x27, 0x64, 0xbe, 0x5b, 0x40, 0x7a, 0xbe, 0xa6, 0x8b, 0x64, 0xbe, 0x49, 0xd9, 0x74, 0xbe, 0xde, 0x94, 0xbe, 0xbe, 0xa6, 0x2d, 0xf2, 0xbe, 0xff, 0x94, 0x5d, 0xbe, 0xb6, 0x3f, 0xa6, 0xbe, + 0x23, 0x15, 0x30, 0xbe, 0x40, 0xba, 0x1b, 0xbf, 0x98, 0x92, 0x1c, 0xbf, 0x13, 0xf2, 0xa1, 0xbe, 0xb0, 0x58, 0x8f, 0xbe, 0xff, 0x1a, 0x9b, 0xbe, 0xfb, 0x22, 0xf3, 0xbe, 0x94, 0x89, 0x0c, 0xbf, + 0xf7, 0x18, 0xda, 0xbe, 0x03, 0x5d, 0x23, 0xbe, 0x24, 0x13, 0xad, 0xbe, 0x41, 0x8b, 0xaa, 0xbe, 0x6e, 0x12, 0xc9, 0x3d, 0x4e, 0x88, 0xbe, 0x3d, 0xb4, 0xcf, 0xc5, 0x3d, 0x74, 0x2b, 0xdc, 0x3d, + 0x7c, 0x7b, 0xcc, 0x3d, 0xf3, 0x9f, 0xc8, 0x3d, 0x92, 0xc8, 
0xb3, 0x3d, 0xe3, 0xb2, 0xdb, 0x3d, 0x2b, 0x40, 0xe5, 0x3d, 0x78, 0xa2, 0xd6, 0x3d, 0x34, 0xde, 0xfe, 0x3d, 0x7e, 0x61, 0xca, 0x3d, + 0x78, 0x23, 0xad, 0x3d, 0xbb, 0xc0, 0xba, 0x3d, 0xd4, 0x4d, 0xc2, 0x3d, 0xea, 0xa0, 0xe9, 0x3d, 0x24, 0xf9, 0xa6, 0x3d, 0x5c, 0xd8, 0xb5, 0x3d, 0xb1, 0x0e, 0x94, 0x3d, 0xf0, 0x71, 0xb9, 0x3d, + 0x62, 0x00, 0xdd, 0x3d, 0x84, 0x94, 0x0c, 0x3e, 0x32, 0xf3, 0xf2, 0x3d, 0xd6, 0x0e, 0xad, 0x3d, 0xea, 0x05, 0x0b, 0x3e, 0x96, 0xde, 0xdb, 0x3d, 0x26, 0x3f, 0xcd, 0x3d, 0x5c, 0xaf, 0xc4, 0x3d, + 0x5b, 0xa6, 0xeb, 0x3d, 0x11, 0xa9, 0xe1, 0x3d, 0x99, 0x34, 0xaa, 0x3d, 0x2e, 0xaf, 0x06, 0x3e, 0x0a, 0x73, 0xd9, 0x3d, 0xe6, 0xb6, 0xb8, 0x3d, 0xd1, 0x01, 0xa1, 0x3d, 0x9e, 0x2c, 0x4c, 0x3d, + 0xa8, 0x2c, 0xb4, 0x3d, 0xe2, 0x12, 0xd1, 0x3d, 0x30, 0x41, 0xc5, 0x3d, 0x1e, 0xbc, 0xba, 0x3d, 0x8a, 0xc2, 0x15, 0x3e, 0x88, 0xc3, 0x00, 0x3e, 0xfb, 0x73, 0xa3, 0x3d, 0x40, 0xd8, 0x00, 0x3e, + 0x57, 0xef, 0x07, 0x3e, 0x7c, 0xf7, 0x8b, 0x3d, 0x45, 0xa6, 0xf1, 0x3d, 0x1e, 0xc9, 0xc1, 0x3d, 0x97, 0xac, 0xf9, 0x3d, 0x14, 0xf6, 0xef, 0x3d, 0xeb, 0x47, 0xb9, 0x3d, 0x78, 0xce, 0xc4, 0x3d, + 0xd2, 0xbf, 0xca, 0x3d, 0xc1, 0x2e, 0xbe, 0x3d, 0x61, 0x05, 0x09, 0x3e, 0x93, 0x3d, 0x09, 0x3e, 0xfc, 0x7e, 0xde, 0x3d, 0xfc, 0x77, 0xf0, 0x3d, 0xdc, 0x9e, 0x0a, 0x3e, 0x03, 0x22, 0xcc, 0x3d, + 0xa7, 0xa5, 0x09, 0x3e, 0xfa, 0x34, 0xab, 0x3d, 0x76, 0x7f, 0xd9, 0x3d, 0x48, 0xf0, 0xbf, 0x3d, 0xb7, 0x3b, 0x34, 0x3d, 0xd8, 0xe4, 0x5a, 0x3d, 0xd5, 0xc0, 0x42, 0x3d, 0x5b, 0x00, 0x00, 0x3d, + 0x20, 0x38, 0x63, 0x3d, 0x5a, 0x96, 0x12, 0x3d, 0x98, 0xc8, 0xa3, 0x3c, 0x5c, 0x71, 0xf4, 0x3c, 0x78, 0x1e, 0xef, 0x3c, 0x5d, 0x21, 0x0a, 0x3d, 0xcf, 0xee, 0x97, 0x3d, 0x33, 0xcc, 0xdc, 0x3c, + 0xa5, 0xaa, 0xca, 0x3c, 0x69, 0x89, 0x37, 0x3d, 0x7b, 0x5f, 0x01, 0x3d, 0xc9, 0x82, 0x27, 0x3d, 0xf0, 0x2f, 0x15, 0x3d, 0xa6, 0xa0, 0x08, 0x3d, 0x1c, 0x81, 0x62, 0x3c, 0x56, 0x15, 0x24, 0x3d, + 0x2e, 0x52, 0x69, 0x3d, 0xe4, 0x0c, 0x41, 0x3d, 0xd2, 0x94, 0x3b, 0x3d, 0x48, 0x8e, 0xda, 0x3b, 0x44, 0xa5, 0xb1, 0x3d, 
0x62, 0xd8, 0x48, 0x3d, 0xbd, 0x05, 0x3f, 0x3d, 0xfc, 0x5a, 0x0f, 0x3d, + 0x7c, 0x33, 0xd6, 0x3c, 0xe0, 0xd4, 0x7b, 0x3d, 0x3c, 0x7e, 0x95, 0x3c, 0x43, 0x16, 0x74, 0x3d, 0x24, 0x29, 0x44, 0x3c, 0x48, 0xc0, 0x19, 0x3d, 0x03, 0x11, 0xbe, 0x3c, 0x5b, 0xd0, 0x3b, 0x3c, + 0x5f, 0x5a, 0x1c, 0x3d, 0x0e, 0x6d, 0x6c, 0x3d, 0x20, 0x93, 0x74, 0x3c, 0x42, 0xb0, 0x5b, 0x3c, 0x65, 0x43, 0x25, 0x3d, 0x56, 0xf0, 0x83, 0x3d, 0x6e, 0xbc, 0x3a, 0x3d, 0x94, 0x7e, 0x7d, 0x3d, + 0xc1, 0x53, 0x4a, 0x3d, 0x93, 0xa9, 0x05, 0x3d, 0xd6, 0x99, 0x16, 0x3d, 0xd7, 0xdc, 0x21, 0x3d, 0xc5, 0x81, 0x89, 0x3d, 0x72, 0x1d, 0x90, 0x3d, 0x2a, 0x0d, 0x10, 0x3d, 0xaf, 0xd6, 0x1d, 0x3d, + 0x2d, 0x9b, 0xac, 0x3c, 0x89, 0x50, 0x8a, 0x3d, 0x5e, 0x20, 0xd8, 0x3d, 0xa7, 0x65, 0x68, 0x3d, 0x39, 0xf0, 0x4b, 0x3d, 0xd3, 0xa7, 0x59, 0x3d, 0xaa, 0x4e, 0x96, 0x3d, 0x6a, 0x6e, 0xa1, 0x3d, + 0xff, 0x9d, 0x8e, 0x3d, 0x4f, 0x6c, 0x07, 0x3d, 0xf6, 0xe0, 0x72, 0x3d, 0x0c, 0xd9, 0x35, 0x3d, 0xa4, 0x8f, 0x3d, 0x3d, 0x1e, 0x53, 0x3e, 0x3d, 0x5c, 0x45, 0x2c, 0x3d, 0x9b, 0x6a, 0x92, 0x3d, + 0xf9, 0x0d, 0x4d, 0x3d, 0x34, 0x07, 0x70, 0x3d, 0x28, 0x2d, 0x33, 0x3d, 0x1e, 0xc1, 0x41, 0x3d, 0xce, 0x28, 0x41, 0x3d, 0xd5, 0x55, 0x45, 0x3d, 0xae, 0x9e, 0x5d, 0x3d, 0xb9, 0x64, 0x68, 0x3d, + 0x0c, 0x55, 0x60, 0x3d, 0x54, 0xba, 0x3a, 0x3d, 0x1e, 0x8e, 0x4a, 0x3d, 0xd6, 0xcc, 0x4f, 0x3d, 0xeb, 0xa5, 0x65, 0x3d, 0xaa, 0xdd, 0x31, 0x3d, 0xae, 0x25, 0x5b, 0x3d, 0x17, 0x56, 0x4f, 0x3d, + 0xe9, 0x73, 0x78, 0x3d, 0xe9, 0x85, 0x7c, 0x3d, 0x8b, 0x3c, 0x54, 0x3d, 0x34, 0x8d, 0x21, 0x3d, 0xe2, 0xac, 0x84, 0x3d, 0x60, 0xf8, 0x4a, 0x3d, 0xbc, 0xc1, 0x54, 0x3d, 0x39, 0x45, 0x1b, 0x3d, + 0x61, 0x6c, 0x51, 0x3d, 0xc7, 0x90, 0x68, 0x3d, 0xa4, 0x92, 0x41, 0x3d, 0x3d, 0x6b, 0x67, 0x3d, 0xdb, 0xac, 0x65, 0x3d, 0x3e, 0x5b, 0x39, 0x3d, 0x4c, 0xae, 0x1a, 0x3d, 0x2b, 0xd9, 0x1d, 0x3d, + 0xdd, 0xed, 0x3d, 0x3d, 0xea, 0xf9, 0x61, 0x3d, 0xdb, 0x4c, 0x80, 0x3d, 0x9d, 0x61, 0x54, 0x3d, 0x94, 0xda, 0x90, 0x3d, 0x42, 0xaa, 0x72, 0x3d, 0x6b, 0x21, 0x1a, 0x3d, 0xd0, 0xfb, 
0x80, 0x3d, + 0x1c, 0x4c, 0x62, 0x3d, 0xc4, 0xb6, 0x15, 0x3d, 0x98, 0xe8, 0x76, 0x3d, 0x98, 0x8f, 0x36, 0x3d, 0xa1, 0x48, 0x41, 0x3d, 0xfe, 0xde, 0x59, 0x3d, 0x96, 0x55, 0x34, 0x3d, 0x48, 0xb0, 0x6a, 0x3d, + 0x0e, 0x63, 0x74, 0x3d, 0xbe, 0xf9, 0x6d, 0x3d, 0xd1, 0x69, 0x29, 0x3d, 0xa6, 0x99, 0x73, 0x3d, 0x2b, 0x09, 0x40, 0x3d, 0x2d, 0xd4, 0x52, 0x3d, 0x23, 0x7f, 0x7b, 0x3d, 0x3e, 0x05, 0x2d, 0x3d, + 0x9c, 0x53, 0x71, 0x3d, 0xe0, 0x56, 0x0f, 0x3d, 0x62, 0xb5, 0x2b, 0x3d, 0x6d, 0xe3, 0x4d, 0x3d, 0x22, 0x36, 0x9d, 0x3c, 0x77, 0x7a, 0xda, 0x3c, 0x24, 0xac, 0x8f, 0x3c, 0xee, 0xa9, 0x08, 0x3d, + 0x19, 0x5d, 0xe4, 0x3c, 0xdc, 0x64, 0xe1, 0x3c, 0xf4, 0x5a, 0x21, 0x3c, 0x48, 0xaa, 0x0c, 0x3c, 0x07, 0x82, 0xbd, 0x3b, 0x2b, 0x10, 0x4f, 0x3c, 0x92, 0x5e, 0xed, 0x3c, 0x8e, 0x6c, 0xaa, 0x3c, + 0x7c, 0xb8, 0xcb, 0x3c, 0x9c, 0x7c, 0xb7, 0x3c, 0x0e, 0xe0, 0x91, 0x3c, 0x42, 0xb5, 0x67, 0x3c, 0xbf, 0x44, 0x09, 0x3d, 0x41, 0xab, 0x80, 0x3c, 0x40, 0xce, 0xc6, 0x3c, 0xa6, 0xdd, 0xcf, 0x3c, + 0x9d, 0x1c, 0x10, 0x3d, 0xa6, 0xc6, 0x87, 0x3c, 0x09, 0x4f, 0x7c, 0x3c, 0xa0, 0xd0, 0x09, 0x3a, 0x34, 0xf3, 0x24, 0x3d, 0xf8, 0x0b, 0xa7, 0x3c, 0xe8, 0x0a, 0xce, 0x3c, 0xd7, 0x1a, 0xf2, 0x3b, + 0x28, 0x97, 0xda, 0x3b, 0x26, 0xd2, 0x04, 0x3d, 0x66, 0xf6, 0x72, 0x3c, 0x04, 0x30, 0xa8, 0x3c, 0xd8, 0xfb, 0x12, 0x3c, 0xf6, 0x08, 0x9b, 0x3c, 0xef, 0xc2, 0x24, 0x3c, 0xd0, 0x79, 0x9e, 0x3c, + 0xc8, 0xdc, 0xaf, 0x3c, 0x90, 0x1d, 0x07, 0x3d, 0xd4, 0xd5, 0xb3, 0x3c, 0x1c, 0x6e, 0x54, 0x3c, 0x90, 0xa3, 0x91, 0x3c, 0x11, 0x27, 0xea, 0x3c, 0x4f, 0x17, 0xa8, 0x3c, 0xd2, 0x0c, 0xfe, 0x3c, + 0x34, 0x5d, 0x5e, 0x3c, 0x22, 0x28, 0x99, 0x3c, 0x7c, 0x1e, 0xa1, 0x3c, 0xca, 0x69, 0x8b, 0x3c, 0x9d, 0x3b, 0xa2, 0x3c, 0xb8, 0x0c, 0xf4, 0x3c, 0x80, 0x28, 0x86, 0x3c, 0x4e, 0x9a, 0xe9, 0x3c, + 0x0f, 0x94, 0xa9, 0x3c, 0x86, 0x1b, 0x3a, 0x3d, 0xda, 0xfe, 0xde, 0x3c, 0xa8, 0xa2, 0xaa, 0x3c, 0x96, 0x04, 0x8f, 0x3c, 0x34, 0x60, 0x9e, 0x3c, 0x2a, 0x20, 0xf9, 0x3c, 0xa4, 0x51, 0x02, 0x3d, + 0x9a, 0x4c, 0xd9, 0x3c, 0x3a, 0x60, 0x1f, 
0x3c, 0xd0, 0x4c, 0x97, 0x3c, 0x56, 0xbf, 0xd1, 0x3c, 0x35, 0x1c, 0xa8, 0x3f, 0xfa, 0xd0, 0xa8, 0x3f, 0xf2, 0x9e, 0xaf, 0x3f, 0xe2, 0x98, 0x84, 0x3f, + 0x50, 0xb6, 0xb2, 0x3f, 0x8a, 0x4d, 0x8b, 0x3f, 0x9e, 0xc6, 0x78, 0x3f, 0x3a, 0xaf, 0xa7, 0x3f, 0x24, 0x1d, 0xb0, 0x3f, 0x83, 0x11, 0xa6, 0x3f, 0x23, 0x4b, 0xf1, 0x3f, 0x58, 0xa2, 0x85, 0x3f, + 0xb5, 0x62, 0x55, 0x3f, 0x75, 0xad, 0x9c, 0x3f, 0x86, 0x96, 0x8e, 0x3f, 0xd4, 0x36, 0xbc, 0x3f, 0xa8, 0xc1, 0x60, 0x3f, 0xe5, 0x34, 0x8e, 0x3f, 0x71, 0x19, 0x11, 0x3f, 0x88, 0x40, 0x8e, 0x3f, + 0x02, 0xa4, 0xb3, 0x3f, 0xe0, 0x2f, 0xdf, 0x3f, 0x25, 0xef, 0xc8, 0x3f, 0xec, 0xaf, 0x5a, 0x3f, 0x32, 0x8b, 0x02, 0x40, 0x64, 0x80, 0xba, 0x3f, 0x2a, 0x5c, 0xa6, 0x3f, 0x07, 0x7c, 0xa6, 0x3f, + 0x8e, 0x82, 0xac, 0x3f, 0x14, 0x2c, 0xc3, 0x3f, 0x25, 0x00, 0x58, 0x3f, 0x82, 0x66, 0xea, 0x3f, 0xeb, 0x85, 0x83, 0x3f, 0x55, 0xe7, 0x92, 0x3f, 0x77, 0x55, 0x6f, 0x3f, 0x80, 0xe0, 0xc6, 0x3e, + 0x16, 0xaa, 0x8d, 0x3f, 0xa7, 0xc2, 0xb1, 0x3f, 0xa2, 0xcb, 0x53, 0x3f, 0x4c, 0xbc, 0x5d, 0x3f, 0xc6, 0xa3, 0xda, 0x3f, 0xf7, 0xa5, 0xe0, 0x3f, 0x32, 0xf5, 0x93, 0x3f, 0x36, 0x24, 0xd8, 0x3f, + 0xc7, 0xeb, 0xe2, 0x3f, 0x58, 0x56, 0x61, 0x3f, 0xec, 0x4f, 0xb0, 0x3f, 0xd0, 0xbc, 0x9e, 0x3f, 0x90, 0xef, 0xee, 0x3f, 0xa4, 0x53, 0xe0, 0x3f, 0x08, 0x97, 0x92, 0x3f, 0xd7, 0xe3, 0x8c, 0x3f, + 0x04, 0x69, 0x75, 0x3f, 0x7b, 0xdb, 0xa6, 0x3f, 0x4e, 0x7a, 0x1d, 0x40, 0x6c, 0x56, 0xe7, 0x3f, 0x2c, 0xef, 0xc1, 0x3f, 0x1f, 0x98, 0xcf, 0x3f, 0x6e, 0x26, 0xfa, 0x3f, 0x07, 0x4c, 0xd9, 0x3f, + 0x1e, 0xd0, 0xf7, 0x3f, 0x8c, 0xd7, 0x90, 0x3f, 0xbe, 0xd2, 0xcf, 0x3f, 0x06, 0xe1, 0x99, 0x3f, 0x64, 0x51, 0x6c, 0x3e, 0x01, 0x28, 0xde, 0x3e, 0xda, 0x7b, 0xd1, 0x3e, 0x93, 0x5b, 0x09, 0x3f, + 0x50, 0xfd, 0xeb, 0xbc, 0x3f, 0x1b, 0xb3, 0x3e, 0xc8, 0x7c, 0xe3, 0x3e, 0x86, 0xb3, 0x3a, 0x3f, 0x5b, 0x1a, 0xfc, 0x3e, 0x9c, 0x39, 0xeb, 0x3d, 0xf6, 0x88, 0x57, 0x3f, 0xc6, 0xc5, 0x78, 0x3f, + 0x72, 0xa0, 0x5d, 0x3f, 0x42, 0xe7, 0x23, 0x3f, 0x97, 0xf6, 0xb1, 0x3e, 0xe0, 0x6d, 0xdc, 0x3c, 0xa4, 
0x1a, 0x44, 0x3f, 0x57, 0x26, 0x9c, 0x3e, 0x68, 0xd1, 0x34, 0x3f, 0x7a, 0x28, 0x48, 0x3f, + 0xd8, 0x6b, 0x6c, 0x3e, 0xcb, 0x21, 0xf9, 0x3e, 0x64, 0x64, 0x03, 0x3f, 0x22, 0xfb, 0x4b, 0x3e, 0x3d, 0x70, 0x43, 0x3f, 0xf8, 0x27, 0x6f, 0x3e, 0x57, 0xa6, 0x54, 0x3f, 0xe5, 0xa9, 0x05, 0x3e, + 0xed, 0x96, 0x94, 0x3d, 0xf6, 0xfe, 0x55, 0x3e, 0x69, 0x78, 0x80, 0x3f, 0xbc, 0x6a, 0x07, 0x3f, 0xaa, 0xf7, 0x0a, 0x3e, 0xee, 0xa9, 0x01, 0x3f, 0x9c, 0xa7, 0xef, 0x3e, 0xa5, 0xac, 0x91, 0x3e, + 0xe0, 0x5c, 0xde, 0x3b, 0x30, 0x13, 0xe5, 0x3e, 0xbe, 0x44, 0x2b, 0x3f, 0x6a, 0x28, 0x04, 0x3f, 0x82, 0xc6, 0x31, 0x3f, 0x72, 0xc8, 0x50, 0x3f, 0xb0, 0xae, 0x32, 0x3f, 0x14, 0x78, 0x5f, 0x3e, + 0x70, 0x30, 0x2f, 0x3f, 0xff, 0x5a, 0x81, 0x3f, 0xb7, 0xff, 0x06, 0x3f, 0x60, 0xfa, 0xd5, 0xbb, 0x02, 0x8a, 0xab, 0x3e, 0xba, 0xa9, 0x39, 0x3e, 0x4c, 0x35, 0x30, 0x3f, 0xf4, 0x2e, 0x38, 0x3f, + 0x3c, 0x15, 0xbd, 0x3d, 0xed, 0x6f, 0x33, 0x3f, 0x6a, 0xef, 0x32, 0xbe, 0x74, 0x09, 0x6b, 0x3f, 0xbf, 0x9e, 0x81, 0x3f, 0x4a, 0x5a, 0xc7, 0x3e, 0xc8, 0xc6, 0x6b, 0x3e, 0x3e, 0xda, 0x2a, 0x3f, + 0x4e, 0xb1, 0x28, 0x3f, 0x14, 0x6c, 0xf3, 0x3e, 0x57, 0xec, 0x31, 0x3e, 0xac, 0xa2, 0x63, 0x3f, 0x58, 0x0e, 0x98, 0xbe, 0x14, 0xeb, 0xf1, 0xbd, 0x4e, 0x74, 0xbf, 0xbe, 0xe8, 0xad, 0x02, 0xbf, + 0x56, 0x08, 0xfe, 0xbd, 0x12, 0xc3, 0x59, 0xbe, 0xf4, 0xcf, 0xc3, 0xbe, 0x06, 0x74, 0xb2, 0xbe, 0xac, 0x65, 0x24, 0xbf, 0xf4, 0x82, 0x3e, 0xbe, 0xbc, 0x46, 0xb8, 0xbe, 0xa3, 0xf3, 0x2b, 0xbe, + 0x8c, 0xdc, 0xe0, 0xbe, 0x8a, 0x6b, 0xa9, 0xbe, 0x8a, 0x35, 0xdf, 0xbd, 0x46, 0x6f, 0x8c, 0xbe, 0x18, 0x5e, 0x85, 0xbe, 0x19, 0x9b, 0x74, 0xbe, 0x49, 0x7f, 0xa3, 0xbe, 0x54, 0xda, 0x48, 0xbd, + 0x56, 0x37, 0xf3, 0xbd, 0xff, 0xba, 0xc9, 0xbe, 0xf8, 0xd1, 0xba, 0xbe, 0x00, 0x7d, 0x0a, 0xbe, 0x2a, 0x8d, 0x2e, 0xbf, 0xb7, 0x71, 0x9e, 0xbd, 0x39, 0xa4, 0x79, 0xbe, 0x8c, 0x03, 0xb2, 0xbe, + 0xbd, 0x06, 0x57, 0xbe, 0x29, 0xb5, 0x2a, 0xbe, 0xda, 0x91, 0x96, 0xbe, 0x8f, 0x19, 0x90, 0xbe, 0x45, 0xa4, 0x8e, 0xbe, 0x98, 0x50, 0x83, 0xbe, 0x54, 0x2d, 0xd8, 
0xbd, 0x5b, 0x28, 0x4a, 0xbe, + 0x00, 0x40, 0x84, 0xbe, 0xcc, 0x1f, 0x26, 0xbe, 0x5d, 0xba, 0xbd, 0xbe, 0x9c, 0x29, 0xdc, 0xbe, 0x9f, 0xaf, 0xa5, 0xbe, 0x6d, 0x00, 0x00, 0xbf, 0xde, 0xae, 0xb7, 0xbe, 0xed, 0xc7, 0xc5, 0xbe, + 0x97, 0x9f, 0xc0, 0xbe, 0x64, 0x6c, 0xb0, 0xbe, 0x8b, 0x82, 0xcd, 0xbe, 0x19, 0xc3, 0x83, 0x3d, 0x68, 0xe8, 0xbd, 0xbe, 0xbb, 0x54, 0x54, 0xbe, 0x0a, 0xea, 0x6b, 0xbe, 0xb1, 0xca, 0x73, 0xbd, + 0xff, 0xc2, 0x83, 0xbe, 0xce, 0x56, 0xd0, 0xbe, 0x6e, 0x5c, 0x67, 0xbd, 0x72, 0xe5, 0xc5, 0xbe, 0x45, 0x4c, 0x12, 0xbf, 0x8f, 0xd8, 0x91, 0xbe, 0x19, 0xc5, 0xba, 0xbe, 0xb6, 0x84, 0xdb, 0xbe, + 0x35, 0xda, 0x11, 0xbf, 0x3e, 0xaa, 0xa4, 0xbe, 0x00, 0x36, 0xb5, 0xbe, 0x03, 0x6c, 0x04, 0xbf, 0x40, 0x92, 0x09, 0xbe, 0x60, 0x7a, 0x98, 0x3b, 0xfd, 0x1b, 0xcb, 0x3d, 0xd0, 0x60, 0x31, 0xbc, + 0x02, 0xdf, 0x04, 0xbe, 0xb7, 0xac, 0x3e, 0x3d, 0x39, 0x4e, 0xbb, 0xbd, 0x9a, 0xc9, 0x41, 0xbe, 0xdc, 0x50, 0x30, 0xbd, 0x3c, 0x24, 0xd4, 0xbc, 0xf8, 0x9b, 0xc1, 0xbd, 0x2c, 0x60, 0x2f, 0xbd, + 0xa8, 0x83, 0x25, 0xbe, 0x75, 0x34, 0xbd, 0xbd, 0xf4, 0xdb, 0xa0, 0x3d, 0x3c, 0x79, 0xcc, 0x3c, 0x3e, 0xf7, 0x81, 0xbe, 0x44, 0xd5, 0x17, 0x3c, 0xcb, 0x1b, 0x04, 0xbe, 0x6c, 0x15, 0x29, 0xbe, + 0xbe, 0xad, 0xa3, 0xbe, 0x3b, 0xc2, 0xb1, 0xbd, 0x2f, 0xed, 0xcc, 0xbd, 0xee, 0x8c, 0xf0, 0x3d, 0x2e, 0x0d, 0xae, 0xbe, 0x95, 0x2f, 0x90, 0xbd, 0xb6, 0xbb, 0x8e, 0xbe, 0x80, 0xb6, 0xaa, 0x3c, + 0x74, 0x80, 0x5a, 0x3d, 0xf2, 0x58, 0x1d, 0xbe, 0x4a, 0xce, 0x8f, 0xbe, 0xee, 0xf1, 0x9b, 0xbc, 0x3e, 0x94, 0x02, 0x3d, 0xdc, 0x9c, 0x6a, 0xbd, 0x3d, 0x63, 0x88, 0xbe, 0xa7, 0xd0, 0xc5, 0x3c, + 0xda, 0x2a, 0x5d, 0xbd, 0xab, 0x36, 0x3b, 0xbd, 0xf4, 0x96, 0x2c, 0xbe, 0xec, 0x70, 0x9d, 0xbe, 0x90, 0x5c, 0x0a, 0xbe, 0x0f, 0xeb, 0xca, 0xbd, 0x52, 0x63, 0x80, 0xbe, 0x12, 0x66, 0x17, 0xbe, + 0x30, 0x8a, 0x65, 0xbe, 0x43, 0x0a, 0x05, 0xbe, 0x8c, 0x02, 0x04, 0xbe, 0xaf, 0x45, 0x85, 0xbc, 0x27, 0xaa, 0x9d, 0xbe, 0xc5, 0x1f, 0x7d, 0xbd, 0x53, 0x54, 0x1b, 0xbe, 0x12, 0x6f, 0x2f, 0xbe, + 0x8b, 0xd0, 0x31, 0x3e, 
0x88, 0x43, 0x21, 0xbe, 0xfe, 0x9d, 0x29, 0xbe, 0x26, 0x45, 0x5c, 0xbe, 0xde, 0x81, 0x1e, 0xbe, 0xe8, 0xa6, 0x44, 0xbe, 0x3c, 0x46, 0x8a, 0x3c, 0x04, 0x6d, 0x52, 0xbe, + 0x3e, 0xe5, 0x38, 0xbe, 0xec, 0xc2, 0xd8, 0xbe, 0x23, 0x37, 0xc9, 0xbd, 0x59, 0xba, 0x41, 0xbe, 0xf2, 0x78, 0x32, 0x3d, 0xd4, 0x17, 0x46, 0x3b, 0x67, 0x92, 0x19, 0x3d, 0x14, 0x97, 0x72, 0x3d, + 0x2e, 0x6f, 0xd5, 0x3c, 0x38, 0xb9, 0x92, 0x3c, 0xc3, 0x0a, 0x41, 0x3d, 0xce, 0x53, 0x19, 0x3d, 0x1d, 0x12, 0xa8, 0x3d, 0x8b, 0xdb, 0xcf, 0x3c, 0xbb, 0x14, 0xfb, 0x3c, 0x40, 0xe9, 0xa5, 0xbb, + 0x5b, 0xbc, 0x38, 0x3d, 0x39, 0xcf, 0x06, 0x3d, 0xa0, 0xe0, 0x5a, 0x3a, 0x67, 0xbc, 0x1f, 0x3d, 0x8a, 0xe0, 0xd6, 0x3c, 0x89, 0x73, 0xce, 0x3c, 0x3c, 0x5d, 0xfc, 0x3c, 0xae, 0x39, 0x2b, 0xbc, + 0xb4, 0xd3, 0xd5, 0x3c, 0xcf, 0x6a, 0x41, 0x3d, 0xdf, 0x99, 0x2e, 0x3d, 0x3b, 0xb9, 0xfe, 0x3b, 0xb4, 0x27, 0xc0, 0x3d, 0xd2, 0x83, 0xf0, 0x3b, 0x12, 0xee, 0xba, 0x3c, 0xe8, 0x86, 0x3f, 0x3d, + 0x96, 0x5e, 0xd8, 0x3c, 0xfc, 0xb8, 0xd2, 0x3c, 0x88, 0x57, 0xca, 0x3c, 0x30, 0x31, 0xd1, 0x3c, 0xb1, 0x1c, 0x12, 0x3d, 0x0e, 0x4f, 0xc8, 0x3c, 0x42, 0x00, 0x5e, 0x3c, 0x62, 0x01, 0x9b, 0x3c, + 0xf4, 0xd4, 0x28, 0x3d, 0x53, 0xcb, 0x3f, 0x3c, 0x2a, 0xd9, 0x2a, 0x3d, 0xf9, 0x57, 0x80, 0x3d, 0x96, 0xae, 0x03, 0x3d, 0x18, 0x25, 0x58, 0x3d, 0x44, 0x7a, 0x30, 0x3d, 0x2f, 0xd2, 0x6d, 0x3d, + 0x2e, 0x9f, 0x37, 0x3d, 0x84, 0xac, 0xc8, 0x3c, 0xb7, 0x03, 0x49, 0x3d, 0x94, 0x38, 0x0d, 0xbc, 0x24, 0x36, 0x75, 0x3d, 0x76, 0x99, 0xe6, 0x3c, 0xff, 0x45, 0x9d, 0x3c, 0xd9, 0x9a, 0xd4, 0xbb, + 0x26, 0xe4, 0xdb, 0x3c, 0x50, 0x8f, 0x3a, 0x3d, 0xa3, 0xf8, 0xb8, 0x3c, 0x32, 0x2d, 0x1c, 0x3d, 0x18, 0x8e, 0x74, 0x3d, 0x4f, 0x2f, 0x21, 0x3d, 0xed, 0x3a, 0x3d, 0x3d, 0xca, 0x62, 0x56, 0x3d, + 0x18, 0x87, 0x94, 0x3d, 0xd6, 0x63, 0x5b, 0x3d, 0x01, 0xda, 0x55, 0x3d, 0x12, 0x27, 0x6b, 0x3d, 0x57, 0xb6, 0xdf, 0x3c, 0x9e, 0xa2, 0x17, 0xbc, 0x8b, 0xda, 0x20, 0xbc, 0x32, 0xc0, 0x17, 0x3c, + 0x40, 0x1e, 0xda, 0x3c, 0xf9, 0x3d, 0x03, 0xbc, 0xe2, 0xb4, 0x93, 0x3c, 0xd9, 0x28, 
0xb0, 0x3c, 0xac, 0x79, 0xb5, 0x3c, 0x34, 0x53, 0x19, 0x3c, 0xf6, 0xba, 0x86, 0x3b, 0xca, 0xf6, 0x8f, 0xbc, + 0xc2, 0x1a, 0x8e, 0x3c, 0x21, 0x75, 0x14, 0x3c, 0x94, 0xc9, 0x92, 0xbc, 0x7b, 0xa8, 0x14, 0x3c, 0x2e, 0x6f, 0xd1, 0x3c, 0x00, 0x33, 0xcb, 0x39, 0x38, 0xe9, 0x40, 0x3c, 0x50, 0xbd, 0x97, 0x3a, + 0x94, 0x36, 0x3d, 0x3d, 0x51, 0x2b, 0x87, 0x3c, 0xac, 0x42, 0x84, 0x3c, 0x76, 0x54, 0x8f, 0xbc, 0xe2, 0x44, 0x74, 0x3d, 0x9c, 0xb3, 0xd9, 0x3b, 0x08, 0x97, 0xd7, 0x3c, 0x2e, 0x52, 0x22, 0x3c, + 0x00, 0xe7, 0xa5, 0x38, 0xd0, 0x08, 0xc8, 0x3c, 0x0a, 0x85, 0xbf, 0x3c, 0x40, 0xbd, 0x38, 0xba, 0xca, 0x86, 0x97, 0x3b, 0xb3, 0x82, 0x94, 0x3b, 0xfb, 0xbf, 0xf2, 0x3c, 0x8a, 0x04, 0x54, 0xbb, + 0x14, 0x4c, 0xaa, 0x3c, 0x40, 0x82, 0x61, 0x38, 0x1c, 0x34, 0xb0, 0x3c, 0x98, 0x82, 0x4e, 0x3d, 0x40, 0xe9, 0x59, 0x3c, 0x15, 0xb4, 0x4f, 0x3c, 0xd5, 0x3d, 0x04, 0x3d, 0xde, 0x27, 0x0c, 0x3d, + 0x91, 0xad, 0xf2, 0x3c, 0xcc, 0xa0, 0x46, 0x3b, 0x32, 0xd2, 0xb2, 0x3c, 0x98, 0x82, 0x12, 0xba, 0xbd, 0x6a, 0x5b, 0x3d, 0x10, 0xb8, 0x5e, 0x3c, 0x72, 0x9c, 0x39, 0x3c, 0x04, 0xc0, 0x99, 0x3b, + 0xe4, 0x2d, 0x85, 0xbc, 0xc3, 0xc9, 0xa8, 0x3c, 0x46, 0x32, 0x09, 0x3d, 0x64, 0xef, 0xab, 0x3c, 0x36, 0xc3, 0x93, 0x3c, 0x73, 0x56, 0xf6, 0x3c, 0x24, 0x9a, 0x03, 0x3c, 0x0f, 0xe2, 0xf5, 0x3c, + 0x59, 0xa6, 0x09, 0x3d, 0x97, 0x88, 0x82, 0x3d, 0x76, 0x40, 0xda, 0x3c, 0x96, 0x89, 0xc9, 0x3c, 0x9a, 0xf8, 0x59, 0x3c, 0x06, 0x03, 0x40, 0x3c, 0xf2, 0xe3, 0xb4, 0x3c, 0xca, 0xf6, 0xe4, 0x3c, + 0xaf, 0x13, 0x27, 0x3b, 0xe1, 0xb6, 0x6b, 0x3c, 0x12, 0xc9, 0xa8, 0x3c, 0x66, 0x44, 0xbb, 0x3c, 0x4e, 0x0c, 0x02, 0x3d, 0xe2, 0x5d, 0x0c, 0x3c, 0x5a, 0xd6, 0xd7, 0x3c, 0x3c, 0xe1, 0xb3, 0x3c, + 0x01, 0x91, 0xec, 0x3c, 0x5f, 0x9b, 0xb3, 0x3c, 0x60, 0xf6, 0x31, 0x3c, 0x28, 0xfb, 0x34, 0x3c, 0xc0, 0x9d, 0x9e, 0x3c, 0xf8, 0xf0, 0x67, 0x3c, 0xe0, 0x52, 0xb5, 0x3c, 0x2c, 0xd0, 0x51, 0x3c, + 0xfe, 0x13, 0x8d, 0x3b, 0xe6, 0x95, 0xb2, 0x3c, 0xee, 0xeb, 0xab, 0x3c, 0xfe, 0x8a, 0x21, 0x3c, 0x50, 0x71, 0x0b, 0x3d, 0x80, 0xc7, 0xc1, 0x3b, 
0x4f, 0xc2, 0x9f, 0x3c, 0x31, 0x87, 0x7d, 0x3c, + 0xb9, 0xc0, 0x1f, 0x3c, 0x4c, 0x05, 0x00, 0x3c, 0xa0, 0x9d, 0xc6, 0x3c, 0xa3, 0xbb, 0x9c, 0x3c, 0xc8, 0xc1, 0x56, 0x3c, 0xde, 0x13, 0x8e, 0x3c, 0x2a, 0x1d, 0x0a, 0x3c, 0xd7, 0xee, 0x4b, 0x3c, + 0xa1, 0x7e, 0x15, 0x3c, 0xb7, 0x7c, 0x54, 0x3c, 0xc9, 0xeb, 0xbb, 0x3c, 0xf0, 0x17, 0xab, 0x3c, 0x96, 0x63, 0xb4, 0x3c, 0x99, 0xce, 0xfe, 0x3c, 0x5b, 0xd4, 0xb3, 0x3c, 0x18, 0x53, 0x85, 0x3c, + 0xe7, 0x0d, 0xba, 0x3c, 0x78, 0x87, 0xe6, 0x3c, 0xb8, 0x1a, 0xb6, 0x3c, 0x08, 0x93, 0x32, 0xbb, 0xd0, 0x27, 0x80, 0x3c, 0xa6, 0x40, 0x24, 0x3c, 0x12, 0x38, 0x95, 0x3c, 0xbd, 0xf1, 0x45, 0x3c, + 0x57, 0xe3, 0x5a, 0x3c, 0xb1, 0x93, 0xcc, 0x3c, 0xc7, 0xbd, 0x3b, 0xbb, 0x55, 0x10, 0xde, 0x3c, 0xef, 0xfb, 0x14, 0x3d, 0x93, 0x01, 0x72, 0x3c, 0x2c, 0x3a, 0x91, 0x3c, 0x30, 0xca, 0xc9, 0x3c, + 0xb0, 0x80, 0xf6, 0x3c, 0xf6, 0xb7, 0x70, 0x3c, 0x1c, 0xc7, 0x74, 0x3c, 0x34, 0x78, 0x02, 0x3d, 0x1a, 0x7a, 0xa9, 0x3b, 0x79, 0xdd, 0xb6, 0x3b, 0x8d, 0x51, 0xce, 0xba, 0x15, 0x7e, 0x42, 0x3b, + 0xf9, 0xcf, 0x39, 0x3b, 0x69, 0xf5, 0xba, 0x3a, 0x02, 0x63, 0xc6, 0x3b, 0x0c, 0x0a, 0x74, 0x3c, 0x90, 0x13, 0xeb, 0x3a, 0x10, 0xbf, 0x12, 0x3a, 0xb8, 0x46, 0x56, 0x3c, 0x00, 0xa3, 0x80, 0x3c, + 0x0e, 0xc4, 0x75, 0x3c, 0xde, 0xd2, 0x23, 0x3c, 0x35, 0xae, 0xc2, 0x3a, 0xd0, 0x52, 0x80, 0xbb, 0x12, 0xe5, 0x9b, 0x3c, 0xdc, 0x51, 0xe5, 0x3a, 0x20, 0xbd, 0x4e, 0x3c, 0x6c, 0x74, 0x98, 0x3c, + 0x73, 0x23, 0x6b, 0x3c, 0xfe, 0x02, 0xd3, 0x3b, 0x92, 0xcd, 0xfd, 0x3b, 0x1b, 0xdf, 0x35, 0xbb, 0x1a, 0xd8, 0x8a, 0x3c, 0x64, 0x5f, 0xb6, 0x3b, 0xc9, 0x16, 0xae, 0x3c, 0x23, 0x76, 0x41, 0xbb, + 0xd6, 0xdf, 0x5f, 0xbb, 0x40, 0xaa, 0xea, 0x3b, 0x60, 0x34, 0xc1, 0x3c, 0x58, 0x00, 0xc5, 0x3b, 0x9c, 0x57, 0x1e, 0xbb, 0xb3, 0xf2, 0xf1, 0x3b, 0x82, 0xee, 0x86, 0x3c, 0x22, 0x67, 0xb3, 0x3a, + 0x98, 0xf9, 0x8e, 0xba, 0x9e, 0x0f, 0xea, 0x3b, 0x5b, 0x59, 0x52, 0x3c, 0x2b, 0xd5, 0x71, 0x3c, 0xa2, 0x5e, 0x4e, 0x3c, 0x0d, 0x2d, 0x35, 0x3c, 0xec, 0x97, 0x87, 0x3c, 0x1e, 0xa3, 0x8e, 0x3b, + 0x02, 
0x8b, 0x77, 0x3c, 0x44, 0x9b, 0x8e, 0x3c, 0x34, 0x00, 0x0d, 0x3c, 0x98, 0x1b, 0xab, 0x3a, 0xd4, 0xb8, 0x4c, 0x3c, 0xdb, 0x0c, 0x34, 0x3b, 0x5f, 0xf8, 0x69, 0x3c, 0x3b, 0xc4, 0x90, 0x3c, + 0xb3, 0x2e, 0x06, 0xbc, 0x85, 0xd2, 0x4c, 0x3c, 0xdc, 0xf1, 0x29, 0x3b, 0xd5, 0xda, 0x97, 0x3c, 0xc2, 0x96, 0x7e, 0x3c, 0x69, 0xf9, 0x25, 0x3c, 0x74, 0xa3, 0xb1, 0xba, 0xda, 0xb0, 0x5c, 0x3c, + 0xb4, 0x31, 0x2e, 0x3c, 0x53, 0x09, 0xa2, 0x3c, 0x43, 0x4e, 0x0d, 0x3b, 0x42, 0x1c, 0x7d, 0x3c, 0x0e, 0x5e, 0x3d, 0x3f, 0x28, 0xe0, 0x36, 0xbe, 0x2c, 0x14, 0x56, 0x3e, 0x1d, 0xe8, 0x22, 0x3f, + 0xb3, 0x8f, 0x19, 0x3f, 0x20, 0xfa, 0xc3, 0x3c, 0x8f, 0x39, 0x1f, 0x3f, 0xce, 0x20, 0xf6, 0x3e, 0x7a, 0x51, 0x86, 0x3f, 0x04, 0x5a, 0xb9, 0x3e, 0xec, 0xe9, 0x38, 0x3e, 0x69, 0xc6, 0xfe, 0xbe, + 0x76, 0xe3, 0xf8, 0x3e, 0xf4, 0xd0, 0x9f, 0x3e, 0x32, 0xe4, 0x8e, 0xbe, 0x3a, 0xb9, 0x08, 0x3f, 0xbb, 0x4a, 0xc8, 0x3e, 0x4c, 0xb6, 0x58, 0x3e, 0xd8, 0xda, 0x9a, 0x3e, 0x72, 0x9b, 0x99, 0xbe, + 0x0f, 0x70, 0x41, 0x3f, 0x1a, 0x60, 0x18, 0x3f, 0x88, 0x6a, 0x08, 0x3f, 0x70, 0x00, 0x23, 0xbe, 0x1a, 0x8e, 0xc4, 0x3f, 0xfc, 0x9a, 0xca, 0x3d, 0x6a, 0xb2, 0xb0, 0x3e, 0xaf, 0xd8, 0x1a, 0x3f, + 0xb9, 0x4d, 0x8f, 0x3e, 0x32, 0x08, 0x03, 0x3f, 0xad, 0x76, 0x94, 0x3e, 0x1e, 0xa4, 0x16, 0x3e, 0x15, 0xbc, 0xd6, 0x3e, 0x30, 0x3b, 0x4c, 0x3e, 0x2c, 0xd1, 0xbc, 0x3e, 0xa6, 0xbd, 0xd7, 0x3d, + 0x4c, 0x25, 0x32, 0x3f, 0x68, 0x7f, 0xa7, 0x3c, 0xd4, 0x41, 0x0b, 0x3f, 0xb7, 0x52, 0x92, 0x3f, 0x42, 0xe6, 0xac, 0x3e, 0xa0, 0x4a, 0x08, 0x3f, 0xc5, 0x65, 0x2c, 0x3f, 0x50, 0xcc, 0x7b, 0x3f, + 0xd5, 0xa0, 0x2a, 0x3f, 0x7c, 0x9b, 0x69, 0x3d, 0x94, 0x74, 0x2a, 0x3f, 0x20, 0x20, 0xd0, 0xbd, 0xef, 0x9f, 0x98, 0x3f, 0x65, 0x3b, 0xd9, 0x3e, 0x22, 0x77, 0x2f, 0x3e, 0xe2, 0xdf, 0x4c, 0xbe, + 0x4c, 0x9b, 0xc4, 0x3d, 0xb7, 0xa7, 0x11, 0x3f, 0xb9, 0xad, 0x2c, 0x3f, 0x25, 0xff, 0xde, 0x3e, 0x2a, 0x17, 0x1f, 0x3f, 0x5e, 0x04, 0x2f, 0x3f, 0x1a, 0x73, 0x0d, 0x3f, 0x60, 0x78, 0x42, 0x3f, + 0x4e, 0xbd, 0x83, 0x3f, 0x62, 0xb2, 0x99, 0x3f, 0x2c, 0xf9, 0x57, 
0x3f, 0x2c, 0x49, 0x33, 0x3f, 0x7c, 0x18, 0x07, 0xbe, 0xb2, 0xe8, 0x21, 0xbe, 0xb8, 0x0d, 0xf1, 0xbd, 0x0c, 0x82, 0x51, 0xbe, + 0x67, 0x60, 0x28, 0xbe, 0x1a, 0x06, 0x60, 0xbe, 0x57, 0x61, 0x1c, 0xbe, 0xea, 0x00, 0x2c, 0xbe, 0xd2, 0x36, 0xe4, 0xbd, 0xfa, 0xc5, 0xf0, 0xbd, 0xd9, 0x72, 0x40, 0xbe, 0x45, 0x5f, 0x17, 0xbe, + 0x9e, 0x9a, 0x2e, 0xbe, 0x52, 0xd7, 0x23, 0xbe, 0xd0, 0x86, 0x1b, 0xbe, 0x4c, 0x77, 0x31, 0xbe, 0x2d, 0x02, 0x64, 0xbe, 0x5f, 0x2f, 0x2d, 0xbe, 0xf0, 0xf0, 0x40, 0xbe, 0x0e, 0x0f, 0x13, 0xbe, + 0x54, 0x25, 0x4a, 0xbe, 0x01, 0x73, 0x03, 0xbe, 0xb2, 0x5c, 0x2a, 0xbe, 0xa2, 0x9a, 0xce, 0xbd, 0xfa, 0xf4, 0x58, 0xbe, 0x29, 0x46, 0x15, 0xbe, 0xec, 0xd2, 0x3f, 0xbe, 0xea, 0xfc, 0xcb, 0xbd, + 0x69, 0x5a, 0xb0, 0xbd, 0xca, 0x2a, 0x24, 0xbe, 0x47, 0x7f, 0x37, 0xbe, 0xc1, 0x43, 0x0a, 0xbe, 0x34, 0x42, 0xdd, 0xbd, 0xfc, 0x2c, 0x03, 0xbe, 0xf4, 0x89, 0xfc, 0xbd, 0x82, 0x11, 0x22, 0xbe, + 0x1c, 0xa7, 0xf7, 0xbd, 0xbe, 0x31, 0x43, 0xbe, 0x66, 0x16, 0x63, 0xbe, 0xd0, 0xee, 0x1a, 0xbe, 0x32, 0xe2, 0x45, 0xbe, 0xac, 0xc6, 0x63, 0xbe, 0xd0, 0xc3, 0x05, 0xbe, 0xce, 0xa0, 0x27, 0xbe, + 0x80, 0x5b, 0x0f, 0xbe, 0xf8, 0xed, 0xd6, 0xbd, 0x1d, 0xdd, 0x4d, 0xbe, 0xd6, 0xe2, 0xec, 0xbd, 0xc7, 0x8f, 0x1a, 0xbe, 0x93, 0xa6, 0x17, 0xbe, 0xc6, 0x42, 0x16, 0xbe, 0x20, 0x46, 0x4f, 0xbe, + 0x28, 0x92, 0x0c, 0xbe, 0xa4, 0xf0, 0x96, 0xbe, 0x54, 0xd2, 0xb6, 0xbd, 0x85, 0xb2, 0x1a, 0xbe, 0xf8, 0x4a, 0x36, 0xbe, 0xc8, 0x49, 0x2a, 0xbe, 0xd9, 0x38, 0x43, 0xbe, 0xc0, 0xbb, 0x45, 0xbe, + 0xf4, 0x4c, 0x1c, 0xbe, 0xc1, 0xce, 0x83, 0xbd, 0x9c, 0x39, 0xc1, 0xbd, 0x06, 0xc1, 0x2f, 0xbe, 0xbc, 0x5a, 0xd2, 0x3d, 0x50, 0xc9, 0xc8, 0x3d, 0x02, 0x29, 0xbf, 0x3d, 0xe7, 0x84, 0xf9, 0x3d, + 0xd4, 0x18, 0xbc, 0x3d, 0x56, 0xf9, 0xdf, 0x3d, 0xd4, 0xa3, 0xd2, 0x3d, 0x94, 0xc0, 0xe6, 0x3d, 0x4b, 0x1f, 0xef, 0x3d, 0x1d, 0x25, 0xe0, 0x3d, 0x61, 0xf3, 0xee, 0x3d, 0x3a, 0x85, 0xee, 0x3d, + 0xa4, 0x54, 0xcc, 0x3d, 0x70, 0x69, 0xe1, 0x3d, 0x06, 0x86, 0xd2, 0x3d, 0xfb, 0x74, 0xf4, 0x3d, 0xa6, 0x13, 0xc2, 0x3d, 0x49, 
0x8b, 0xe0, 0x3d, 0xd6, 0x98, 0xb0, 0x3d, 0x99, 0x29, 0xab, 0x3d, + 0x9d, 0x18, 0xe9, 0x3d, 0xc1, 0x1f, 0x00, 0x3e, 0x79, 0x15, 0xf5, 0x3d, 0xd2, 0x44, 0xcc, 0x3d, 0x2c, 0x31, 0x06, 0x3e, 0x95, 0x0a, 0xdb, 0x3d, 0x18, 0x54, 0xc1, 0x3d, 0x16, 0xc8, 0xb0, 0x3d, + 0x52, 0xa9, 0xd5, 0x3d, 0x72, 0x93, 0xeb, 0x3d, 0x92, 0x7a, 0xb8, 0x3d, 0xb8, 0x2a, 0xde, 0x3d, 0x6d, 0x79, 0xd1, 0x3d, 0x9e, 0x0d, 0xb9, 0x3d, 0xc8, 0xb8, 0xcc, 0x3d, 0xea, 0xd3, 0x50, 0x3d, + 0x7a, 0x98, 0xc0, 0x3d, 0x5a, 0xa9, 0xcc, 0x3d, 0x16, 0x29, 0xf1, 0x3d, 0x4d, 0x57, 0xab, 0x3d, 0x7c, 0x8e, 0x21, 0x3e, 0xb2, 0xe3, 0xf6, 0x3d, 0x22, 0x42, 0xc2, 0x3d, 0x04, 0x68, 0xf4, 0x3d, + 0xd0, 0xa1, 0x03, 0x3e, 0x73, 0x8a, 0x8d, 0x3d, 0x0b, 0x72, 0xea, 0x3d, 0x3a, 0x2a, 0xb3, 0x3d, 0x06, 0xbf, 0xe9, 0x3d, 0x2a, 0x00, 0xe0, 0x3d, 0x6c, 0x67, 0xdc, 0x3d, 0x8e, 0xc4, 0xd5, 0x3d, + 0xe7, 0x71, 0xac, 0x3d, 0xec, 0xeb, 0xd3, 0x3d, 0x84, 0x8b, 0xae, 0x3d, 0x5f, 0xec, 0x0b, 0x3e, 0x56, 0x50, 0xf8, 0x3d, 0xc2, 0x1f, 0xf0, 0x3d, 0x76, 0xb5, 0xf6, 0x3d, 0x9e, 0x31, 0xf0, 0x3d, + 0x83, 0x9f, 0x0b, 0x3e, 0xaf, 0xdf, 0x6e, 0x3d, 0xba, 0x5b, 0x9d, 0x3d, 0x40, 0x54, 0xbd, 0x3d, 0xc5, 0x6c, 0xe4, 0x3c, 0x69, 0xb9, 0x42, 0x3d, 0x0a, 0xb9, 0x31, 0x3d, 0x4f, 0xd1, 0xb2, 0x3c, + 0x8f, 0x1a, 0x69, 0x3d, 0x30, 0x01, 0x33, 0x3d, 0xc7, 0xca, 0x94, 0x3c, 0xc6, 0x7b, 0x82, 0x3c, 0x14, 0xc9, 0xd1, 0x3b, 0xa4, 0x8c, 0xa9, 0x3c, 0xa8, 0x16, 0x4e, 0x3d, 0xaa, 0x92, 0x08, 0x3c, + 0x1e, 0x00, 0xcd, 0x3c, 0xc3, 0xa9, 0x1d, 0x3d, 0x00, 0xe4, 0xf9, 0x3c, 0x85, 0xa5, 0x03, 0x3d, 0x9f, 0x53, 0x62, 0x3d, 0x3b, 0x19, 0x00, 0x3d, 0x6d, 0x3f, 0x19, 0x3d, 0x72, 0xf3, 0x2e, 0x3d, + 0x64, 0xe4, 0x4a, 0x3d, 0x92, 0x32, 0xf6, 0x3c, 0x78, 0xc3, 0x98, 0x3c, 0x90, 0x9c, 0x87, 0xbb, 0xb2, 0xf1, 0x87, 0x3d, 0xa8, 0x1e, 0xf1, 0x3c, 0xcc, 0xfa, 0x67, 0x3d, 0xd6, 0x15, 0x01, 0x3d, + 0xb0, 0x69, 0x16, 0x3b, 0x74, 0x89, 0x14, 0x3d, 0x9a, 0xcf, 0xb6, 0x3c, 0xea, 0xb3, 0x05, 0x3d, 0xb0, 0x23, 0xf3, 0xba, 0xe4, 0xc4, 0x3c, 0x3d, 0x90, 0x72, 0xae, 0x3c, 0xe3, 0x4b, 0x83, 
0x3c, + 0xda, 0x7e, 0xa3, 0x3c, 0xa6, 0x5f, 0x3b, 0x3d, 0xd2, 0x80, 0x9d, 0x3c, 0x6f, 0xc8, 0x51, 0x3c, 0xd6, 0x4c, 0xeb, 0x3c, 0x92, 0xaf, 0x81, 0x3d, 0xe5, 0xd7, 0x08, 0x3d, 0x0f, 0xb9, 0x3c, 0x3d, + 0x4c, 0x25, 0xc6, 0x3c, 0x01, 0x23, 0xc8, 0x3c, 0xd4, 0x8a, 0x12, 0x3d, 0x1f, 0x84, 0xee, 0x3c, 0x66, 0x58, 0x3c, 0x3d, 0x8e, 0x9d, 0x64, 0x3d, 0x20, 0x05, 0x0f, 0x3d, 0x7d, 0x73, 0x1f, 0x3d, + 0x52, 0xcd, 0xdc, 0x3b, 0x5a, 0x97, 0xc4, 0x3d, 0xe9, 0xaf, 0x99, 0x3d, 0x8c, 0xd7, 0x2c, 0x3d, 0xa4, 0xcd, 0x3d, 0x3d, 0xe6, 0x73, 0xea, 0x3c, 0xfb, 0x10, 0x82, 0x3d, 0x4b, 0x07, 0x9b, 0x3d, + 0xb1, 0xc5, 0x2d, 0x3d, 0xee, 0xed, 0xd2, 0x3c, 0x24, 0xba, 0xc3, 0x3c, 0x6a, 0xc4, 0x47, 0x3d, 0x80, 0x37, 0x4b, 0xbc, 0x7c, 0x89, 0x41, 0xbc, 0xe6, 0xa7, 0x48, 0xbc, 0x32, 0x91, 0x4d, 0xbc, + 0x4a, 0x89, 0x36, 0xbc, 0x1e, 0x17, 0x39, 0xbc, 0x8e, 0x3e, 0x38, 0xbc, 0x41, 0x37, 0x46, 0xbc, 0x90, 0x55, 0x67, 0xbc, 0x02, 0xb9, 0x5d, 0xbc, 0xbc, 0x51, 0x61, 0xbc, 0x43, 0x52, 0x54, 0xbc, + 0x1f, 0x93, 0x2c, 0xbc, 0x1a, 0xa5, 0x56, 0xbc, 0x78, 0xab, 0x42, 0xbc, 0x76, 0x07, 0x61, 0xbc, 0x8c, 0x8d, 0x1c, 0xbc, 0x0e, 0xb5, 0x4a, 0xbc, 0xfe, 0xa9, 0x0b, 0xbc, 0x2d, 0xf4, 0x21, 0xbc, + 0x93, 0x7b, 0x54, 0xbc, 0x6e, 0x01, 0x83, 0xbc, 0x6a, 0x84, 0x5a, 0xbc, 0x4e, 0xa3, 0x3a, 0xbc, 0x38, 0x5d, 0x82, 0xbc, 0x46, 0x59, 0x4f, 0xbc, 0x84, 0x15, 0x30, 0xbc, 0xe5, 0x8e, 0x37, 0xbc, + 0x10, 0x3f, 0x53, 0xbc, 0xbe, 0xd7, 0x60, 0xbc, 0x5d, 0xca, 0x0d, 0xbc, 0x84, 0x99, 0x5b, 0xbc, 0xe7, 0xd8, 0x3e, 0xbc, 0x76, 0xdb, 0x3d, 0xbc, 0x69, 0xc4, 0x43, 0xbc, 0x9a, 0xf5, 0x4f, 0xbb, + 0x6e, 0x6c, 0x35, 0xbc, 0x06, 0xf7, 0x32, 0xbc, 0x93, 0x07, 0x38, 0xbc, 0xaf, 0x77, 0x05, 0xbc, 0x2c, 0xdd, 0x98, 0xbc, 0x3e, 0xa9, 0x62, 0xbc, 0x2c, 0x23, 0x3d, 0xbc, 0x66, 0xa1, 0x71, 0xbc, + 0x1c, 0xa3, 0x81, 0xbc, 0x50, 0x8c, 0x04, 0xbc, 0x5c, 0xd9, 0x48, 0xbc, 0x10, 0xae, 0x2f, 0xbc, 0x4b, 0xbb, 0x6b, 0xbc, 0xbc, 0xa4, 0x69, 0xbc, 0x20, 0xf3, 0x54, 0xbc, 0x91, 0xdd, 0x31, 0xbc, + 0x98, 0x7b, 0x09, 0xbc, 0x8f, 0xf0, 0x2b, 0xbc, 
0x2a, 0x2d, 0x5e, 0xbc, 0x88, 0xf0, 0x8f, 0xbc, 0xe1, 0xb6, 0x6e, 0xbc, 0x16, 0xc6, 0x5c, 0xbc, 0x6d, 0xf5, 0x73, 0xbc, 0x42, 0xc9, 0x74, 0xbc, + 0xaa, 0x3e, 0x8f, 0xbc, 0xbc, 0x44, 0x01, 0xbc, 0x42, 0xe0, 0x1c, 0xbc, 0xb3, 0x69, 0x2d, 0xbc, 0x89, 0x3c, 0xa1, 0xbb, 0x66, 0x98, 0xdd, 0xbb, 0x35, 0xa2, 0xed, 0xbb, 0x87, 0xdc, 0x26, 0xbb, + 0xb4, 0x99, 0xfa, 0xbb, 0xa8, 0x06, 0x9b, 0xbb, 0x9a, 0xf6, 0x35, 0xbb, 0xc8, 0xd7, 0x1e, 0xbb, 0x52, 0x1a, 0x4a, 0xbb, 0x18, 0xa2, 0x98, 0xbb, 0x30, 0x30, 0xe9, 0xbb, 0x30, 0xa6, 0x0c, 0xbb, + 0xea, 0x70, 0x48, 0xbb, 0xea, 0xc2, 0xc2, 0xbb, 0x4a, 0x75, 0x98, 0xbb, 0x90, 0x3e, 0xa4, 0xbb, 0xf2, 0xa4, 0xb7, 0xbb, 0x71, 0x9f, 0x94, 0xbb, 0xf8, 0xbd, 0x6e, 0xbb, 0x27, 0x02, 0xbe, 0xbb, + 0x48, 0x53, 0xd6, 0xbb, 0x25, 0x81, 0xd4, 0xbb, 0xe9, 0x02, 0x54, 0xbb, 0xbc, 0x89, 0x83, 0xba, 0xb8, 0xc6, 0x1a, 0xbc, 0xe2, 0xad, 0xa0, 0xbb, 0x1e, 0x6d, 0xe4, 0xbb, 0xb8, 0x88, 0xbb, 0xbb, + 0x60, 0x4f, 0x30, 0xbb, 0xed, 0x97, 0xbf, 0xbb, 0x3a, 0x28, 0xf6, 0xba, 0xce, 0xb1, 0xbe, 0xbb, 0x20, 0xd4, 0xa1, 0xba, 0x0c, 0xa5, 0xea, 0xbb, 0x30, 0xc2, 0x85, 0xbb, 0x05, 0x06, 0xa9, 0x39, + 0x47, 0x2f, 0x70, 0xbb, 0xcc, 0x5e, 0xb4, 0xbb, 0x8c, 0xd0, 0xb4, 0xba, 0xe4, 0xfb, 0x8a, 0xba, 0x44, 0x98, 0xbc, 0xbb, 0xf1, 0xe5, 0x04, 0xbc, 0xa5, 0xef, 0xb0, 0xbb, 0x6d, 0x30, 0xf3, 0xbb, + 0x8e, 0x95, 0xb0, 0xbb, 0x04, 0x5f, 0x6d, 0xbb, 0xb4, 0xd1, 0x8f, 0xbb, 0x36, 0x1a, 0xa0, 0xbb, 0x44, 0x25, 0xf8, 0xbb, 0xa0, 0xe3, 0x11, 0xbc, 0x44, 0xab, 0xbb, 0xbb, 0x6e, 0x43, 0x8d, 0xbb, + 0x44, 0x12, 0x29, 0xba, 0xe7, 0xac, 0x1f, 0xbc, 0x7a, 0x7d, 0x4d, 0xbc, 0x7c, 0x23, 0x05, 0xbc, 0x24, 0xf8, 0xe7, 0xbb, 0x50, 0x21, 0x97, 0xbb, 0x0b, 0xa5, 0x16, 0xbc, 0x66, 0xa7, 0x30, 0xbc, + 0xfb, 0x99, 0x04, 0xbc, 0x4c, 0xcf, 0x97, 0xbb, 0x02, 0x45, 0x8c, 0xbb, 0x88, 0xb6, 0xcb, 0xbb, 0xa9, 0xd7, 0xba, 0xbb, 0xd2, 0x59, 0xbb, 0xbb, 0x6e, 0x34, 0xa1, 0xbb, 0x1b, 0x82, 0xfc, 0xbb, + 0x41, 0x59, 0xb3, 0xbb, 0x9a, 0xf6, 0xeb, 0xbb, 0xe0, 0x58, 0xca, 0xbb, 0x95, 0xa9, 0xdf, 0xbb, 0x48, 0xb5, 
0xc9, 0xbb, 0x3a, 0x37, 0xbe, 0xbb, 0x9d, 0x66, 0xe1, 0xbb, 0x04, 0x6e, 0xdc, 0xbb, + 0x71, 0x78, 0xcd, 0xbb, 0xdb, 0xdc, 0xce, 0xbb, 0x62, 0xc4, 0xc4, 0xbb, 0x76, 0x28, 0xe4, 0xbb, 0xfd, 0x6a, 0xd7, 0xbb, 0x2e, 0x48, 0xd6, 0xbb, 0x10, 0x89, 0xc1, 0xbb, 0x3e, 0xa2, 0xa3, 0xbb, + 0x2c, 0x61, 0xe3, 0xbb, 0x94, 0x8b, 0xd3, 0xbb, 0x9d, 0x45, 0xe6, 0xbb, 0xd2, 0x59, 0xb3, 0xbb, 0x52, 0x8d, 0xf9, 0xbb, 0x4d, 0xfb, 0xc6, 0xbb, 0xf1, 0x1a, 0xc3, 0xbb, 0xa5, 0x8e, 0x93, 0xbb, + 0xc7, 0x48, 0xae, 0xbb, 0x36, 0x0c, 0xd6, 0xbb, 0x71, 0x69, 0xc7, 0xbb, 0xc7, 0xab, 0xc1, 0xbb, 0x5b, 0x72, 0xb9, 0xbb, 0x8a, 0x1d, 0xa2, 0xbb, 0x55, 0x74, 0xb5, 0xbb, 0x1e, 0x85, 0x8e, 0xbb, + 0x60, 0x6c, 0xad, 0xbb, 0xc7, 0x29, 0xd0, 0xbb, 0x9d, 0xf7, 0x00, 0xbc, 0xdf, 0x30, 0xb4, 0xbb, 0x06, 0xd2, 0x0f, 0xbc, 0x54, 0xad, 0xf3, 0xbb, 0x41, 0x36, 0xae, 0xbb, 0x55, 0x2c, 0xd9, 0xbb, + 0x2f, 0x82, 0xe0, 0xbb, 0x72, 0x3b, 0x84, 0xbb, 0x9b, 0xce, 0xeb, 0xbb, 0x28, 0xbe, 0x9e, 0xbb, 0x96, 0xd8, 0xcb, 0xbb, 0x51, 0x6b, 0xc0, 0xbb, 0x5b, 0x14, 0xc6, 0xbb, 0x96, 0xdc, 0xde, 0xbb, + 0x97, 0xad, 0xaf, 0xbb, 0x5e, 0xd6, 0xf8, 0xbb, 0xe6, 0x7b, 0x75, 0xbb, 0x09, 0x21, 0xe9, 0xbb, 0x16, 0x3b, 0xe3, 0xbb, 0x14, 0x2c, 0xdf, 0xbb, 0x66, 0x21, 0xe2, 0xbb, 0x83, 0x91, 0xda, 0xbb, + 0xe3, 0x87, 0xe9, 0xbb, 0x5b, 0x87, 0x40, 0xbb, 0xc2, 0x27, 0x88, 0xbb, 0x30, 0x5b, 0xbb, 0xbb, 0xdc, 0x7c, 0x00, 0xbb, 0x10, 0x39, 0x51, 0xbb, 0x46, 0xbb, 0x1e, 0xbb, 0x16, 0x50, 0x31, 0xbb, + 0xa2, 0x39, 0x74, 0xbb, 0xd0, 0x62, 0x80, 0xbb, 0xde, 0x5f, 0xfe, 0xba, 0x8b, 0x50, 0x02, 0xbb, 0x69, 0x32, 0x27, 0xba, 0x12, 0x3d, 0xb3, 0xba, 0xf2, 0x59, 0x69, 0xbb, 0x32, 0x15, 0xad, 0xba, + 0x1a, 0x03, 0x26, 0xbb, 0x6c, 0x32, 0x33, 0xbb, 0x20, 0xa7, 0x1c, 0xbb, 0x92, 0x80, 0x2a, 0xbb, 0xeb, 0xaf, 0x96, 0xbb, 0xb0, 0xc5, 0x2b, 0xbb, 0x1f, 0x1d, 0x63, 0xbb, 0x4a, 0x5e, 0x41, 0xbb, + 0x7a, 0x1e, 0x74, 0xbb, 0x27, 0x25, 0xdf, 0xba, 0xdd, 0x83, 0x01, 0xbb, 0xba, 0x7b, 0x92, 0xb9, 0x9a, 0x99, 0x8f, 0xbb, 0xef, 0xf1, 0x0f, 0xbb, 0xfb, 0x3b, 0x85, 0xbb, 
0x74, 0x10, 0xe7, 0xba, + 0xdc, 0xb1, 0x63, 0xb9, 0xde, 0x00, 0x2a, 0xbb, 0x36, 0xc8, 0x30, 0xbb, 0x55, 0xd6, 0x0a, 0xbb, 0x82, 0x3f, 0xed, 0xb9, 0x34, 0x29, 0x33, 0xbb, 0x0f, 0x44, 0xd2, 0xba, 0x0f, 0x7f, 0x2a, 0xbb, + 0x0d, 0x2f, 0xd0, 0xba, 0x4e, 0xc4, 0x6e, 0xbb, 0x71, 0x03, 0x41, 0xbb, 0x59, 0x31, 0x00, 0xbb, 0xac, 0x6b, 0x18, 0xbb, 0x08, 0xea, 0x95, 0xbb, 0xd2, 0x15, 0x13, 0xbb, 0x4a, 0x46, 0x42, 0xbb, + 0xf3, 0x1a, 0xd6, 0xba, 0x8b, 0x1b, 0xec, 0xba, 0x33, 0xbc, 0x55, 0xbb, 0xc9, 0x74, 0xfc, 0xba, 0xdc, 0x5f, 0x38, 0xbb, 0x6a, 0x54, 0x51, 0xbb, 0xbb, 0xed, 0x1d, 0xbb, 0x78, 0x41, 0x67, 0xbb, + 0x8c, 0x0c, 0xc3, 0xba, 0xb6, 0x92, 0xec, 0xbb, 0x87, 0x1c, 0x54, 0xbb, 0xec, 0xc6, 0x1c, 0xbb, 0x8f, 0x00, 0x51, 0xbb, 0x4d, 0xed, 0x1b, 0xbb, 0x04, 0xd1, 0x84, 0xbb, 0xa7, 0x6f, 0x96, 0xbb, + 0x16, 0x49, 0x1f, 0xbb, 0x5b, 0x9a, 0xab, 0xba, 0x07, 0xa8, 0xc5, 0xba, 0x82, 0x99, 0x67, 0xbb, 0x2e, 0xcc, 0x25, 0xbe, 0x12, 0x82, 0x2d, 0xbe, 0x5c, 0x7f, 0x42, 0xbe, 0xaf, 0xfc, 0xf2, 0xbd, + 0x77, 0x5a, 0x2e, 0xbe, 0x11, 0xa3, 0x00, 0xbe, 0xfe, 0xdf, 0xf4, 0xbd, 0x1a, 0x95, 0xf8, 0xbd, 0xef, 0xa7, 0x29, 0xbe, 0x52, 0x76, 0x33, 0xbe, 0xb2, 0x4b, 0x40, 0xbe, 0x3c, 0x2b, 0x08, 0xbe, + 0x46, 0xd9, 0xe1, 0xbd, 0xc8, 0x6e, 0x32, 0xbe, 0x18, 0x98, 0x17, 0xbe, 0x6a, 0x99, 0x2b, 0xbe, 0xef, 0x45, 0xeb, 0xbd, 0x84, 0x7b, 0x17, 0xbe, 0x5e, 0xdd, 0xb7, 0xbd, 0xae, 0xe8, 0x0f, 0xbe, + 0xd5, 0x36, 0x2e, 0xbe, 0xaa, 0x12, 0x62, 0xbe, 0x70, 0x52, 0x14, 0xbe, 0x32, 0xee, 0xe9, 0xbd, 0x92, 0xe9, 0x6d, 0xbe, 0x5d, 0xf6, 0x24, 0xbe, 0xc2, 0x28, 0x1c, 0xbe, 0xec, 0x7e, 0x29, 0xbe, + 0xfe, 0x7a, 0x1d, 0xbe, 0x68, 0x68, 0x38, 0xbe, 0x01, 0xea, 0x96, 0xbd, 0x82, 0xe1, 0x3a, 0xbe, 0xc0, 0x82, 0xf1, 0xbd, 0x16, 0x31, 0x37, 0xbe, 0x20, 0x1b, 0x19, 0xbe, 0xc8, 0x9e, 0xf5, 0x3b, + 0xa9, 0xd3, 0x0a, 0xbe, 0x94, 0x72, 0x0c, 0xbe, 0x1b, 0x08, 0xb1, 0xbd, 0x2d, 0xa7, 0x86, 0xbd, 0x00, 0xcc, 0x67, 0xbe, 0x02, 0xdf, 0x43, 0xbe, 0x06, 0xc0, 0x21, 0xbe, 0x4e, 0x7c, 0x55, 0xbe, + 0x13, 0x8b, 0x50, 0xbe, 0x3a, 
0x1d, 0xd8, 0xbd, 0x46, 0x51, 0x0c, 0xbe, 0x3f, 0x33, 0x16, 0xbe, 0xd8, 0x58, 0x56, 0xbe, 0x3b, 0xf2, 0x64, 0xbe, 0x76, 0x24, 0x32, 0xbe, 0xee, 0xcf, 0xf5, 0xbd, + 0xd5, 0xe8, 0x89, 0xbd, 0x65, 0xb2, 0x1d, 0xbe, 0xd8, 0xec, 0x8a, 0xbe, 0x48, 0xad, 0x80, 0xbe, 0x14, 0xd3, 0x4b, 0xbe, 0x8b, 0xca, 0x25, 0xbe, 0x66, 0xa6, 0x64, 0xbe, 0x46, 0xe4, 0x76, 0xbe, + 0x26, 0x99, 0x7f, 0xbe, 0xba, 0x92, 0xff, 0xbd, 0xbd, 0x76, 0x07, 0xbe, 0x3c, 0x9c, 0x15, 0xbe, 0xca, 0x05, 0x26, 0xbd, 0x96, 0x32, 0x8f, 0xbd, 0x2b, 0xf3, 0x70, 0xbd, 0xc4, 0x83, 0x45, 0xbd, + 0x10, 0xca, 0xac, 0xbc, 0x77, 0x98, 0xb2, 0xbd, 0x55, 0x8e, 0xce, 0xbd, 0xff, 0x73, 0x17, 0xbe, 0x07, 0x8c, 0x9b, 0xbd, 0x26, 0xf0, 0xc9, 0xbc, 0x01, 0x80, 0x07, 0xbe, 0xee, 0xe8, 0xc5, 0xbd, + 0x48, 0x38, 0xca, 0xbd, 0x22, 0x32, 0xd0, 0xbd, 0x2a, 0x58, 0x81, 0xbd, 0x0b, 0xc9, 0x88, 0xbd, 0xff, 0x11, 0xf5, 0xbd, 0xbc, 0x3c, 0xc1, 0xbd, 0x14, 0x30, 0xc9, 0xbd, 0x94, 0xf7, 0xa0, 0xbd, + 0x16, 0x98, 0x23, 0xbd, 0x2a, 0xba, 0x36, 0xbd, 0xd3, 0x7b, 0xd8, 0xbd, 0x70, 0xf0, 0x55, 0xbd, 0x84, 0x9c, 0xd2, 0xbd, 0x6e, 0x59, 0x45, 0xbd, 0xda, 0x87, 0x04, 0xbe, 0xb6, 0x2d, 0x31, 0xbd, + 0x80, 0xfb, 0x43, 0x3b, 0x04, 0x92, 0x98, 0xbc, 0x7c, 0xd2, 0x24, 0xbe, 0xb4, 0x57, 0x77, 0xbd, 0x20, 0x01, 0x78, 0xba, 0x5d, 0x1c, 0x81, 0xbd, 0x96, 0xe9, 0xaf, 0xbd, 0x7e, 0x92, 0x87, 0xbd, + 0xb0, 0x4e, 0x5c, 0x3b, 0x04, 0xe5, 0x92, 0xbd, 0x4f, 0xb7, 0xf3, 0xbd, 0xa1, 0x43, 0xa1, 0xbd, 0xb7, 0x99, 0xd9, 0xbd, 0x49, 0xdd, 0x19, 0xbe, 0xf2, 0xb5, 0xc9, 0xbd, 0x44, 0x29, 0x85, 0xbc, + 0xa2, 0x82, 0xc8, 0xbd, 0x0c, 0x63, 0xd4, 0xbd, 0x8e, 0xd9, 0xdd, 0xbd, 0x60, 0xe2, 0xf3, 0xbb, 0xf2, 0x9c, 0xa4, 0xbd, 0x8a, 0x20, 0xbf, 0xbc, 0xbc, 0xc8, 0xdd, 0xbd, 0x0b, 0xd3, 0xde, 0xbd, + 0x20, 0x87, 0x29, 0x3c, 0xca, 0x7a, 0x28, 0xbe, 0x73, 0x08, 0xac, 0x3c, 0x3b, 0xed, 0xdb, 0xbd, 0x27, 0x88, 0x30, 0xbe, 0x1e, 0x16, 0xb1, 0xbd, 0x64, 0x2f, 0x65, 0xbd, 0xc7, 0xe7, 0x0a, 0xbe, + 0x0e, 0xf5, 0x98, 0xbd, 0x6a, 0x10, 0x22, 0xbd, 0x70, 0xa4, 0x86, 0xbc, 0x83, 0x8f, 0xf5, 
0xbd, 0xf8, 0xa7, 0x4f, 0x3d, 0xb9, 0x0d, 0x01, 0x3d, 0xc2, 0x54, 0x5b, 0x3d, 0xa4, 0x3e, 0x85, 0x3d, + 0x8b, 0x64, 0xa2, 0x3c, 0x69, 0x10, 0x1d, 0x3d, 0x46, 0x7f, 0x83, 0x3d, 0x78, 0xc9, 0x71, 0x3d, 0x97, 0xba, 0xb8, 0x3d, 0x02, 0xce, 0x25, 0x3d, 0x8f, 0x8b, 0x5f, 0x3d, 0xdb, 0x90, 0x28, 0x3d, + 0x5a, 0x14, 0x76, 0x3d, 0x07, 0x4a, 0x86, 0x3d, 0x65, 0x8e, 0xf4, 0x3c, 0xe0, 0x24, 0x5a, 0x3d, 0x1a, 0x13, 0x1f, 0x3d, 0x94, 0x74, 0x6c, 0x3d, 0x7f, 0x47, 0x26, 0x3d, 0x32, 0x09, 0xdc, 0x3b, + 0xce, 0xce, 0xec, 0x3c, 0xba, 0x01, 0x52, 0x3d, 0x92, 0xa4, 0x75, 0x3d, 0x9e, 0x5d, 0x24, 0x3d, 0x63, 0xa0, 0xb4, 0x3d, 0x82, 0x38, 0xd7, 0x3c, 0xea, 0x29, 0x02, 0x3d, 0x92, 0x68, 0x3d, 0x3d, + 0xb3, 0xb5, 0xf1, 0x3c, 0xc1, 0xe7, 0x18, 0x3d, 0x04, 0xd3, 0x2b, 0x3d, 0xc4, 0x84, 0x01, 0x3d, 0xc7, 0xc6, 0x08, 0x3d, 0xa6, 0x88, 0x1d, 0x3d, 0x3c, 0x6e, 0x31, 0x3d, 0x74, 0x0a, 0x84, 0x3c, + 0x3e, 0x82, 0x2e, 0x3d, 0x0a, 0x1f, 0xdb, 0x3c, 0xc8, 0xae, 0x6d, 0x3d, 0x16, 0xaa, 0x27, 0x3d, 0xfc, 0x01, 0x84, 0x3d, 0xa4, 0xa7, 0x88, 0x3d, 0x8a, 0xb0, 0x85, 0x3d, 0xd4, 0x52, 0x50, 0x3d, + 0xbf, 0xf3, 0x77, 0x3d, 0x45, 0xd9, 0x31, 0x3d, 0xc2, 0x2e, 0x51, 0x3d, 0x3a, 0x45, 0x05, 0x3b, 0x6c, 0x9e, 0x72, 0x3d, 0xa2, 0x7f, 0x13, 0x3d, 0xcc, 0x07, 0x5c, 0x3d, 0x53, 0x48, 0xa6, 0x3c, + 0xc0, 0xae, 0x72, 0x3c, 0x3e, 0x9d, 0x6d, 0x3d, 0x20, 0x1e, 0xbc, 0x3a, 0xc7, 0xf1, 0x87, 0x3d, 0x9a, 0x7c, 0xbb, 0x3d, 0x05, 0x8f, 0x51, 0x3d, 0xe6, 0x74, 0x42, 0x3d, 0xa4, 0xde, 0xac, 0x3d, + 0x18, 0x03, 0xb0, 0x3d, 0x14, 0x8a, 0xc4, 0x3c, 0xc8, 0x1b, 0x01, 0x3d, 0x98, 0x11, 0x7c, 0x3d, 0x0e, 0xd1, 0x61, 0x3c, 0x84, 0x58, 0x08, 0x3c, 0x7c, 0x8a, 0xae, 0x3b, 0x6c, 0xd1, 0x0d, 0xbc, + 0x1e, 0x63, 0xf4, 0x3c, 0x83, 0x71, 0x12, 0x3c, 0x9e, 0x31, 0x95, 0x3c, 0xd4, 0xe7, 0xc2, 0x3c, 0xf0, 0x96, 0x2f, 0x3b, 0xbe, 0xa5, 0xde, 0x3b, 0xfb, 0x9c, 0x43, 0x3c, 0x51, 0xb2, 0xf9, 0xbb, + 0x86, 0x5b, 0x8e, 0x3c, 0x1c, 0x62, 0x87, 0x3c, 0x55, 0x90, 0x09, 0x3b, 0xbf, 0x34, 0x18, 0x3c, 0x14, 0x49, 0x33, 0x3d, 0xc6, 0x8f, 0x23, 0x3c, 0x14, 
0xbc, 0xff, 0x3c, 0x8e, 0xf8, 0xe4, 0x3c, + 0xe2, 0x15, 0x12, 0x3d, 0x14, 0x18, 0x80, 0x3c, 0x2a, 0xa3, 0xad, 0x3b, 0x5b, 0x54, 0x37, 0xbc, 0x11, 0x22, 0x1d, 0x3d, 0x78, 0x09, 0xd1, 0x3b, 0x90, 0x49, 0x51, 0x3d, 0x70, 0xfe, 0x7f, 0x3c, + 0x3e, 0xea, 0x22, 0xbc, 0xe0, 0x7b, 0xc3, 0x3b, 0x84, 0x86, 0x1a, 0x3d, 0xa4, 0xc4, 0xc1, 0x3a, 0xea, 0xa9, 0xf8, 0xbb, 0x45, 0x91, 0xdc, 0x3c, 0xff, 0x10, 0x14, 0x3d, 0x18, 0x13, 0x2b, 0xbb, + 0x10, 0xf4, 0x09, 0xbb, 0xb9, 0xd5, 0xac, 0x3b, 0x02, 0x77, 0xa9, 0x3c, 0x69, 0xcc, 0x11, 0x3d, 0xe0, 0x05, 0xb8, 0x3c, 0xd4, 0xb8, 0xfb, 0x3c, 0x9d, 0x47, 0xdf, 0x3c, 0xf0, 0xe7, 0x90, 0x3c, + 0xb4, 0x40, 0xc7, 0x3c, 0x3e, 0xbc, 0x54, 0x3c, 0x2e, 0xdb, 0xd7, 0x3c, 0x72, 0xdb, 0xdf, 0x3b, 0x06, 0x03, 0x22, 0x3d, 0xa4, 0xbd, 0x70, 0x3c, 0xb8, 0x09, 0xe9, 0x3c, 0x4d, 0x40, 0xc1, 0x3c, + 0xd2, 0xea, 0xe3, 0xbc, 0x1c, 0x86, 0x50, 0x3d, 0xef, 0x68, 0xf9, 0x3c, 0xc1, 0x64, 0x03, 0x3d, 0x70, 0x87, 0x0a, 0x3d, 0x06, 0x9b, 0x8f, 0x3c, 0xab, 0xd6, 0x6a, 0x3c, 0x10, 0x5e, 0x29, 0x3d, + 0x35, 0x81, 0x92, 0x3c, 0x4c, 0x87, 0x55, 0x3d, 0x00, 0xd5, 0xb4, 0xb8, 0xbf, 0xd2, 0x0c, 0x3d, 0x10, 0x7f, 0xe3, 0xbb, 0xf2, 0x40, 0x38, 0xbb, 0xca, 0xab, 0xcf, 0xbb, 0x54, 0xef, 0xfc, 0xbb, + 0x8c, 0x8c, 0x76, 0xbb, 0x97, 0xbd, 0x57, 0xbb, 0xce, 0x44, 0xeb, 0xbb, 0x54, 0x93, 0xa7, 0xbb, 0xec, 0x63, 0x36, 0xbc, 0x47, 0x2c, 0xb7, 0xbb, 0x3b, 0x48, 0x8f, 0xbb, 0x09, 0x7c, 0x28, 0xbb, + 0x94, 0xea, 0xd7, 0xbb, 0x38, 0x55, 0xee, 0xbb, 0xda, 0x60, 0x22, 0xbb, 0x92, 0x0a, 0xcc, 0xbb, 0x4a, 0xe6, 0x83, 0xbb, 0xf3, 0x0a, 0xc5, 0xbb, 0xc6, 0x61, 0x8f, 0xbb, 0xe0, 0x95, 0x61, 0x39, + 0xd0, 0xe4, 0x9c, 0xbb, 0x51, 0xe6, 0xe4, 0xbb, 0xce, 0xfe, 0xbb, 0xbb, 0xf8, 0xb5, 0x73, 0xbb, 0xf2, 0x03, 0x40, 0xbc, 0x66, 0xea, 0x2d, 0xbb, 0x3c, 0x17, 0x45, 0xbb, 0x56, 0xa4, 0xcd, 0xbb, + 0x30, 0x00, 0x84, 0xbb, 0xf8, 0xea, 0xac, 0xbb, 0xc2, 0x60, 0x38, 0xbb, 0xd8, 0xc5, 0x37, 0xbb, 0x4c, 0xaa, 0x96, 0xbb, 0x81, 0x4b, 0xa4, 0xbb, 0xc5, 0x52, 0xb2, 0xbb, 0xc0, 0x1d, 0xa9, 0xb8, + 0x2b, 0xa2, 
0xd1, 0xbb, 0x36, 0x15, 0xf7, 0xba, 0x60, 0x21, 0xbd, 0xbb, 0x55, 0x82, 0xad, 0xbb, 0xd4, 0x91, 0xed, 0xbb, 0xa8, 0x31, 0xd6, 0xbb, 0x4b, 0xf7, 0x00, 0xbc, 0x79, 0x9b, 0x02, 0xbc, + 0x4d, 0x74, 0xe6, 0xbb, 0x83, 0xd6, 0x72, 0xbb, 0xda, 0xde, 0xaf, 0xbb, 0x4a, 0xf0, 0xd0, 0xb9, 0xac, 0x13, 0x06, 0xbc, 0xb8, 0x96, 0xaf, 0xbb, 0xfa, 0x5b, 0xc0, 0xbb, 0xfe, 0xa3, 0x1c, 0xba, + 0xbd, 0x17, 0x9a, 0xba, 0xc2, 0xd5, 0xbc, 0xbb, 0x8e, 0xbf, 0x1f, 0xbb, 0xd2, 0xc3, 0x02, 0xbc, 0x20, 0x8a, 0x1e, 0xbc, 0xd6, 0xbe, 0xb9, 0xbb, 0x3e, 0xb7, 0xc3, 0xbb, 0xf8, 0x3b, 0x27, 0xbc, + 0x8e, 0x13, 0x39, 0xbc, 0x31, 0x1e, 0xa0, 0xbb, 0x2e, 0xd6, 0x88, 0xbb, 0x9a, 0xd8, 0xe3, 0xbb, 0xc9, 0x11, 0x55, 0xbb, 0x94, 0x39, 0x01, 0xba, 0x1d, 0xa4, 0xc6, 0xba, 0x70, 0x13, 0xb6, 0xb9, + 0x81, 0x12, 0x9c, 0xbb, 0x80, 0x1f, 0xb8, 0xb9, 0x06, 0x1a, 0x29, 0xbb, 0xc2, 0x6e, 0xd0, 0xba, 0xe2, 0xfc, 0x1b, 0xbb, 0x7e, 0x96, 0x11, 0xbb, 0x00, 0x65, 0xe4, 0xb8, 0x02, 0x29, 0x17, 0x3b, + 0xcf, 0xfd, 0x17, 0xbb, 0x3c, 0x3f, 0x1b, 0xbb, 0x12, 0x15, 0x9c, 0x39, 0x7a, 0xde, 0xef, 0xba, 0x78, 0x11, 0x94, 0xbb, 0xf4, 0x5c, 0xa2, 0xba, 0x04, 0x4e, 0x61, 0xbb, 0x12, 0x24, 0xfa, 0xba, + 0x00, 0x0a, 0xb3, 0xbb, 0x87, 0x43, 0x60, 0xbb, 0x30, 0x84, 0x8d, 0xb9, 0x50, 0x36, 0xb9, 0x3a, 0x1e, 0xbc, 0xdc, 0xbb, 0x88, 0x2f, 0x2e, 0xba, 0x22, 0xd8, 0xa1, 0xbb, 0xbc, 0xa0, 0x52, 0xbb, + 0x00, 0xf7, 0xae, 0xb8, 0xb3, 0x48, 0x0c, 0xbb, 0x27, 0xb3, 0x1c, 0xbb, 0xf8, 0x4b, 0x5c, 0x39, 0x7c, 0x05, 0x03, 0xba, 0x64, 0xfd, 0x7c, 0xbb, 0xfa, 0xd4, 0x9a, 0xbb, 0x16, 0xe9, 0xea, 0x3a, + 0xe0, 0x83, 0xfc, 0xba, 0x41, 0x82, 0x89, 0x39, 0x4e, 0x8a, 0x05, 0xbb, 0x31, 0x04, 0x9c, 0xbb, 0xcc, 0xee, 0x47, 0xbb, 0x56, 0x78, 0x40, 0xbb, 0x2a, 0x57, 0x85, 0xbb, 0xa9, 0x84, 0x98, 0xbb, + 0x5e, 0x96, 0x5f, 0xbb, 0x1a, 0x7f, 0x2d, 0xba, 0xd4, 0xbb, 0x3d, 0xbb, 0x36, 0x3f, 0x66, 0xba, 0x06, 0xab, 0xcb, 0xbb, 0xe3, 0x79, 0x53, 0xbb, 0xa7, 0x19, 0x5b, 0xbb, 0xba, 0xf0, 0x72, 0xba, + 0x4a, 0x5c, 0x4a, 0x3b, 0x0d, 0x90, 0xa5, 0xbb, 0x66, 0xef, 0xae, 0xbb, 
0x34, 0x22, 0x95, 0xbb, 0xe2, 0xd8, 0x7f, 0xbb, 0xdc, 0x14, 0x17, 0xbb, 0x83, 0x3c, 0x2e, 0xbb, 0xf4, 0x5e, 0xc1, 0xbb, + 0xa3, 0x22, 0x93, 0xbb, 0x33, 0x53, 0xfc, 0xbb, 0xb0, 0x11, 0x85, 0xba, 0x86, 0xd9, 0x8a, 0xbb, 0x58, 0x93, 0x1d, 0xbb, 0xa7, 0xfc, 0x12, 0xbb, 0x09, 0x7d, 0x3f, 0xbb, 0xb7, 0xa7, 0x5b, 0xbb, + 0x4e, 0x45, 0x3d, 0xba, 0xd2, 0x2a, 0x36, 0xbb, 0x58, 0x0a, 0x7d, 0xbb, 0xd8, 0x90, 0x8f, 0xbb, 0x16, 0x38, 0x97, 0xbb, 0xf6, 0xae, 0xf1, 0xba, 0x9d, 0x70, 0x86, 0xbb, 0x12, 0xef, 0x54, 0xbb, + 0x35, 0x43, 0x71, 0xbb, 0xbf, 0x52, 0x81, 0xbb, 0x89, 0xa1, 0x0c, 0xbb, 0x72, 0x3a, 0x44, 0xbb, 0xb6, 0x4a, 0x3e, 0xbb, 0x49, 0xc1, 0x6c, 0xbb, 0x89, 0x7e, 0x35, 0xbb, 0xd2, 0xe8, 0x9e, 0xba, + 0x93, 0xe6, 0xaa, 0xba, 0x02, 0xec, 0x21, 0xbb, 0x5a, 0x40, 0x81, 0xbb, 0x1b, 0xd6, 0x24, 0xbb, 0x5d, 0xd3, 0x94, 0xbb, 0xff, 0xee, 0xdf, 0xba, 0x59, 0x9b, 0x31, 0xbb, 0xcf, 0x17, 0x14, 0xbb, + 0xdc, 0xc8, 0x9a, 0xba, 0xca, 0x65, 0xd6, 0xba, 0xf6, 0xff, 0x77, 0xbb, 0x49, 0x75, 0x0e, 0xbb, 0xb4, 0xa1, 0xb2, 0xba, 0x97, 0x52, 0x0d, 0xbb, 0x42, 0xaa, 0x2a, 0xbb, 0x9b, 0x1c, 0xe4, 0xba, + 0x0a, 0x84, 0xcd, 0xba, 0x61, 0xc2, 0x0b, 0xbb, 0x22, 0xa5, 0x7f, 0xbb, 0x4e, 0x72, 0x1d, 0xbb, 0xc2, 0x07, 0x80, 0xbb, 0xc3, 0x77, 0x97, 0xbb, 0xba, 0xad, 0x75, 0xbb, 0x23, 0x42, 0x00, 0xbb, + 0xeb, 0xc8, 0x6b, 0xbb, 0x71, 0x58, 0x51, 0xbb, 0x6f, 0x3e, 0x5e, 0xbb, 0x60, 0xfb, 0xf0, 0xb8, 0x6c, 0xf6, 0x48, 0xbb, 0x36, 0xe4, 0xcb, 0xba, 0xd5, 0xff, 0x62, 0xbb, 0x39, 0x66, 0x15, 0xbb, + 0xad, 0x91, 0x3f, 0xba, 0xa6, 0x49, 0x8c, 0xbb, 0xf3, 0x67, 0x31, 0x3a, 0x1a, 0x26, 0x7e, 0xbb, 0x19, 0xca, 0xc0, 0xbb, 0x1c, 0xd2, 0x4d, 0xbb, 0x50, 0x0c, 0x26, 0xbb, 0x32, 0xde, 0xa0, 0xbb, + 0x14, 0x12, 0x8b, 0xbb, 0x90, 0xe4, 0x6d, 0xba, 0xe2, 0xf9, 0xbe, 0xba, 0xd0, 0x0a, 0x7e, 0xbb, 0xb2, 0x74, 0x12, 0xba, 0xc2, 0x06, 0x8e, 0xba, 0x1e, 0xe9, 0x05, 0xba, 0x32, 0x33, 0xaf, 0x39, + 0x1d, 0x3b, 0xa0, 0xba, 0xfc, 0x9b, 0xab, 0xba, 0x1b, 0xa5, 0xcc, 0xba, 0x0d, 0xaa, 0x2b, 0xbb, 0x58, 0x6c, 0x7a, 0xb9, 0x32, 0x67, 
0x27, 0xb9, 0x93, 0x2a, 0x01, 0xbb, 0x32, 0x72, 0x54, 0xba, + 0x13, 0xaf, 0xca, 0xba, 0xc9, 0xdf, 0xc3, 0xba, 0x2c, 0xbf, 0x46, 0xba, 0xfc, 0x9d, 0x50, 0xba, 0xe4, 0x75, 0x4e, 0xbb, 0x26, 0x9b, 0xa0, 0xba, 0xc5, 0xc3, 0x16, 0xbb, 0x1d, 0x0a, 0x15, 0xbb, + 0xf2, 0x30, 0xd7, 0xba, 0xd2, 0x9d, 0x34, 0xba, 0x52, 0xb4, 0x9e, 0xba, 0x5e, 0x9d, 0x54, 0x38, 0xf6, 0x5a, 0x06, 0xbb, 0xf7, 0x20, 0x3b, 0xba, 0xdd, 0xe7, 0x70, 0xbb, 0x5e, 0x0f, 0x3f, 0xba, + 0x97, 0x7f, 0x4f, 0x3a, 0xd0, 0x88, 0x8d, 0xb8, 0x28, 0x29, 0x6a, 0xbb, 0xf4, 0xbf, 0x2e, 0xba, 0x0c, 0xe3, 0x33, 0x3a, 0x90, 0x0b, 0xcf, 0xba, 0x77, 0x2c, 0x13, 0xbb, 0x46, 0xbe, 0x52, 0xba, + 0xb9, 0xfc, 0x2e, 0x3a, 0xdf, 0xc9, 0x8a, 0xba, 0xe9, 0x48, 0x05, 0xbb, 0x2a, 0xf4, 0x0b, 0xbb, 0x2a, 0xea, 0xec, 0xba, 0x07, 0xfa, 0x38, 0xbb, 0x9c, 0x2c, 0xf2, 0xba, 0xd3, 0x7e, 0x9c, 0xb9, + 0x9a, 0x3f, 0xea, 0xba, 0x26, 0x3a, 0xdb, 0xba, 0x7f, 0x3d, 0x0d, 0xbb, 0xe9, 0x05, 0xba, 0xb9, 0x1a, 0x7a, 0x08, 0xbb, 0x4e, 0x61, 0x00, 0xba, 0xaf, 0x30, 0x10, 0xbb, 0xd1, 0x2f, 0x20, 0xbb, + 0x51, 0x9f, 0xb7, 0x3a, 0xcc, 0xa6, 0x80, 0xbb, 0x86, 0xd6, 0x4a, 0xba, 0xaa, 0xc0, 0x0d, 0xbb, 0x63, 0x6c, 0x44, 0xbb, 0x66, 0x3b, 0xbf, 0xba, 0x4c, 0xcd, 0x65, 0xba, 0x68, 0xa3, 0x34, 0xbb, + 0xc0, 0x7e, 0x5c, 0xba, 0x26, 0xae, 0x17, 0xbb, 0x04, 0x4d, 0x01, 0x39, 0xbc, 0x0b, 0x25, 0xbb, 0xde, 0x1c, 0xd6, 0xbd, 0xfc, 0x56, 0xa1, 0xbc, 0x3d, 0xd1, 0x96, 0xbd, 0x29, 0x28, 0xa1, 0xbd, + 0xcb, 0x9d, 0xb9, 0xbd, 0xae, 0x27, 0x9d, 0xbc, 0xf1, 0x3b, 0xad, 0xbd, 0x1f, 0x7a, 0x1d, 0xbd, 0x34, 0x38, 0x08, 0xbe, 0x07, 0x5b, 0xa7, 0xbd, 0x94, 0x04, 0x8e, 0xbc, 0x2c, 0x8c, 0xad, 0x3c, + 0x18, 0xb2, 0x9a, 0xbd, 0x9e, 0x1b, 0xaa, 0xbd, 0xa8, 0x20, 0x0f, 0xbc, 0x47, 0x6f, 0x97, 0xbd, 0xb4, 0x95, 0x87, 0xbd, 0xb6, 0x8d, 0x6d, 0xbd, 0x27, 0xba, 0x81, 0xbd, 0x80, 0x3c, 0x1d, 0x3a, + 0xe7, 0xc0, 0xd7, 0xbd, 0x68, 0xf2, 0xd8, 0xbd, 0x86, 0x90, 0x28, 0xbd, 0xfc, 0x0a, 0x53, 0xbc, 0x57, 0x49, 0x3a, 0xbe, 0x9e, 0x21, 0xcf, 0xbc, 0x6c, 0xa4, 0x69, 0xbd, 0x58, 0x7e, 0xc4, 0xbd, 
+ 0x5e, 0x88, 0x40, 0xbd, 0xa4, 0x5d, 0xa1, 0xbd, 0x06, 0xff, 0x95, 0xbc, 0x60, 0xcb, 0x66, 0xbc, 0x70, 0xef, 0x69, 0xbd, 0x14, 0x2f, 0xac, 0xbd, 0x8d, 0x45, 0xbe, 0xbd, 0xe9, 0xa7, 0x12, 0x3d, + 0xd2, 0x81, 0xc1, 0xbd, 0x00, 0x80, 0xaa, 0xb9, 0x86, 0x00, 0x6e, 0xbd, 0xae, 0x61, 0xbf, 0xbd, 0x0f, 0xee, 0xb6, 0xbd, 0x94, 0x56, 0x8d, 0xbd, 0x8e, 0xaf, 0xe0, 0xbd, 0xaa, 0x4a, 0x0d, 0xbe, + 0xfa, 0x83, 0xbe, 0xbd, 0x4a, 0x19, 0xbe, 0xbc, 0x56, 0xe6, 0x86, 0xbd, 0xa0, 0xdc, 0x4c, 0xbc, 0x57, 0x4c, 0x11, 0xbe, 0xca, 0xdc, 0xb9, 0xbd, 0x81, 0xd5, 0x9c, 0xbd, 0x84, 0xb1, 0x24, 0x3c, + 0x31, 0x08, 0x9b, 0x3c, 0x31, 0xf2, 0xa4, 0xbd, 0x32, 0xea, 0xbd, 0xbd, 0x1e, 0xc4, 0xe9, 0xbd, 0x6a, 0xa0, 0xe0, 0xbd, 0xfc, 0x71, 0x8b, 0xbd, 0x31, 0x38, 0xaa, 0xbd, 0x9a, 0xd7, 0x16, 0xbe, + 0x4b, 0xbd, 0x22, 0xbe, 0xd6, 0xde, 0x06, 0xbe, 0xd9, 0x60, 0x5e, 0xbd, 0x5e, 0x71, 0xc4, 0xbd, 0xb3, 0x8f, 0x86, 0xbd, 0xf5, 0x5d, 0x98, 0xbd, 0x6f, 0xbb, 0x6f, 0xbd, 0x02, 0x12, 0x10, 0xbe, + 0xc2, 0x7f, 0x9a, 0xbd, 0xb9, 0xa9, 0xa8, 0xbd, 0xe2, 0xaf, 0x19, 0xbd, 0xb3, 0xbe, 0xf3, 0xbc, 0xd1, 0x63, 0x3b, 0xbd, 0xca, 0x2e, 0x76, 0xbd, 0x84, 0x38, 0x97, 0xbd, 0xd2, 0x56, 0xd7, 0xbd, + 0x66, 0x62, 0xd4, 0xbd, 0x04, 0x7c, 0x86, 0xbd, 0xa1, 0xe0, 0x89, 0xbd, 0xc0, 0xac, 0x15, 0xbd, 0x1e, 0x69, 0xcd, 0xbd, 0x1c, 0x54, 0x1a, 0xbd, 0x7d, 0xa8, 0xc3, 0xbd, 0xc6, 0xe4, 0xca, 0xbd, + 0x13, 0xc4, 0xdb, 0xbd, 0xef, 0x17, 0xba, 0xbd, 0xec, 0x99, 0x49, 0xbd, 0xba, 0x20, 0xef, 0xbc, 0x61, 0xb2, 0xe5, 0xbd, 0x64, 0x5f, 0x87, 0xbd, 0x53, 0x3c, 0x9a, 0xbd, 0x54, 0xfb, 0xf4, 0xbc, + 0x37, 0xaf, 0x87, 0xbd, 0x29, 0xcc, 0xd7, 0xbd, 0xa6, 0xec, 0x66, 0xbd, 0x19, 0x1e, 0xa9, 0xbd, 0x47, 0xd9, 0xaa, 0xbd, 0x40, 0x62, 0x94, 0xbd, 0x7b, 0x86, 0x1f, 0xbd, 0xce, 0xce, 0x75, 0xbd, + 0xef, 0x43, 0xa4, 0xbd, 0x21, 0x46, 0xc1, 0xbd, 0x83, 0xf2, 0xb4, 0xbd, 0xd9, 0x66, 0x90, 0xbd, 0x53, 0x54, 0xb4, 0xbd, 0xe6, 0x4e, 0x93, 0xbd, 0x88, 0xed, 0x6d, 0xbd, 0x6c, 0xf3, 0xe3, 0xbd, + 0xda, 0xcf, 0x80, 0xbd, 0x4a, 0x1b, 0x9a, 0xbd, 0x5c, 
0xbf, 0x8e, 0xbd, 0xe5, 0x3b, 0x81, 0xbd, 0x52, 0x90, 0x3b, 0xbd, 0x80, 0x39, 0xb6, 0xbd, 0xe0, 0x0d, 0x65, 0xbd, 0x30, 0x99, 0xc0, 0xbd, + 0x5a, 0x30, 0xdf, 0xbd, 0x40, 0xb5, 0xb1, 0xbd, 0xf4, 0xd3, 0x6b, 0xbd, 0xb8, 0xfd, 0xb4, 0xbd, 0xb6, 0xd1, 0x33, 0xbd, 0xfc, 0xa0, 0x62, 0xbd, 0x18, 0x0e, 0xae, 0xbd, 0xdc, 0xf6, 0x52, 0xbd, + 0xd0, 0xfd, 0xbf, 0xbd, 0x0e, 0xdc, 0x61, 0xbd, 0xb0, 0xc1, 0x7e, 0xbd, 0x80, 0x12, 0xab, 0xbd, 0xcc, 0x62, 0x35, 0x3d, 0xf7, 0x46, 0x26, 0x3d, 0x37, 0xa8, 0x37, 0x3d, 0x80, 0xba, 0x88, 0x3d, + 0x90, 0x48, 0x4f, 0x3d, 0xbe, 0x26, 0x54, 0x3d, 0x60, 0x3e, 0x23, 0x3d, 0x1a, 0xe2, 0x42, 0x3d, 0xa4, 0x0e, 0x54, 0x3d, 0x1d, 0xe8, 0x3c, 0x3d, 0xf4, 0x79, 0x67, 0x3d, 0x63, 0x7d, 0x42, 0x3d, + 0x74, 0x6f, 0x4b, 0x3d, 0xc6, 0x08, 0x15, 0x3d, 0xd3, 0x64, 0x35, 0x3d, 0x04, 0x98, 0x48, 0x3d, 0x2c, 0x01, 0x45, 0x3d, 0x80, 0xf8, 0x08, 0x3d, 0xe6, 0x2d, 0x45, 0x3d, 0xff, 0x1d, 0x50, 0x3d, + 0x46, 0xf8, 0x5a, 0x3d, 0x37, 0xe4, 0x8e, 0x3d, 0xab, 0xd3, 0x58, 0x3d, 0x35, 0xab, 0x0b, 0x3d, 0x32, 0x16, 0x8b, 0x3d, 0xde, 0x59, 0x40, 0x3d, 0x12, 0x0c, 0x5c, 0x3d, 0x03, 0xe8, 0x39, 0x3d, + 0xd0, 0xa0, 0x6e, 0x3d, 0x2a, 0xce, 0x4f, 0x3d, 0xa9, 0xdd, 0x3b, 0x3d, 0xcd, 0x0c, 0x8a, 0x3d, 0x6e, 0x9c, 0x7a, 0x3d, 0xe3, 0x67, 0x39, 0x3d, 0x99, 0xac, 0xdc, 0x3c, 0xbf, 0x52, 0x1d, 0x3d, + 0x12, 0xac, 0x30, 0x3d, 0x21, 0xa2, 0x57, 0x3d, 0x17, 0xc9, 0x5f, 0x3d, 0x63, 0xb7, 0x75, 0x3d, 0xae, 0x47, 0x88, 0x3d, 0x26, 0x92, 0x80, 0x3d, 0xa0, 0x68, 0xfb, 0x3c, 0x63, 0x76, 0x86, 0x3d, + 0xea, 0xc1, 0x72, 0x3d, 0x05, 0x45, 0x19, 0x3d, 0x40, 0x9e, 0x83, 0x3d, 0x24, 0xa5, 0x35, 0x3d, 0xf4, 0x24, 0x51, 0x3d, 0xfc, 0x3a, 0x5c, 0x3d, 0x87, 0xed, 0x11, 0x3d, 0x5a, 0x26, 0x4c, 0x3d, + 0x1d, 0x88, 0x8b, 0x3d, 0xce, 0xec, 0x4d, 0x3d, 0xe3, 0x7c, 0x70, 0x3d, 0x63, 0xd4, 0x71, 0x3d, 0x9a, 0x24, 0x35, 0x3d, 0x5c, 0x3f, 0x52, 0x3d, 0x34, 0x32, 0x8a, 0x3d, 0xca, 0xcd, 0x02, 0x3d, + 0x3c, 0xbc, 0x74, 0x3d, 0x62, 0x68, 0x4b, 0x3d, 0x68, 0xa9, 0x68, 0x3d, 0x8a, 0xed, 0x59, 0x3d, 0xa2, 0x50, 0x99, 
0x3c, 0x24, 0xf2, 0xa1, 0x3c, 0xba, 0x04, 0x0f, 0x3c, 0xab, 0x9e, 0xca, 0x3c, + 0xb2, 0x50, 0x95, 0x3c, 0xe8, 0x6c, 0x3b, 0x3c, 0x34, 0x70, 0x16, 0x3b, 0xa0, 0x48, 0xe0, 0x3b, 0xa6, 0xa9, 0xd0, 0x3b, 0xa1, 0xbb, 0x0b, 0x3c, 0x00, 0x62, 0xf4, 0x3c, 0x09, 0xc0, 0x9c, 0x3c, + 0xc4, 0x7c, 0x85, 0x3c, 0x0e, 0x60, 0x8b, 0x3c, 0xf2, 0xa0, 0xe1, 0x3b, 0x42, 0x88, 0xd5, 0x3b, 0x98, 0x72, 0x8b, 0x3c, 0x4e, 0x85, 0xea, 0x3b, 0xf0, 0xdd, 0x8f, 0x3b, 0x1e, 0x2e, 0x82, 0x3c, + 0xbe, 0x47, 0xfc, 0x3c, 0x46, 0x4b, 0x45, 0x3c, 0xe0, 0x1a, 0x95, 0x3c, 0xe8, 0x5d, 0x99, 0xbb, 0xf8, 0x34, 0x27, 0x3d, 0x4c, 0x00, 0xa4, 0x3c, 0xc8, 0x16, 0x51, 0x3c, 0x00, 0xa9, 0xfe, 0xb9, + 0x9c, 0x0f, 0xc3, 0x3b, 0xf6, 0x04, 0x10, 0x3d, 0xae, 0xbe, 0xd8, 0x3b, 0x4f, 0xff, 0xac, 0x3c, 0xa4, 0x5a, 0x5e, 0x3b, 0x96, 0xe7, 0xb9, 0x3b, 0x53, 0x9f, 0xc2, 0x3b, 0x13, 0xc8, 0x22, 0x3c, + 0x50, 0xe0, 0xb0, 0x3c, 0x1c, 0x69, 0xe8, 0x3c, 0x28, 0xc3, 0x20, 0x3c, 0x34, 0x92, 0xd1, 0x3b, 0xf0, 0xcb, 0x1c, 0x3c, 0x8c, 0x7c, 0x8e, 0x3c, 0xf2, 0xa3, 0xb0, 0x3c, 0x78, 0x48, 0xe0, 0x3c, + 0xb6, 0x38, 0x81, 0x3c, 0x3e, 0x76, 0x8a, 0x3c, 0x07, 0xb3, 0x10, 0x3c, 0x1f, 0x8b, 0x4d, 0x3c, 0xff, 0x3b, 0xba, 0x3c, 0x35, 0xa2, 0xd5, 0x3c, 0x99, 0xdf, 0x0f, 0x3c, 0x18, 0x57, 0x9d, 0x3c, + 0x6c, 0x2f, 0x62, 0x3c, 0x28, 0x90, 0xb7, 0x3c, 0xdc, 0x9f, 0x08, 0x3d, 0xe5, 0xaf, 0x8c, 0x3c, 0x59, 0x24, 0x28, 0x3c, 0xf2, 0xa0, 0xb5, 0x3c, 0xee, 0xc8, 0xaf, 0x3c, 0x9c, 0xeb, 0xd8, 0x3c, + 0xfc, 0x4a, 0xed, 0x3c, 0x45, 0xd4, 0x29, 0x3c, 0x58, 0xbb, 0xdf, 0x3c, 0x5e, 0x2b, 0x7d, 0x3c, 0x60, 0x8f, 0xb0, 0xbb, 0xd0, 0xa8, 0x96, 0xbb, 0x04, 0xc3, 0xaa, 0xbb, 0x54, 0xff, 0xd6, 0xbb, + 0x25, 0x34, 0xc4, 0xbb, 0xdc, 0x5e, 0xb7, 0xbb, 0x8d, 0xab, 0x9e, 0xbb, 0x20, 0x93, 0xd4, 0xbb, 0x5f, 0xee, 0xd6, 0xbb, 0x39, 0x03, 0xaf, 0xbb, 0x45, 0xff, 0xf5, 0xbb, 0xe8, 0xe3, 0x95, 0xbb, + 0x5b, 0x8c, 0x9d, 0xbb, 0xbb, 0xfd, 0x86, 0xbb, 0x5f, 0x6b, 0x9b, 0xbb, 0xd5, 0x79, 0xd3, 0xbb, 0x22, 0xbe, 0x9a, 0xbb, 0xc5, 0xef, 0x86, 0xbb, 0x1d, 0x75, 0x8b, 0xbb, 0x12, 
0x91, 0xa7, 0xbb, + 0xe8, 0x02, 0xc4, 0xbb, 0xa4, 0xb5, 0x03, 0xbc, 0xbb, 0x40, 0xec, 0xbb, 0xec, 0x0c, 0x80, 0xbb, 0xcc, 0x17, 0x0b, 0xbc, 0x3a, 0x6c, 0xbf, 0xbb, 0xdc, 0xb4, 0xca, 0xbb, 0xa4, 0xa0, 0xbd, 0xbb, + 0xe8, 0xbd, 0xdf, 0xbb, 0xd6, 0xe1, 0xbf, 0xbb, 0x4a, 0xb2, 0xae, 0xbb, 0x99, 0xdc, 0x09, 0xbc, 0x3c, 0x2c, 0xd7, 0xbb, 0xe9, 0xa4, 0x98, 0xbb, 0xec, 0x30, 0x47, 0xbb, 0x6c, 0x84, 0x8b, 0xbb, + 0x0d, 0xc6, 0x9f, 0xbb, 0xa5, 0x29, 0xca, 0xbb, 0x15, 0x1d, 0xbc, 0xbb, 0xed, 0x52, 0xe4, 0xbb, 0x73, 0x93, 0xf6, 0xbb, 0x64, 0x40, 0x01, 0xbc, 0xa4, 0x73, 0x75, 0xbb, 0x22, 0xfe, 0xf5, 0xbb, + 0x1f, 0x74, 0xf8, 0xbb, 0xfc, 0x6c, 0x81, 0xbb, 0xb4, 0x38, 0xfe, 0xbb, 0x8a, 0x9a, 0xa9, 0xbb, 0xfe, 0x33, 0xee, 0xbb, 0x16, 0xd1, 0xd1, 0xbb, 0x6e, 0x6b, 0x80, 0xbb, 0x8a, 0xba, 0xad, 0xbb, + 0xe6, 0x75, 0xee, 0xbb, 0x88, 0x0b, 0xbd, 0xbb, 0x82, 0x2c, 0x0c, 0xbc, 0xed, 0xcd, 0xdd, 0xbb, 0xe6, 0x3b, 0xba, 0xbb, 0x39, 0x30, 0xe4, 0xbb, 0xc8, 0xff, 0x08, 0xbc, 0x71, 0x17, 0x90, 0xbb, + 0xd9, 0xbe, 0xee, 0xbb, 0xcd, 0xd7, 0xc8, 0xbb, 0xb0, 0x00, 0x00, 0xbc, 0x85, 0x97, 0xc3, 0xbb, 0xfd, 0x8d, 0x39, 0xbb, 0xfe, 0xd4, 0x24, 0xbb, 0x5a, 0xc2, 0xd1, 0xba, 0x2f, 0x8f, 0x1a, 0xbb, + 0xbe, 0x34, 0x34, 0xbb, 0xcc, 0x89, 0xcc, 0xba, 0x74, 0x63, 0x8e, 0xba, 0x69, 0x31, 0x1e, 0xbb, 0xa4, 0x4d, 0x04, 0xbb, 0xd4, 0x55, 0xcf, 0xba, 0xb4, 0x91, 0x9e, 0xbb, 0x6e, 0xfe, 0xe3, 0xba, + 0x96, 0xc7, 0xc0, 0xba, 0x11, 0x07, 0x0f, 0xbb, 0x04, 0x78, 0x83, 0xba, 0x4a, 0xb5, 0x10, 0xbb, 0x1f, 0xac, 0xd3, 0xba, 0x9b, 0x3f, 0xc3, 0xba, 0x80, 0x8d, 0xf9, 0x38, 0x7c, 0x94, 0xd4, 0xba, + 0xc3, 0x7e, 0x73, 0xbb, 0xc6, 0x80, 0x14, 0xbb, 0xe4, 0xdd, 0x74, 0xbb, 0x40, 0x91, 0xfa, 0xb7, 0xa7, 0x69, 0xbd, 0xbb, 0xe6, 0x48, 0x4e, 0xbb, 0x1e, 0xf9, 0x08, 0xbb, 0x4e, 0x3e, 0x9d, 0xba, + 0xab, 0x61, 0xd1, 0xba, 0x79, 0xda, 0x8c, 0xbb, 0x1d, 0x50, 0xb8, 0xba, 0xad, 0x15, 0x78, 0xbb, 0xc4, 0x69, 0x4e, 0xba, 0xb0, 0x47, 0x37, 0xba, 0x0e, 0x4e, 0x76, 0xba, 0x28, 0xdc, 0xb8, 0xba, + 0x71, 0x5f, 0x32, 0xbb, 0x92, 0x3d, 
0x75, 0xbb, 0x60, 0xf3, 0xa4, 0xba, 0xf0, 0xd4, 0xd2, 0xba, 0x99, 0x8c, 0xef, 0xba, 0x87, 0x91, 0x5b, 0xbb, 0x19, 0xa3, 0x39, 0xbb, 0xce, 0x21, 0x71, 0xbb, + 0x58, 0xdf, 0x53, 0xbb, 0x78, 0xc7, 0xf8, 0xba, 0x07, 0x25, 0x11, 0xbb, 0xe7, 0xca, 0x02, 0xbb, 0xa0, 0x61, 0x91, 0xbb, 0xf7, 0x2b, 0x6e, 0xbb, 0xe6, 0xce, 0xa1, 0xba, 0x98, 0xb0, 0x12, 0xbb, + 0x96, 0xc9, 0xf1, 0xba, 0xe6, 0x75, 0x43, 0xbb, 0xcc, 0x41, 0xc5, 0xbb, 0x26, 0x3b, 0x29, 0xbb, 0x60, 0xe5, 0x15, 0xbb, 0x51, 0xa4, 0x84, 0xbb, 0x9d, 0x65, 0x76, 0xbb, 0xe7, 0x6e, 0x7c, 0xbb, + 0xda, 0xdf, 0x89, 0xbb, 0x4c, 0x2a, 0x10, 0xbb, 0x62, 0x5e, 0x9f, 0xbb, 0x52, 0xc4, 0x0f, 0xbb, 0x1e, 0x44, 0x25, 0xbb, 0x8b, 0xd5, 0x24, 0xbb, 0xcf, 0xf8, 0x26, 0xbb, 0x5f, 0xbe, 0x94, 0xbb, + 0x8a, 0xd6, 0x3f, 0xbb, 0x84, 0xdf, 0x4f, 0xbb, 0x86, 0x0c, 0x09, 0xbb, 0x79, 0xd2, 0x0f, 0xbb, 0x08, 0xfd, 0x2b, 0xbb, 0x90, 0x01, 0x2c, 0xbb, 0x40, 0x19, 0x43, 0xbb, 0xce, 0xdd, 0x57, 0xbb, + 0xf2, 0xfb, 0x5d, 0xbb, 0x44, 0x35, 0x13, 0xbb, 0xf1, 0xba, 0x30, 0xbb, 0x68, 0xee, 0x1a, 0xbb, 0x71, 0xcf, 0x55, 0xbb, 0x86, 0xf2, 0xea, 0xba, 0xd9, 0x0f, 0x5b, 0xbb, 0xef, 0x0f, 0x5c, 0xbb, + 0xc4, 0x5c, 0x5f, 0xbb, 0x6e, 0x6d, 0x82, 0xbb, 0x28, 0xf2, 0x28, 0xbb, 0xce, 0x09, 0xef, 0xba, 0xb6, 0xf7, 0x7f, 0xbb, 0x91, 0x35, 0x2b, 0xbb, 0x36, 0xd5, 0x4b, 0xbb, 0x1e, 0x9b, 0x10, 0xbb, + 0x59, 0xaa, 0x52, 0xbb, 0xa9, 0xf8, 0x52, 0xbb, 0x78, 0xe1, 0x28, 0xbb, 0x6a, 0x8e, 0x6e, 0xbb, 0xe0, 0xb4, 0x6e, 0xbb, 0xe1, 0xbb, 0x39, 0xbb, 0xea, 0xc0, 0xcf, 0xba, 0xb9, 0xcd, 0x17, 0xbb, + 0x82, 0xec, 0x2f, 0xbb, 0xe8, 0x37, 0x51, 0xbb, 0x08, 0xbd, 0x5e, 0xbb, 0x6e, 0x34, 0x5b, 0xbb, 0x32, 0xdf, 0x7b, 0xbb, 0xcd, 0x99, 0x5a, 0xbb, 0x78, 0x69, 0xf2, 0xba, 0x0e, 0x40, 0x82, 0xbb, + 0x9a, 0x6e, 0x49, 0xbb, 0x46, 0x4e, 0x20, 0xbb, 0xbd, 0xd6, 0x62, 0xbb, 0x78, 0xa2, 0x27, 0xbb, 0x53, 0xe1, 0x1c, 0xbb, 0x1a, 0xf6, 0x4f, 0xbb, 0xb4, 0x71, 0x0d, 0xbb, 0x9a, 0x79, 0x51, 0xbb, + 0xd8, 0x94, 0x89, 0xbb, 0xef, 0xf9, 0x47, 0xbb, 0xf3, 0xda, 0x33, 0xbb, 0xc2, 0x8e, 0x63, 0xbb, 
0x00, 0x3b, 0x14, 0xbb, 0x84, 0x6b, 0x28, 0xbb, 0x12, 0x16, 0x71, 0xbb, 0x49, 0x18, 0xe2, 0xba, + 0xe1, 0x81, 0x61, 0xbb, 0x4c, 0xc8, 0x2d, 0xbb, 0xb1, 0x28, 0x39, 0xbb, 0x3a, 0x56, 0x51, 0xbb, 0x7a, 0xf7, 0xa2, 0xba, 0x74, 0x2e, 0xc1, 0xba, 0x88, 0x99, 0x42, 0xba, 0x03, 0xc5, 0x1f, 0xbb, + 0x89, 0x79, 0xab, 0xba, 0x37, 0x46, 0x97, 0xba, 0x60, 0x9d, 0x5f, 0xb9, 0xd4, 0x80, 0x25, 0xb9, 0xd6, 0xab, 0xb9, 0xb9, 0x32, 0x4f, 0x43, 0xba, 0x5e, 0x57, 0xd7, 0xba, 0x03, 0xf3, 0xf5, 0xba, + 0xf8, 0x42, 0xe1, 0xba, 0x24, 0x76, 0xa7, 0xba, 0x4d, 0xb6, 0x58, 0xba, 0x74, 0xf3, 0x7c, 0xb9, 0xaf, 0xf8, 0xdf, 0xba, 0x28, 0xcb, 0xfa, 0xb9, 0x0d, 0x69, 0x97, 0xba, 0xf5, 0x47, 0xd3, 0xba, + 0x3e, 0x19, 0x15, 0xbb, 0xf0, 0x5f, 0x8f, 0xba, 0x81, 0x81, 0x5c, 0xba, 0x84, 0xae, 0xf1, 0x38, 0xc6, 0x31, 0x27, 0xbb, 0x95, 0xdb, 0xa5, 0xba, 0xd2, 0x39, 0x8b, 0xba, 0x70, 0xbe, 0xb6, 0x38, + 0x6c, 0x13, 0x1d, 0xba, 0x4d, 0xf1, 0x1f, 0xbb, 0xd6, 0x0c, 0x21, 0xba, 0x1d, 0xc0, 0xad, 0xba, 0x72, 0x57, 0x45, 0xba, 0xb5, 0xff, 0x5f, 0xba, 0x02, 0x47, 0x0c, 0xba, 0x60, 0x01, 0x6a, 0xba, + 0x5c, 0xac, 0xd2, 0xba, 0x0c, 0xad, 0x01, 0xbb, 0x98, 0xb9, 0x97, 0xba, 0xf6, 0x5a, 0x2e, 0xba, 0xc8, 0x5d, 0x82, 0xba, 0x90, 0xc3, 0x8b, 0xba, 0xec, 0x98, 0xb6, 0xba, 0xe2, 0x12, 0x07, 0xbb, + 0x9a, 0xa8, 0x6b, 0xba, 0x50, 0x26, 0xba, 0xba, 0x2f, 0xc2, 0x34, 0xba, 0x8c, 0xb5, 0x7d, 0xba, 0xec, 0x1d, 0x80, 0xba, 0x01, 0x76, 0xea, 0xba, 0x02, 0xe8, 0x55, 0xba, 0xb9, 0x2e, 0xda, 0xba, + 0x61, 0x4c, 0xc2, 0xba, 0xb4, 0x52, 0xd9, 0xba, 0x73, 0x87, 0xc1, 0xba, 0xd0, 0xbc, 0xb4, 0xba, 0x28, 0xc7, 0x13, 0xba, 0x38, 0xbf, 0x91, 0xba, 0xa0, 0x92, 0xb4, 0xba, 0x4f, 0x58, 0xbe, 0xba, + 0xc6, 0x45, 0xf9, 0xba, 0x94, 0x16, 0x34, 0xba, 0x6a, 0x0b, 0xb1, 0xba, 0xba, 0x41, 0xab, 0xba, 0xcd, 0xbc, 0x9b, 0xbd, 0x2c, 0x94, 0x7b, 0xbd, 0xb4, 0xf1, 0x7d, 0xbd, 0xdc, 0x78, 0x85, 0xbd, + 0x44, 0xaf, 0xa2, 0xbd, 0xfb, 0x70, 0x75, 0xbd, 0x3c, 0xc4, 0x6a, 0xbd, 0xe6, 0xd8, 0xbd, 0xbd, 0x6c, 0x36, 0xae, 0xbd, 0x81, 0xba, 0x80, 0xbd, 0xce, 0xa7, 
0xf6, 0xbd, 0x86, 0x44, 0x37, 0xbd, + 0x58, 0x22, 0x37, 0xbd, 0xc5, 0x27, 0x5f, 0xbd, 0xdd, 0xd6, 0x43, 0xbd, 0xce, 0xdb, 0xb4, 0xbd, 0x52, 0x51, 0x3d, 0xbd, 0x82, 0xd0, 0x5b, 0xbd, 0x6d, 0x99, 0xb9, 0xbc, 0xd2, 0xed, 0x50, 0xbd, + 0x96, 0x7b, 0xa8, 0xbd, 0xb8, 0xe6, 0xbe, 0xbd, 0x47, 0x00, 0xe3, 0xbd, 0x71, 0x8e, 0x17, 0xbd, 0x93, 0xad, 0x07, 0xbe, 0xe0, 0x45, 0xad, 0xbd, 0xaa, 0x1f, 0x98, 0xbd, 0x32, 0xa9, 0x91, 0xbd, + 0xe5, 0x33, 0xa0, 0xbd, 0x5c, 0x01, 0xb4, 0xbd, 0xb9, 0xbb, 0x7c, 0xbd, 0x50, 0xce, 0xee, 0xbd, 0x28, 0x72, 0x7d, 0xbd, 0x61, 0xae, 0x2c, 0xbd, 0x80, 0x78, 0x0f, 0xbd, 0xea, 0xbb, 0x47, 0xbd, + 0xd3, 0xfb, 0x85, 0xbd, 0x64, 0x0b, 0xb4, 0xbd, 0x34, 0x0d, 0x68, 0xbd, 0x06, 0x80, 0xa1, 0xbd, 0x4f, 0x66, 0xaa, 0xbd, 0xfd, 0x1f, 0xde, 0xbd, 0x9b, 0xa0, 0x73, 0xbd, 0xb6, 0xbf, 0xc8, 0xbd, + 0x34, 0xfa, 0xd8, 0xbd, 0xbd, 0x69, 0x40, 0xbd, 0x0c, 0x17, 0xc2, 0xbd, 0x04, 0xed, 0x84, 0xbd, 0x29, 0x43, 0xf6, 0xbd, 0x18, 0xb4, 0xb9, 0xbd, 0x57, 0x36, 0x34, 0xbd, 0xe6, 0x25, 0x7b, 0xbd, + 0x3e, 0x5f, 0x9a, 0xbd, 0x3e, 0x36, 0x9d, 0xbd, 0x2e, 0x2b, 0x19, 0xbe, 0x28, 0x34, 0xaa, 0xbd, 0x8b, 0x2a, 0xa1, 0xbd, 0x51, 0x61, 0xe1, 0xbd, 0xe4, 0xc3, 0xeb, 0xbd, 0x9d, 0xb0, 0xa3, 0xbd, + 0x21, 0xd0, 0xd8, 0xbd, 0x19, 0xab, 0xa3, 0xbd, 0x8f, 0x1b, 0x02, 0xbe, 0xee, 0x4c, 0x91, 0xbd, 0xa5, 0xe2, 0xa3, 0xbc, 0x3a, 0x3a, 0xf8, 0xbc, 0x9a, 0x4e, 0xee, 0xbc, 0xe9, 0x02, 0x80, 0xbd, + 0x98, 0x13, 0x77, 0xbb, 0x6b, 0xbf, 0x87, 0xbc, 0x22, 0xee, 0x3d, 0xbc, 0x07, 0x8b, 0xa1, 0xbc, 0x0c, 0x72, 0xe5, 0xbc, 0x66, 0x93, 0x5f, 0xbc, 0x57, 0x8b, 0x3c, 0xbd, 0x04, 0xec, 0xa2, 0xbd, + 0xec, 0xe3, 0x8a, 0xbd, 0x85, 0x7b, 0x15, 0xbd, 0x98, 0x17, 0xbc, 0xbc, 0x98, 0xf1, 0x88, 0x3c, 0xe2, 0xdf, 0x47, 0xbd, 0x40, 0x0d, 0x99, 0xb9, 0x2c, 0x9f, 0x4e, 0xbd, 0x82, 0x51, 0x88, 0xbd, + 0x14, 0x13, 0xea, 0xbc, 0xf3, 0xa6, 0x48, 0xbd, 0xb4, 0xb8, 0x9a, 0xbc, 0x9a, 0x6f, 0x9f, 0xbb, 0x52, 0x17, 0x6c, 0xbd, 0x59, 0xbe, 0x8d, 0xbc, 0x82, 0xe2, 0x3d, 0xbd, 0x64, 0xf3, 0xe2, 0xba, + 0x4f, 0xc8, 0xb1, 
0xbc, 0x3f, 0x67, 0x0d, 0xbd, 0x11, 0x93, 0x41, 0xbd, 0x89, 0x60, 0x39, 0xbd, 0xba, 0xc0, 0xf4, 0xbc, 0xe5, 0x86, 0x23, 0xbd, 0x3c, 0xcc, 0xa5, 0xbc, 0x1e, 0xf9, 0x64, 0xbc, + 0x9b, 0xfc, 0x93, 0xbc, 0xca, 0x0d, 0x0f, 0xbd, 0x8a, 0x6f, 0x17, 0xbd, 0xb4, 0x33, 0x0c, 0xbd, 0xf9, 0x7d, 0x36, 0xbd, 0x05, 0xcb, 0x12, 0xbd, 0xda, 0x1b, 0x2c, 0xbd, 0x8b, 0xef, 0x19, 0xbd, + 0x55, 0x6b, 0x2b, 0xbd, 0xd4, 0xd5, 0x98, 0xbd, 0x16, 0x3c, 0xbf, 0xbc, 0x6e, 0x4c, 0x13, 0xbc, 0xdc, 0x36, 0x2c, 0xbc, 0xf8, 0xdb, 0xd9, 0xbc, 0x10, 0x51, 0x16, 0xbd, 0x2a, 0x79, 0x41, 0xbd, + 0x7f, 0x0b, 0x10, 0xbd, 0x0f, 0x3b, 0xb1, 0xbc, 0x37, 0xf8, 0x1f, 0xbb, 0xd0, 0x41, 0x88, 0xbd, 0x16, 0x4c, 0x28, 0xbd, 0x25, 0x8e, 0x7c, 0xbc, 0x61, 0x84, 0x90, 0xbc, 0xd6, 0x9d, 0xba, 0xbc, + 0xc5, 0x56, 0x60, 0xbd, 0x82, 0x15, 0x2f, 0xbd, 0x60, 0x62, 0xbe, 0xbc, 0x76, 0x32, 0x6c, 0xbd, 0x18, 0xb8, 0x95, 0x3c, 0x23, 0x24, 0xf0, 0x3b, 0x2b, 0x53, 0xcc, 0x3c, 0xbc, 0x2e, 0x1d, 0x3d, + 0x7c, 0x23, 0x6f, 0x3c, 0x3a, 0x6b, 0x85, 0x3c, 0xa4, 0x33, 0xa8, 0x3c, 0x82, 0x0c, 0xaa, 0x3c, 0xef, 0xa9, 0x1d, 0x3d, 0x02, 0xa8, 0x3e, 0x3c, 0xb5, 0x98, 0xd0, 0x3c, 0x5e, 0x02, 0x21, 0x3c, + 0xfb, 0x49, 0xf4, 0x3c, 0x76, 0xa5, 0x67, 0x3c, 0x96, 0x09, 0x00, 0x3c, 0xf1, 0x6a, 0x82, 0x3c, 0x25, 0xee, 0xa4, 0x3c, 0x0f, 0x3c, 0xf5, 0x3b, 0x9d, 0x71, 0xcf, 0x3c, 0xe6, 0xd3, 0x4a, 0x3c, + 0x71, 0x4d, 0x34, 0x3c, 0xcb, 0x89, 0x06, 0x3d, 0xf5, 0x20, 0xbd, 0x3c, 0x8e, 0xa9, 0x8c, 0x3b, 0x6e, 0x6a, 0x3e, 0x3d, 0x50, 0xeb, 0xd8, 0x3b, 0xde, 0x99, 0xb7, 0x3c, 0x6c, 0xa8, 0xcf, 0x3c, + 0xd6, 0xc9, 0xaa, 0x3c, 0x63, 0xba, 0x42, 0x3c, 0x35, 0x99, 0xb2, 0x3c, 0x86, 0x62, 0xed, 0x3c, 0xc8, 0x0c, 0xdb, 0x3c, 0x2f, 0x3d, 0x9e, 0x3c, 0xcb, 0x6b, 0xb5, 0xba, 0xce, 0xd6, 0xab, 0x3c, + 0x6f, 0xa9, 0x8e, 0x3c, 0x14, 0x81, 0x83, 0x3c, 0xca, 0xee, 0xca, 0x3c, 0x22, 0x35, 0x1e, 0x3d, 0xc1, 0xe8, 0xa2, 0x3c, 0x3c, 0xe8, 0x12, 0x3d, 0x71, 0x45, 0x82, 0x3c, 0x2b, 0xae, 0x00, 0x3d, + 0x52, 0xf0, 0xce, 0x3c, 0x69, 0x4e, 0xc9, 0x3c, 0xd5, 0x61, 0x05, 0x3d, 0x6c, 
0xa2, 0xd4, 0x3a, 0x4b, 0x6a, 0xc1, 0x3c, 0x46, 0xf5, 0x8b, 0x3c, 0x03, 0x28, 0x0e, 0x3c, 0x00, 0x9f, 0xff, 0x3b, + 0x2c, 0xf4, 0x08, 0x3d, 0x06, 0x9f, 0xe1, 0x3c, 0xaf, 0xc3, 0x89, 0x3c, 0xd6, 0xda, 0xc3, 0x3c, 0xbf, 0xa1, 0xf0, 0x3c, 0x38, 0xce, 0x95, 0x3c, 0xcc, 0xd2, 0xfd, 0x3c, 0x42, 0xeb, 0x7d, 0x3c, + 0x4e, 0x80, 0x0d, 0x3d, 0x0e, 0x1b, 0x05, 0x3d, 0x92, 0xdb, 0x0b, 0x3d, 0x17, 0xa2, 0x1a, 0x3d, 0x07, 0x1d, 0x3f, 0x3c, 0xa0, 0x6e, 0x27, 0x39, 0xbe, 0xca, 0x28, 0xbc, 0x4d, 0x83, 0x1b, 0x3c, + 0xca, 0x9e, 0x99, 0x3b, 0x6d, 0xb9, 0xc2, 0xbb, 0x90, 0x0b, 0x18, 0x3b, 0x5c, 0x90, 0x30, 0x3c, 0xd2, 0xaa, 0x93, 0x3b, 0x68, 0xc3, 0xce, 0x3a, 0x2c, 0x65, 0x2e, 0x3c, 0x26, 0xca, 0x36, 0x3c, + 0xf2, 0x54, 0x4c, 0x3c, 0x69, 0x9d, 0xc5, 0x3b, 0xfd, 0xc2, 0xef, 0xbb, 0x10, 0xd9, 0xa0, 0xbb, 0x1a, 0xd0, 0x38, 0x3c, 0xa6, 0x9f, 0x64, 0xbb, 0x7c, 0x61, 0xa9, 0x3a, 0xe5, 0x6d, 0x0b, 0x3c, + 0x3b, 0x79, 0xc3, 0x3c, 0x90, 0x61, 0x9e, 0x3b, 0x0f, 0xdb, 0x3c, 0x3c, 0xff, 0x71, 0x06, 0xbc, 0x9e, 0xa8, 0xdb, 0x3c, 0x4c, 0x24, 0x0b, 0x3c, 0x52, 0xb4, 0x23, 0x3c, 0xbc, 0x21, 0x0a, 0xbc, + 0x20, 0xa2, 0x2e, 0xb9, 0xf6, 0xef, 0xa3, 0x3c, 0xc8, 0x9c, 0x6a, 0x3c, 0x24, 0x66, 0xb5, 0x3b, 0xa8, 0xe6, 0x09, 0x3a, 0x6f, 0x09, 0x80, 0xbb, 0x19, 0xff, 0x5a, 0x3c, 0xb8, 0x9c, 0x21, 0x3a, + 0x45, 0x03, 0x2d, 0x3c, 0x5e, 0x48, 0x05, 0x3c, 0x5a, 0xb9, 0x2c, 0x3c, 0xda, 0xbe, 0x91, 0x3c, 0xfc, 0x09, 0xd3, 0x3b, 0xd8, 0x9c, 0x8f, 0x3a, 0xdb, 0x6f, 0x97, 0x3c, 0xc0, 0x06, 0x58, 0x3c, + 0x68, 0xc6, 0x81, 0x3c, 0x9f, 0x27, 0x37, 0x3c, 0x12, 0xf1, 0x86, 0x3b, 0x06, 0xef, 0xb7, 0x3a, 0xa2, 0xe6, 0x9f, 0x3c, 0x44, 0xd0, 0xb4, 0x3b, 0x22, 0xbe, 0xb5, 0x3b, 0x84, 0x07, 0x3d, 0x3c, + 0xfc, 0xba, 0x63, 0xbb, 0x40, 0x37, 0x0c, 0xb9, 0xb9, 0x28, 0x35, 0x3c, 0x31, 0x7a, 0x46, 0x3c, 0xe8, 0x2f, 0x83, 0x3b, 0x83, 0x34, 0x88, 0x3c, 0xec, 0x1a, 0x4d, 0xbb, 0x2c, 0x76, 0x18, 0x3c, + 0xe6, 0xc5, 0x88, 0x3c, 0x75, 0xfa, 0xc0, 0x3c, 0xb7, 0x07, 0x7b, 0x3c, 0x09, 0x48, 0x07, 0x3c, 0xd0, 0x2e, 0x2e, 0xbb, 0x08, 0xe4, 0xcb, 
0xb8, 0x06, 0xe2, 0x13, 0xbb, 0xcc, 0xe5, 0x87, 0xbb, + 0xfc, 0x9b, 0x16, 0xbb, 0x92, 0xdc, 0xd0, 0xba, 0xcc, 0x1e, 0x38, 0xbb, 0x13, 0x4b, 0x44, 0xbb, 0xba, 0xfc, 0xa5, 0xbb, 0x68, 0x81, 0xb3, 0xba, 0x4e, 0xa3, 0x38, 0xbb, 0xd0, 0x7a, 0x22, 0x3a, + 0x0d, 0xdd, 0x39, 0xbb, 0xf3, 0x0c, 0x9e, 0xba, 0x90, 0x0e, 0x2c, 0xb8, 0x50, 0xef, 0x30, 0xbb, 0x75, 0x47, 0x00, 0xbb, 0x86, 0x01, 0x76, 0xba, 0x04, 0xf8, 0x0e, 0xbb, 0x88, 0xa3, 0x85, 0xb8, + 0xf2, 0xe6, 0xf7, 0xba, 0x28, 0xb7, 0x67, 0xbb, 0x71, 0x77, 0x5f, 0xbb, 0x50, 0xc7, 0x95, 0xb8, 0x17, 0x79, 0xd1, 0xbb, 0x60, 0x6c, 0x5c, 0xba, 0x4e, 0xca, 0x17, 0xbb, 0xd6, 0xc7, 0x59, 0xbb, + 0xe2, 0xfd, 0x1c, 0xbb, 0x4d, 0xfe, 0xd5, 0xba, 0x3c, 0x00, 0x1e, 0xbb, 0x4a, 0x22, 0x4c, 0xbb, 0xf6, 0x50, 0x47, 0xbb, 0xce, 0xbc, 0xb3, 0xba, 0x06, 0x52, 0x60, 0x39, 0x82, 0xb2, 0x31, 0xbb, + 0x6a, 0x55, 0x26, 0xbb, 0x17, 0x55, 0xd8, 0xba, 0x66, 0x4c, 0x45, 0xbb, 0x68, 0x9c, 0xb5, 0xbb, 0x1c, 0x88, 0xee, 0xba, 0x5e, 0xf0, 0x8a, 0xbb, 0xc4, 0x37, 0xfa, 0xba, 0x33, 0xf8, 0x86, 0xbb, + 0x1a, 0xd0, 0x50, 0xbb, 0x34, 0x59, 0xe6, 0xba, 0xc6, 0x4e, 0x8d, 0xbb, 0x70, 0xf7, 0x67, 0xb7, 0x60, 0x8c, 0x88, 0xbb, 0xdb, 0xde, 0xff, 0xba, 0xb0, 0x64, 0xba, 0xb9, 0x2a, 0xe7, 0x34, 0xb9, + 0x88, 0x7a, 0x70, 0xbb, 0xb5, 0x0b, 0x5f, 0xbb, 0x7e, 0x37, 0x44, 0xbb, 0x5f, 0x61, 0x01, 0xbb, 0x4e, 0x1e, 0x54, 0xbb, 0x1e, 0x93, 0x48, 0xbb, 0x83, 0xcb, 0x7f, 0xbb, 0x4c, 0x14, 0x05, 0xbb, + 0x12, 0x55, 0x89, 0xbb, 0x28, 0xa2, 0x97, 0xbb, 0x35, 0x8c, 0xa7, 0xbb, 0x1f, 0x1a, 0x88, 0xbb, 0x59, 0xda, 0x02, 0xbb, 0xd7, 0x70, 0x22, 0x3a, 0x03, 0x30, 0xa6, 0x3a, 0x44, 0x04, 0xa5, 0xba, + 0x19, 0x5b, 0xab, 0xba, 0x22, 0xff, 0x24, 0x3a, 0xe2, 0x86, 0x81, 0xba, 0x36, 0xe1, 0x02, 0xbb, 0xd5, 0x7d, 0xda, 0xba, 0x92, 0xa0, 0xbe, 0xb9, 0x9e, 0x09, 0xaf, 0xba, 0x1c, 0x43, 0xff, 0x39, + 0x7e, 0x54, 0x90, 0xba, 0x2b, 0x1e, 0xcf, 0xb9, 0x36, 0xf5, 0xc0, 0x3a, 0x66, 0xb9, 0x21, 0xba, 0x2a, 0x85, 0x8c, 0xba, 0x5a, 0x86, 0x26, 0x39, 0x72, 0x46, 0x68, 0x39, 0x22, 0x77, 0xa9, 0x39, + 
0x7b, 0x35, 0x50, 0xbb, 0x96, 0x85, 0x40, 0xba, 0xb3, 0xb4, 0x13, 0xbb, 0xc3, 0x75, 0x9a, 0x3a, 0xfe, 0x00, 0x91, 0xbb, 0xe6, 0xc1, 0x86, 0xba, 0x46, 0xc8, 0x8c, 0xba, 0x60, 0x4c, 0x66, 0x38, + 0xfc, 0x27, 0x9a, 0xb9, 0x5e, 0x41, 0x20, 0xbb, 0x2a, 0xef, 0xd9, 0xba, 0x01, 0x06, 0x4a, 0xba, 0xd0, 0x30, 0xdc, 0xb9, 0xbc, 0x51, 0x79, 0x3a, 0x9a, 0x4c, 0xa5, 0xba, 0xba, 0x10, 0x31, 0xba, + 0x8d, 0xd1, 0xf2, 0xba, 0x87, 0x1a, 0x61, 0xba, 0x6a, 0x15, 0xd0, 0xba, 0xaf, 0xaf, 0x62, 0xbb, 0x90, 0x2b, 0xf9, 0xb9, 0x38, 0x07, 0x48, 0xba, 0x9e, 0x0a, 0x0e, 0xbb, 0x08, 0x76, 0x16, 0xbb, + 0xf9, 0x14, 0x13, 0xbb, 0xef, 0x20, 0xdb, 0xb9, 0x75, 0x62, 0xc0, 0xba, 0x40, 0x67, 0x07, 0x37, 0x06, 0x49, 0x76, 0xbb, 0xa8, 0x86, 0x50, 0xba, 0x80, 0x40, 0x32, 0xb8, 0x02, 0x2d, 0x0f, 0xba, + 0xc0, 0xd1, 0xb5, 0x37, 0x94, 0xb0, 0x26, 0xba, 0x09, 0x78, 0x1e, 0xbb, 0x86, 0x59, 0x50, 0xba, 0xc4, 0x66, 0x37, 0xba, 0xc0, 0xb1, 0x3d, 0xbb, 0x1c, 0xe7, 0x00, 0xba, 0x21, 0xfe, 0xb8, 0xba, + 0x61, 0xae, 0x1d, 0xbb, 0x31, 0xae, 0x74, 0xbb, 0x30, 0xbc, 0x53, 0xbb, 0xa0, 0xce, 0x9d, 0xba, 0x84, 0xd8, 0x64, 0xba, 0x38, 0xe8, 0x4b, 0xba, 0xcd, 0x05, 0xcb, 0xba, 0x7f, 0xce, 0x1a, 0xbb, + 0xc9, 0x97, 0x10, 0xba, 0x4e, 0xf9, 0x7b, 0xba, 0x4b, 0xe3, 0x74, 0xba, 0xf4, 0xe6, 0x7e, 0xba, 0xaa, 0xff, 0xf2, 0xba, 0x4c, 0xd8, 0x28, 0xba, 0x7c, 0x46, 0xd0, 0xba, 0xa5, 0xce, 0xcb, 0xba, + 0x1b, 0x25, 0x09, 0xbb, 0x50, 0x56, 0x8b, 0xba, 0x4d, 0x1d, 0x49, 0xba, 0x4d, 0x19, 0xc7, 0xb9, 0x9e, 0x13, 0xbb, 0xba, 0xd3, 0x8d, 0xac, 0xb9, 0x1c, 0x8c, 0xe8, 0xba, 0xca, 0x4f, 0xc0, 0xba, + 0xf3, 0xd4, 0x1a, 0xba, 0x4a, 0x45, 0x02, 0xbb, 0xca, 0xd9, 0x87, 0xba, 0x18, 0xb1, 0xb4, 0xb9, 0xc0, 0x3d, 0x1b, 0xbb, 0x1c, 0xb0, 0xe1, 0xb9, 0xe5, 0x0f, 0xc3, 0xba, 0xbf, 0x30, 0x8e, 0xba, + 0x92, 0x27, 0x96, 0xba, 0x94, 0x17, 0x4a, 0xba, 0x05, 0xf0, 0xba, 0xba, 0x8f, 0x3a, 0xe8, 0xba, 0xe3, 0xd5, 0xc3, 0xba, 0x42, 0x8f, 0xbc, 0xba, 0x30, 0x28, 0xf8, 0xb8, 0x85, 0x9f, 0x84, 0xba, + 0x98, 0x84, 0x57, 0xba, 0xa2, 0xde, 0x8d, 0xba, 0xc3, 0x40, 
0xb9, 0xba, 0x6e, 0x79, 0xeb, 0xba, 0xcf, 0x85, 0xb8, 0xba, 0x76, 0xc8, 0xfc, 0xba, 0xaf, 0xaa, 0x8b, 0xba, 0xb0, 0xe9, 0xd6, 0xba, + 0xc0, 0xa1, 0xbd, 0xba, 0x0d, 0xbf, 0x04, 0xbb, 0xb3, 0x4e, 0xcc, 0xba, 0xf4, 0x83, 0x4a, 0xb9, 0xcc, 0x0f, 0x56, 0xba, 0xac, 0x07, 0x85, 0xba, 0x10, 0x75, 0x63, 0xba, 0x81, 0x40, 0x75, 0xba, + 0x9d, 0xdc, 0xf6, 0xba, 0x73, 0xda, 0xb6, 0xba, 0x06, 0xc0, 0x0b, 0xba, 0x8d, 0x01, 0xf3, 0xba, 0x08, 0x94, 0xe3, 0xba, 0xf7, 0xa1, 0x40, 0xba, 0xa8, 0xf5, 0xc3, 0xba, 0x4d, 0x63, 0x5d, 0xba, + 0x10, 0x0c, 0x03, 0xbb, 0x61, 0x82, 0xd6, 0xba, 0xd1, 0x7e, 0xc1, 0xba, 0x72, 0x00, 0x15, 0xbb, 0x96, 0x2f, 0x0e, 0xba, 0x18, 0xe3, 0xdb, 0xb9, 0xd2, 0xa1, 0x5f, 0x39, 0x12, 0xa7, 0x70, 0xba, + 0xa0, 0xae, 0x6b, 0xb8, 0x51, 0x8b, 0x1d, 0x39, 0x98, 0x92, 0xc5, 0xb7, 0x75, 0x26, 0xf8, 0xb9, 0x04, 0x29, 0x22, 0xb9, 0x1d, 0xfc, 0x13, 0xb9, 0xfd, 0x4f, 0x5e, 0xba, 0xf5, 0x84, 0xd4, 0xba, + 0x68, 0x97, 0xa0, 0xba, 0x38, 0x67, 0x2c, 0xba, 0xa8, 0xe8, 0x31, 0xb7, 0x48, 0x75, 0x2d, 0x3a, 0xbe, 0x0e, 0x81, 0xba, 0x92, 0x2a, 0x66, 0x39, 0x61, 0x1f, 0x16, 0xba, 0xca, 0xf3, 0xa6, 0xba, + 0x7c, 0xac, 0xa1, 0xba, 0xa6, 0xe9, 0x19, 0xba, 0x31, 0x5c, 0xf0, 0xb9, 0x58, 0xf3, 0x92, 0x39, 0x4e, 0x8b, 0xb5, 0xba, 0xc4, 0x63, 0x09, 0xba, 0x74, 0x53, 0x63, 0xba, 0x8f, 0x60, 0x1e, 0x3a, + 0x02, 0xeb, 0xc6, 0xb8, 0x01, 0x4e, 0x9a, 0xba, 0x5d, 0xe7, 0x89, 0xba, 0x8b, 0x33, 0x1d, 0xba, 0x3b, 0x58, 0x40, 0xb9, 0xb3, 0x71, 0x91, 0xb9, 0xe1, 0x5b, 0x60, 0xba, 0x60, 0xce, 0xce, 0x36, + 0xa0, 0x56, 0xfb, 0xb9, 0x6f, 0xf5, 0x33, 0xba, 0x24, 0xfe, 0x37, 0xba, 0x9a, 0xe0, 0x45, 0xba, 0x66, 0xce, 0x40, 0xba, 0x44, 0x4c, 0x47, 0xb9, 0x6a, 0x99, 0x9c, 0xba, 0xa1, 0xde, 0x3e, 0xba, + 0x40, 0xcd, 0x7f, 0xba, 0x9f, 0xb5, 0xb1, 0xba, 0x14, 0x13, 0x0f, 0xb9, 0xfe, 0x08, 0x3f, 0xb9, 0x56, 0x70, 0x20, 0xba, 0xa4, 0xe7, 0xe4, 0xb9, 0xe8, 0x6d, 0x3a, 0xba, 0x0f, 0x1d, 0x93, 0xba, + 0x1c, 0xce, 0x1e, 0xb8, 0x90, 0x71, 0x3d, 0xb7, 0x38, 0x82, 0x80, 0xb9, 0x8f, 0xb6, 0xa5, 0xba, 0xab, 0x3d, 0xf5, 0xb9, 
0x3b, 0xdf, 0x2a, 0xba, 0xa2, 0xe0, 0x5c, 0x39, 0xd5, 0x38, 0x0c, 0xba, + 0x5e, 0x1c, 0x91, 0xba, 0x42, 0xec, 0x9b, 0xba, 0x2c, 0x45, 0x0c, 0xba, 0xea, 0x67, 0x51, 0xba, 0x1c, 0xfe, 0x41, 0xbd, 0xe9, 0x68, 0x5c, 0x3c, 0x78, 0x43, 0x7c, 0xbb, 0x31, 0x1d, 0x39, 0xbd, + 0x65, 0x47, 0x22, 0xbd, 0x27, 0xa6, 0xdb, 0xbb, 0x30, 0x20, 0x23, 0xbd, 0xc2, 0xd9, 0x51, 0xbd, 0xb0, 0xc6, 0x8c, 0xbd, 0xca, 0xc6, 0x83, 0xbc, 0x5e, 0x39, 0x10, 0xbd, 0xfe, 0x74, 0x04, 0x3d, + 0x56, 0xf6, 0xdf, 0xbc, 0x4b, 0x97, 0x11, 0xbc, 0x7e, 0xea, 0xb2, 0x3c, 0x0e, 0xf4, 0x29, 0xbd, 0x2d, 0x42, 0xb3, 0xbc, 0x5a, 0x81, 0x0e, 0xbc, 0xfd, 0xa5, 0x1a, 0xbc, 0x03, 0x3c, 0xa0, 0x3c, + 0x68, 0x4a, 0x4c, 0xbd, 0x86, 0x9a, 0x12, 0xbd, 0xf2, 0xa3, 0x71, 0xbd, 0x89, 0xf9, 0x5f, 0x3c, 0xec, 0xba, 0xdb, 0xbd, 0xd9, 0xce, 0x88, 0xbc, 0x0e, 0x80, 0xd9, 0xbc, 0xe4, 0xd2, 0x14, 0xbd, + 0xf8, 0x6b, 0xcb, 0xbc, 0x8d, 0x8f, 0x18, 0xbd, 0x7b, 0x51, 0x0b, 0xbd, 0x08, 0xb3, 0x04, 0xbd, 0x20, 0xe7, 0x00, 0xbd, 0xb8, 0x25, 0xac, 0x3b, 0x5e, 0x3e, 0xdc, 0xbb, 0xe2, 0xc1, 0x0d, 0xbd, + 0x26, 0xd2, 0x37, 0xbd, 0x42, 0xb8, 0x9f, 0xbc, 0x5c, 0x05, 0x2e, 0xbd, 0x46, 0xdd, 0xbd, 0xbd, 0xbe, 0x8a, 0x70, 0xbc, 0x88, 0xd8, 0x41, 0xbd, 0x1b, 0xd0, 0x10, 0xbd, 0x11, 0x8e, 0x80, 0xbd, + 0x3c, 0x62, 0x4f, 0xbd, 0xb7, 0x96, 0xbe, 0xbb, 0xe0, 0x71, 0x72, 0xbd, 0x18, 0x0e, 0x0f, 0x3b, 0xa9, 0x36, 0xb1, 0xbd, 0x0b, 0xc1, 0xc6, 0xbc, 0xaa, 0xe8, 0x99, 0x3b, 0x86, 0x03, 0x74, 0x3b, + 0x99, 0x13, 0x04, 0xbd, 0xe3, 0xf9, 0x23, 0xbd, 0x89, 0xe3, 0x76, 0xbd, 0x2c, 0xdd, 0x6d, 0xbc, 0x5a, 0x4f, 0x0b, 0xbd, 0xc4, 0x4f, 0x81, 0xbd, 0x2c, 0x73, 0x38, 0xbd, 0xe2, 0x47, 0x07, 0xbd, + 0x8c, 0xe8, 0x77, 0xbd, 0x91, 0xc0, 0xaa, 0xbd, 0x98, 0x58, 0xb4, 0xbd, 0xc2, 0xd2, 0x3b, 0xbd, 0xc3, 0x13, 0xc7, 0xbf, 0x25, 0xed, 0x06, 0xc0, 0xf7, 0xda, 0xcb, 0xbf, 0xe2, 0x45, 0xe8, 0xbf, + 0x5a, 0x21, 0xe0, 0xbf, 0xf4, 0x00, 0x45, 0xc0, 0x4f, 0xe2, 0x2f, 0xc0, 0x39, 0x98, 0x5f, 0xc0, 0xbb, 0x2c, 0xe7, 0xbf, 0xd4, 0xcd, 0xa1, 0xbf, 0x7b, 0xee, 0x47, 0xc0, 0x89, 0xa4, 
0xe9, 0xbf, + 0xe1, 0xeb, 0x0c, 0xc0, 0x75, 0xb9, 0x21, 0xc0, 0xf2, 0x7a, 0x02, 0xc0, 0xfe, 0xc7, 0x30, 0xc0, 0xae, 0xa7, 0x4d, 0xc0, 0xb9, 0xc5, 0x3b, 0xc0, 0x38, 0x18, 0x24, 0xc0, 0xe7, 0x1b, 0xd6, 0xbf, + 0xda, 0x1a, 0x02, 0xc0, 0x09, 0x4a, 0x9e, 0xbf, 0xde, 0xcc, 0x36, 0xc0, 0x74, 0x06, 0xd1, 0xbf, 0xcb, 0x7e, 0x30, 0xc0, 0x29, 0xdd, 0xe9, 0xbf, 0x88, 0x8b, 0x44, 0xc0, 0xd6, 0x22, 0xc3, 0xbf, + 0xb7, 0x7d, 0xf0, 0xbe, 0xa4, 0x6b, 0xa7, 0xbf, 0x59, 0x71, 0x5d, 0xc0, 0x22, 0xf9, 0xc9, 0xbf, 0x38, 0xc4, 0x20, 0xbf, 0x9d, 0x4a, 0xcf, 0xbf, 0x25, 0xf1, 0x09, 0xc0, 0xd7, 0xfa, 0x0f, 0xc0, + 0xa8, 0x34, 0x52, 0xbf, 0xbf, 0xd4, 0x17, 0xc0, 0x9c, 0x95, 0x55, 0xc0, 0x56, 0x74, 0x08, 0xc0, 0xab, 0x17, 0x33, 0xc0, 0x24, 0xd9, 0x74, 0xc0, 0x35, 0x86, 0x09, 0xc0, 0xc0, 0x10, 0xa2, 0xbf, + 0x94, 0x7c, 0x0e, 0xc0, 0x57, 0x62, 0xcd, 0xbf, 0xb8, 0xdc, 0x49, 0xc0, 0xb1, 0xba, 0x85, 0xbf, 0xdf, 0x07, 0x1c, 0xc0, 0xc7, 0x8d, 0xae, 0xbf, 0x74, 0x4c, 0x20, 0xc0, 0xa7, 0xc3, 0x38, 0xc0, + 0x8f, 0x84, 0x28, 0xbf, 0x26, 0x92, 0x9b, 0xc0, 0x0c, 0x12, 0xeb, 0xbe, 0xf0, 0x02, 0x0b, 0xc0, 0x8c, 0x7b, 0x6c, 0xc0, 0x35, 0xf5, 0x26, 0xc0, 0xac, 0xfa, 0x15, 0xc0, 0x1a, 0xf4, 0x5f, 0xc0, + 0xc4, 0x1b, 0xeb, 0xbf, 0xdb, 0xe7, 0x28, 0xbf, 0xff, 0x18, 0x51, 0xbf, 0x63, 0xae, 0x29, 0xc0, 0x41, 0x97, 0xbb, 0x3f, 0xfa, 0x2d, 0xa2, 0x3f, 0x34, 0x07, 0xac, 0x3f, 0xf5, 0x53, 0xce, 0x3f, + 0x68, 0xe4, 0x72, 0x3f, 0xd6, 0x97, 0xae, 0x3f, 0xc9, 0x39, 0xd2, 0x3f, 0x6a, 0x40, 0xd3, 0x3f, 0xce, 0x90, 0xf8, 0x3f, 0xb0, 0xed, 0xb9, 0x3f, 0xaf, 0x15, 0xc8, 0x3f, 0x9e, 0x16, 0xc6, 0x3f, + 0x47, 0x7b, 0xb8, 0x3f, 0x32, 0xf9, 0xe6, 0x3f, 0xde, 0xbb, 0xa3, 0x3f, 0x0a, 0x4f, 0xd7, 0x3f, 0x5a, 0xcf, 0x98, 0x3f, 0x2e, 0xa6, 0xe1, 0x3f, 0x2f, 0x62, 0x8a, 0x3f, 0x23, 0x50, 0x34, 0x3f, + 0x0e, 0xb7, 0xa9, 0x3f, 0xdc, 0x20, 0xc0, 0x3f, 0x0a, 0x8b, 0xd9, 0x3f, 0xa4, 0x5a, 0xb9, 0x3f, 0xc6, 0xe5, 0xf8, 0x3f, 0x2a, 0x7c, 0xa3, 0x3f, 0x1a, 0x56, 0x87, 0x3f, 0xfc, 0x49, 0x95, 0x3f, + 0xfc, 0xcf, 0x90, 0x3f, 0x1a, 0xee, 0xb9, 
0x3f, 0x21, 0xd3, 0x96, 0x3f, 0xca, 0x2b, 0x8d, 0x3f, 0x15, 0xdf, 0x8c, 0x3f, 0x3e, 0x44, 0x94, 0x3f, 0xe1, 0x73, 0xc8, 0x3f, 0x1c, 0xd7, 0xe5, 0x3e, + 0x0c, 0x63, 0xa3, 0x3f, 0x55, 0xd2, 0x8d, 0x3f, 0x44, 0x04, 0xd1, 0x3f, 0xd2, 0x49, 0x67, 0x3f, 0x42, 0x7b, 0x09, 0x40, 0x27, 0xe6, 0xd3, 0x3f, 0xb0, 0x2d, 0xd2, 0x3f, 0x04, 0xe0, 0xba, 0x3f, + 0x3d, 0x83, 0xe1, 0x3f, 0x0a, 0x7a, 0x7a, 0x3f, 0x60, 0xe9, 0xb3, 0x3f, 0xc1, 0x62, 0x4c, 0x3f, 0xc8, 0xf1, 0xd0, 0x3f, 0xee, 0xe5, 0xa8, 0x3f, 0x14, 0xca, 0xd5, 0x3f, 0x10, 0x3c, 0x93, 0x3f, + 0xf7, 0x29, 0x14, 0x3f, 0x98, 0x48, 0xbc, 0x3f, 0x76, 0x36, 0x16, 0x3f, 0x8e, 0x23, 0xf8, 0x3f, 0xd6, 0x29, 0x07, 0x40, 0x3e, 0x36, 0xcd, 0x3f, 0x58, 0x55, 0xb6, 0x3f, 0x4a, 0xaf, 0x08, 0x40, + 0x1e, 0x83, 0x06, 0x40, 0x82, 0x54, 0xfd, 0x3e, 0x8e, 0x2c, 0x41, 0x3f, 0xda, 0xaa, 0xa6, 0x3f, 0x4b, 0xac, 0xa2, 0x3e, 0x77, 0xff, 0x0c, 0x3f, 0x54, 0x12, 0x18, 0x3f, 0x40, 0xad, 0x41, 0xbd, + 0x84, 0x46, 0x67, 0x3f, 0xa8, 0x44, 0x19, 0x3f, 0x7f, 0x4f, 0xd1, 0x3e, 0xd5, 0xe8, 0xbd, 0x3e, 0x70, 0x1f, 0x33, 0x3d, 0xcd, 0xdc, 0x89, 0x3e, 0x91, 0x12, 0x01, 0x3f, 0xb4, 0x50, 0x31, 0xbe, + 0xff, 0x25, 0xac, 0x3e, 0xba, 0x9e, 0x05, 0x3f, 0xba, 0xa3, 0xc9, 0x3e, 0xa3, 0x9b, 0xf9, 0x3e, 0x43, 0x5e, 0x82, 0x3f, 0xc2, 0x36, 0xf2, 0x3e, 0x8f, 0x1b, 0x4a, 0x3f, 0xd6, 0x98, 0x33, 0x3f, + 0x86, 0xc7, 0x31, 0x3f, 0xba, 0xe0, 0xe4, 0x3e, 0x34, 0x38, 0xdf, 0x3d, 0x34, 0x5b, 0xda, 0xbd, 0x9c, 0xe0, 0x59, 0x3f, 0xda, 0x71, 0x84, 0x3e, 0x1c, 0x00, 0x94, 0x3f, 0x3e, 0x82, 0x1e, 0x3f, + 0xd6, 0x88, 0x01, 0xbe, 0x62, 0xef, 0x33, 0x3e, 0x1a, 0x4b, 0x18, 0x3f, 0x8a, 0x83, 0x7f, 0x3e, 0xf8, 0x1e, 0x0c, 0xbe, 0xbb, 0x93, 0x5d, 0x3f, 0x54, 0xd0, 0x12, 0x3f, 0x1e, 0x13, 0xc8, 0x3d, + 0x60, 0xb8, 0xae, 0x3c, 0x8e, 0x52, 0xc9, 0x3e, 0x84, 0xb2, 0xb6, 0x3e, 0x80, 0x10, 0xdf, 0x3e, 0x6d, 0xa7, 0x04, 0x3f, 0x3d, 0x43, 0x82, 0x3f, 0x20, 0xb2, 0xf9, 0x3e, 0x8e, 0x93, 0x05, 0x3f, + 0xd6, 0xec, 0xc8, 0x3e, 0x10, 0x9d, 0x8f, 0x3e, 0x3a, 0xf6, 0x2a, 0x3f, 0x8b, 0x02, 0xb2, 0x3e, 0x11, 
0xde, 0x45, 0x3f, 0x8b, 0x11, 0x28, 0x3f, 0x9e, 0xa5, 0x2c, 0x3f, 0xb2, 0xd5, 0x10, 0x3f, + 0xbd, 0x45, 0xa1, 0xbe, 0x94, 0x88, 0xd0, 0x3f, 0x74, 0xd7, 0x7f, 0x3f, 0xca, 0x30, 0x37, 0x3f, 0x28, 0x56, 0x60, 0x3f, 0x24, 0x5b, 0xa4, 0x3e, 0xae, 0xfa, 0x52, 0x3f, 0x1d, 0x4d, 0x98, 0x3f, + 0x1b, 0x43, 0xe6, 0x3e, 0xd6, 0x23, 0x3b, 0x3f, 0x80, 0xc4, 0xf4, 0x3c, 0x8c, 0x06, 0x5b, 0x3f, 0xff, 0x49, 0x3c, 0xbe, 0xb8, 0xda, 0x16, 0xbe, 0x96, 0x7c, 0x36, 0xbe, 0xae, 0x3b, 0x37, 0xbe, + 0xeb, 0x38, 0x04, 0xbe, 0xc2, 0x18, 0x07, 0xbe, 0xaa, 0x65, 0x33, 0xbe, 0x10, 0x3e, 0x19, 0xbe, 0x08, 0xdf, 0x6e, 0xbe, 0xb0, 0xc0, 0x41, 0xbe, 0x36, 0x49, 0x1f, 0xbe, 0x19, 0x89, 0x26, 0xbe, + 0x44, 0x6f, 0x23, 0xbe, 0x9a, 0xa3, 0x59, 0xbe, 0x1c, 0x0e, 0x13, 0xbe, 0x47, 0x0c, 0x3d, 0xbe, 0x56, 0xac, 0xfb, 0xbd, 0xe8, 0xda, 0x42, 0xbe, 0xe5, 0xed, 0xed, 0xbd, 0xe3, 0x05, 0xae, 0xbd, + 0x1d, 0xd3, 0x29, 0xbe, 0x3a, 0x3b, 0x53, 0xbe, 0x86, 0x23, 0x29, 0xbe, 0xd6, 0x39, 0x21, 0xbe, 0x28, 0x1f, 0x78, 0xbe, 0x13, 0x10, 0x13, 0xbe, 0x3a, 0x8f, 0xe9, 0xbd, 0xf1, 0xcf, 0x1e, 0xbe, + 0xf5, 0x43, 0x17, 0xbe, 0xa6, 0x77, 0x3b, 0xbe, 0xec, 0xb0, 0xba, 0xbd, 0x4a, 0x52, 0x00, 0xbe, 0xa8, 0x9a, 0x0c, 0xbe, 0x24, 0xf5, 0x26, 0xbe, 0x3e, 0x56, 0x44, 0xbe, 0xb0, 0x75, 0x35, 0x3b, + 0xfe, 0x20, 0x29, 0xbe, 0xeb, 0xae, 0xda, 0xbd, 0x94, 0x2d, 0x1b, 0xbe, 0x86, 0x9b, 0xb0, 0xbd, 0x35, 0x7c, 0x82, 0xbe, 0x56, 0xcc, 0x2f, 0xbe, 0x64, 0xd6, 0x4b, 0xbe, 0x77, 0xbf, 0x4f, 0xbe, + 0xfe, 0xb6, 0x56, 0xbe, 0xae, 0xc1, 0xdb, 0xbd, 0x2f, 0x6d, 0x0e, 0xbe, 0xe9, 0x8a, 0xd1, 0xbd, 0x27, 0x17, 0x4f, 0xbe, 0x6b, 0xbe, 0x3d, 0xbe, 0x82, 0x95, 0x4d, 0xbe, 0x10, 0x37, 0xd6, 0xbd, + 0x52, 0x35, 0x49, 0xbd, 0x52, 0x69, 0x0f, 0xbe, 0xcb, 0x3f, 0xfb, 0xbd, 0xf6, 0x21, 0x82, 0xbe, 0xae, 0x22, 0x73, 0xbe, 0x4c, 0xa5, 0x2d, 0xbe, 0xca, 0x01, 0x35, 0xbe, 0xd0, 0xc2, 0x86, 0xbe, + 0x76, 0x94, 0x8d, 0xbe, 0x03, 0xd0, 0xb5, 0xbd, 0xec, 0x1b, 0xb3, 0xbd, 0x31, 0x4f, 0x19, 0xbe, 0x80, 0x80, 0x8d, 0xbd, 0x0d, 0x05, 0x9b, 0xbd, 0xb5, 0x62, 0xd3, 
0xbd, 0xe8, 0x42, 0x55, 0xbc, + 0xba, 0x26, 0xff, 0xbd, 0x03, 0xeb, 0x62, 0xbd, 0x88, 0x50, 0x54, 0xbd, 0xc2, 0xc8, 0xb1, 0xbc, 0xb2, 0xfc, 0x31, 0xbd, 0x31, 0x2a, 0x91, 0xbd, 0x8f, 0x63, 0x4b, 0xbd, 0xb0, 0xaa, 0x1a, 0x3c, + 0xc4, 0x1d, 0x49, 0xbd, 0xdf, 0x9d, 0xac, 0xbd, 0x70, 0x95, 0x61, 0xbd, 0xf4, 0x71, 0x85, 0xbd, 0x30, 0xc4, 0xd7, 0xbd, 0x3e, 0x1d, 0x7b, 0xbd, 0xa6, 0x33, 0xb2, 0xbd, 0x3f, 0x73, 0xad, 0xbd, + 0x29, 0x54, 0xd2, 0xbd, 0xc6, 0x9b, 0xce, 0xbd, 0xd0, 0x7c, 0xc8, 0xbb, 0x20, 0x97, 0x01, 0xbb, 0x96, 0x27, 0x08, 0xbe, 0x0c, 0x0e, 0x2b, 0xbd, 0x72, 0xd2, 0xfd, 0xbd, 0xb3, 0x91, 0xcd, 0xbd, + 0x48, 0xb2, 0xb3, 0xbc, 0x0a, 0xde, 0x62, 0xbd, 0x95, 0x69, 0x06, 0xbd, 0x73, 0xbe, 0x23, 0xbd, 0x74, 0x51, 0x5e, 0xbc, 0xa3, 0xf9, 0x08, 0xbe, 0x89, 0x66, 0xbd, 0xbd, 0xd1, 0x32, 0x1b, 0x3d, + 0xd4, 0x69, 0x22, 0xbd, 0x6a, 0x98, 0x10, 0xbd, 0x90, 0x08, 0xc4, 0xbc, 0xf0, 0x9a, 0x21, 0xbd, 0x8b, 0x1f, 0xbc, 0xbd, 0x6b, 0xfa, 0xdc, 0xbd, 0x89, 0x44, 0xab, 0xbd, 0x58, 0x5b, 0xdf, 0xbd, + 0xbd, 0xfa, 0x94, 0xbd, 0x26, 0xa4, 0x19, 0xbd, 0x8a, 0xc3, 0x85, 0xbd, 0xd9, 0x79, 0x6a, 0xbd, 0xea, 0x29, 0xee, 0xbd, 0x95, 0xb4, 0xf3, 0xbd, 0xfc, 0x38, 0xcf, 0xbd, 0xd7, 0x03, 0x3d, 0xbd, + 0x38, 0xdf, 0x24, 0x3d, 0x82, 0x9c, 0x1f, 0xbe, 0x4b, 0xe0, 0x27, 0xbe, 0xcc, 0x07, 0x07, 0xbe, 0x9c, 0x37, 0xe9, 0xbd, 0xba, 0x63, 0x29, 0xbd, 0x60, 0x10, 0xef, 0xbd, 0xd9, 0xaa, 0x2c, 0xbe, + 0x4f, 0xfc, 0xe3, 0xbd, 0x47, 0x31, 0xe6, 0xbd, 0x20, 0x83, 0x75, 0xbc, 0xdc, 0x2b, 0xd7, 0xbd, 0x13, 0x82, 0x9d, 0xbd, 0x2e, 0x2b, 0x9b, 0xbd, 0x03, 0x5f, 0x8e, 0xbd, 0x5d, 0xf2, 0xba, 0xbd, + 0x31, 0x04, 0x5b, 0xbd, 0x69, 0x7f, 0xc2, 0xbd, 0x88, 0x79, 0xd1, 0xbd, 0x88, 0x81, 0xec, 0xbd, 0x9d, 0xaa, 0xd1, 0xbd, 0xcc, 0xcf, 0x93, 0xbd, 0x13, 0xc6, 0xd5, 0xbd, 0x33, 0x97, 0xb9, 0xbd, + 0x56, 0x3d, 0xb1, 0xbd, 0xa0, 0x79, 0xd3, 0xbd, 0x71, 0xbf, 0x9d, 0xbd, 0xf8, 0xfc, 0xd2, 0xbd, 0x86, 0x4b, 0xaf, 0xbd, 0x8e, 0x6c, 0xdf, 0xbd, 0x24, 0x56, 0x96, 0xbd, 0xbc, 0x75, 0x3a, 0xbd, + 0xbc, 0x70, 0x99, 0xbd, 
0x28, 0x0b, 0x92, 0xbd, 0xa7, 0x3a, 0xe1, 0xbd, 0x95, 0xae, 0xa9, 0xbd, 0x7e, 0xae, 0xdd, 0xbd, 0x1e, 0xd5, 0x99, 0xbd, 0xfd, 0x6c, 0x9c, 0xbd, 0x84, 0x84, 0x7b, 0xbd, + 0x2f, 0x51, 0x54, 0xbd, 0x83, 0xb4, 0x97, 0xbd, 0x4a, 0x5e, 0xc1, 0xbd, 0x9f, 0x2c, 0x84, 0xbd, 0x3c, 0xec, 0x5a, 0xbd, 0xf0, 0x28, 0x74, 0xbd, 0xc5, 0x28, 0xb3, 0xbd, 0x5a, 0x87, 0x59, 0xbd, + 0xf2, 0x06, 0x7b, 0xbd, 0xdf, 0x00, 0x9c, 0xbd, 0xd3, 0x2f, 0xe6, 0xbd, 0x4d, 0x02, 0x83, 0xbd, 0x75, 0x3a, 0xf7, 0xbd, 0x2f, 0xac, 0xe7, 0xbd, 0xf3, 0xf7, 0xba, 0xbd, 0x20, 0xfc, 0x8d, 0xbd, + 0x13, 0x41, 0xc9, 0xbd, 0x7f, 0x76, 0x75, 0xbd, 0xb9, 0x82, 0xc6, 0xbd, 0x1a, 0x27, 0x30, 0xbd, 0xdc, 0xcb, 0xbc, 0xbd, 0x69, 0x14, 0x83, 0xbd, 0x65, 0x80, 0xc4, 0xbd, 0xf3, 0x65, 0xac, 0xbd, + 0xb4, 0xf6, 0x15, 0xbd, 0xaa, 0x34, 0xed, 0xbd, 0xa6, 0x9a, 0x8c, 0xbc, 0x27, 0xb4, 0xcc, 0xbd, 0x79, 0xf1, 0x04, 0xbe, 0x19, 0xf4, 0xcb, 0xbd, 0x7f, 0x4a, 0xa8, 0xbd, 0xd9, 0x00, 0xfd, 0xbd, + 0xdc, 0x98, 0xd4, 0xbd, 0xa0, 0x39, 0xa7, 0xbc, 0x4e, 0x22, 0x2a, 0xbd, 0x32, 0x98, 0xa8, 0xbd, 0x51, 0xe1, 0x9f, 0xbc, 0xfa, 0xa5, 0x23, 0xbd, 0x90, 0x27, 0x03, 0xbd, 0xe5, 0x56, 0x08, 0xbc, + 0x15, 0xb9, 0x51, 0xbd, 0xcf, 0x42, 0x68, 0xbd, 0xff, 0x4f, 0x26, 0xbd, 0x1f, 0xf9, 0x52, 0xbd, 0x1e, 0xac, 0xf4, 0xbb, 0xa1, 0x21, 0x55, 0xbc, 0x83, 0xab, 0x52, 0xbd, 0xee, 0x36, 0x96, 0xbb, + 0x07, 0x2b, 0x00, 0xbd, 0xe9, 0x49, 0x20, 0xbd, 0x62, 0x2d, 0x06, 0xbd, 0x55, 0x53, 0x31, 0xbd, 0x74, 0x57, 0x9d, 0xbd, 0xec, 0xb1, 0x36, 0xbd, 0x07, 0xf2, 0x70, 0xbd, 0x18, 0xe3, 0x39, 0xbd, + 0x67, 0x8f, 0x31, 0xbd, 0x41, 0x77, 0x98, 0xbc, 0x1d, 0x6c, 0xf9, 0xbc, 0xae, 0xb1, 0xa7, 0xbb, 0xd6, 0x6d, 0x5b, 0xbd, 0x37, 0x22, 0xc6, 0xbc, 0x99, 0x8e, 0xa6, 0xbd, 0x55, 0x76, 0x0b, 0xbd, + 0xa3, 0x28, 0x35, 0x3c, 0xf9, 0xa2, 0x27, 0xbc, 0x39, 0xa0, 0x85, 0xbd, 0xc6, 0x27, 0xb3, 0xbc, 0x8c, 0xfb, 0x09, 0x3c, 0xf0, 0x31, 0x38, 0xbd, 0x99, 0x0b, 0x1b, 0xbd, 0x9e, 0x99, 0x11, 0xbd, + 0xe0, 0xc3, 0xc0, 0x3a, 0x08, 0x9f, 0x25, 0xbd, 0xa2, 0x06, 0x47, 0xbd, 0x8c, 0x36, 
0x26, 0xbd, 0xa2, 0xa3, 0x20, 0xbd, 0x0f, 0x5d, 0xa6, 0xbd, 0xa7, 0x87, 0x09, 0xbd, 0x51, 0xa9, 0xb7, 0xbc, + 0xcf, 0x1d, 0xf4, 0xbc, 0xca, 0x0d, 0xcd, 0xbc, 0x9e, 0xee, 0x75, 0xbd, 0x3a, 0xb2, 0xa7, 0xbc, 0x54, 0x93, 0x49, 0xbd, 0x23, 0xc1, 0xfc, 0xbc, 0xc1, 0x0e, 0x3d, 0xbd, 0xc2, 0x16, 0x61, 0xbd, + 0x45, 0x4e, 0x04, 0x3c, 0xda, 0x67, 0xfd, 0xbd, 0x1e, 0xce, 0x1a, 0xbd, 0x0e, 0xf0, 0x1e, 0xbd, 0x12, 0x5c, 0x8b, 0xbd, 0x78, 0x4f, 0x11, 0xbd, 0xca, 0xa1, 0x55, 0xbd, 0x13, 0x26, 0x9c, 0xbd, + 0x5c, 0xb8, 0xad, 0xbc, 0x14, 0xfe, 0x03, 0xbd, 0x30, 0xb6, 0xad, 0xbb, 0xde, 0xbd, 0x75, 0xbd, 0xbc, 0x40, 0x1d, 0xc0, 0x8e, 0xac, 0xfd, 0xbf, 0xee, 0x24, 0x31, 0xc0, 0xd6, 0x58, 0xda, 0xbf, + 0x77, 0xa7, 0x1a, 0xc0, 0xf6, 0x40, 0xab, 0xbf, 0x1e, 0xfa, 0xee, 0xbf, 0x74, 0xf3, 0x86, 0xbf, 0xa5, 0x58, 0x2a, 0xc0, 0x96, 0x0a, 0x27, 0xc0, 0xb7, 0x88, 0xc3, 0xbf, 0xa8, 0x2f, 0xa2, 0xbf, + 0xfd, 0x12, 0xe6, 0xbf, 0x2b, 0x48, 0x2d, 0xc0, 0xe6, 0xdc, 0xdc, 0xbf, 0xb2, 0xa4, 0x07, 0xc0, 0x46, 0xd7, 0xe1, 0xbf, 0xa0, 0x34, 0x06, 0xc0, 0x79, 0x8b, 0xd2, 0xbf, 0x31, 0xca, 0xbe, 0xbf, + 0x50, 0xd3, 0x20, 0xc0, 0x27, 0x5f, 0x49, 0xc0, 0x02, 0x43, 0x9b, 0xbf, 0xab, 0x50, 0xb1, 0xbf, 0x0a, 0x12, 0x62, 0xc0, 0x41, 0x7a, 0xd0, 0xbf, 0xe9, 0x6a, 0xf4, 0xbf, 0x13, 0x8b, 0x1f, 0xc0, + 0x0c, 0x46, 0xe5, 0xbf, 0xb4, 0xc3, 0x16, 0xc0, 0xaf, 0x99, 0x07, 0xbf, 0x64, 0x86, 0xbb, 0xbf, 0x1d, 0x2b, 0xc5, 0xbf, 0x90, 0xca, 0x3b, 0xc0, 0x66, 0x26, 0x2a, 0xc0, 0xff, 0x89, 0x82, 0x3f, + 0x82, 0x54, 0x06, 0xc0, 0x32, 0xae, 0x80, 0xbf, 0xcb, 0xe2, 0x91, 0xbf, 0x50, 0x87, 0x61, 0xbf, 0xd6, 0xdb, 0x4c, 0xc0, 0x6d, 0x15, 0x0c, 0xc0, 0x49, 0x27, 0x29, 0xc0, 0xa4, 0x56, 0x4c, 0xc0, + 0xac, 0xb9, 0x27, 0xc0, 0x4d, 0x2f, 0x9d, 0xbf, 0xaa, 0x68, 0xc1, 0xbf, 0xd6, 0x55, 0xc4, 0xbf, 0x0b, 0x55, 0x3e, 0xc0, 0xa6, 0x45, 0x45, 0xc0, 0x0b, 0x54, 0x31, 0xc0, 0x7b, 0x60, 0x79, 0xbf, + 0xc9, 0xa4, 0x84, 0x3d, 0x05, 0x47, 0x07, 0xc0, 0x92, 0x05, 0x47, 0xc0, 0x81, 0x30, 0x73, 0xc0, 0x0c, 0xd3, 0x41, 0xc0, 0xe6, 0x42, 0xdc, 0xbf, 
0x58, 0xd2, 0x2d, 0xc0, 0x0d, 0xed, 0x7e, 0xc0, - 0xee, 0x16, 0xac, 0xbf, 0x7d, 0x67, 0xba, 0xbf, 0x7b, 0x05, 0xd6, 0xbe, 0x76, 0x7e, 0x75, 0xbe, - 0x7a, 0xc2, 0x53, 0xc0, 0x84, 0x5c, 0xd5, 0xbf, 0xb5, 0xc6, 0xc2, 0xbf, 0xbd, 0x5b, 0x7a, 0x3e, - 0x81, 0x24, 0x23, 0xc0, 0x44, 0x9a, 0x35, 0xc0, 0x74, 0x7d, 0x51, 0xbf, 0xcd, 0x4c, 0xb7, 0xbe, - 0xf4, 0x46, 0xee, 0xbf, 0x64, 0x84, 0x34, 0xc0, 0x20, 0xc1, 0xbb, 0xbf, 0xf5, 0xd5, 0x1e, 0xc0, - 0xe1, 0x5f, 0x00, 0xc0, 0xf8, 0x3a, 0x14, 0xc0, 0xa8, 0x24, 0xfe, 0xbf, 0xd0, 0x6a, 0xe9, 0xbf, - 0x78, 0x6d, 0xe6, 0xbf, 0x3f, 0x25, 0x3e, 0xc0, 0xdc, 0x09, 0xa7, 0xbf, 0xfd, 0x31, 0xae, 0xbf, - 0xc1, 0x09, 0x45, 0xc0, 0x80, 0xa3, 0x9c, 0xbf, 0x8f, 0x27, 0x9e, 0xbf, 0xf0, 0x4f, 0x3c, 0xc0, - 0x7f, 0x79, 0x38, 0xc0, 0x6f, 0x66, 0x02, 0xc0, 0xa5, 0xeb, 0xdb, 0xbf, 0x27, 0xe7, 0x20, 0xc0, - 0xb4, 0x6b, 0x9d, 0x3f, 0xfa, 0x6c, 0x98, 0x3f, 0x1f, 0xb7, 0x72, 0x3f, 0x45, 0xff, 0x8f, 0x3f, - 0x8f, 0xcd, 0xb6, 0x3f, 0xe9, 0x87, 0x2f, 0x3f, 0x39, 0x3b, 0x5a, 0x3f, 0xb8, 0x10, 0x85, 0x3f, - 0x6b, 0x3a, 0x8f, 0x3f, 0x99, 0x81, 0xf7, 0x3f, 0xa8, 0x8f, 0xa4, 0x3f, 0x3e, 0x75, 0x00, 0x3f, - 0x08, 0xa0, 0xbf, 0x3f, 0x32, 0xdc, 0x85, 0x3f, 0xef, 0x2c, 0x9d, 0x3f, 0xb0, 0x4b, 0xf2, 0x3f, - 0xcb, 0x87, 0x85, 0x3f, 0xca, 0x64, 0xa2, 0x3f, 0x46, 0x70, 0xb0, 0x3f, 0x14, 0x19, 0xff, 0x3f, - 0x06, 0x10, 0xbb, 0x3f, 0xd7, 0x3a, 0x93, 0x3f, 0xfa, 0xe4, 0xe9, 0x3f, 0xad, 0xaf, 0x55, 0x3f, - 0xec, 0xfe, 0x08, 0x40, 0x43, 0x0a, 0xb1, 0x3f, 0xcc, 0xbf, 0xc6, 0x3f, 0x60, 0xdb, 0xb0, 0x3f, - 0x01, 0x51, 0xcc, 0x3f, 0x22, 0xc7, 0xdc, 0x3f, 0xed, 0xc7, 0xed, 0x3f, 0x46, 0x4a, 0xdc, 0x3f, - 0x0a, 0x58, 0x94, 0x3e, 0x10, 0xef, 0xd4, 0xbc, 0x00, 0xe3, 0xd7, 0x3a, 0xd3, 0x84, 0x9b, 0x3e, - 0xd8, 0x2c, 0x10, 0x3f, 0x0e, 0xa9, 0xc6, 0x3e, 0x9b, 0x83, 0xdd, 0xbd, 0x48, 0x85, 0xaf, 0xbd, - 0xe6, 0x30, 0x87, 0x3f, 0x0b, 0x2f, 0x89, 0x3e, 0x27, 0x42, 0x24, 0x3f, 0x77, 0xcd, 0x86, 0xbe, - 0xda, 0xb2, 0x25, 0x3e, 0x5e, 0x46, 0x99, 0x3f, 0xb9, 0x03, 0xa5, 0x3e, 0x3e, 0x8e, 
0x09, 0x3f, - 0x6f, 0xd3, 0x37, 0x3f, 0x08, 0x1b, 0x39, 0x3f, 0xec, 0xce, 0xbc, 0x3e, 0xa8, 0xdd, 0xde, 0x3e, - 0xc6, 0xfd, 0x1c, 0x3f, 0x71, 0xcb, 0x0f, 0x3f, 0x08, 0xce, 0xfe, 0x3d, 0x65, 0x5f, 0x68, 0x3e, - 0xe8, 0x52, 0x9f, 0x3e, 0x2c, 0x3f, 0xc0, 0x3d, 0x42, 0xdb, 0x3f, 0x3f, 0xdf, 0xcc, 0xf5, 0x3e, - 0x7c, 0x3b, 0x6a, 0x3f, 0xc6, 0x9b, 0x14, 0x3f, 0xe6, 0xa6, 0x73, 0x3f, 0xe1, 0x9d, 0xa3, 0x3e, - 0x45, 0xd9, 0x1d, 0xbe, 0x48, 0xdf, 0x03, 0xbe, 0x08, 0x72, 0x03, 0xbe, 0x53, 0x2a, 0x34, 0xbe, - 0x96, 0x42, 0x07, 0xbe, 0x8f, 0x93, 0x88, 0xbd, 0xd8, 0xeb, 0x92, 0xbd, 0xba, 0xa2, 0x23, 0xbe, - 0xdc, 0xf0, 0x0a, 0xbe, 0x04, 0xdf, 0x55, 0xbe, 0x9f, 0x64, 0x4a, 0xbe, 0x84, 0x7f, 0x4d, 0xbd, - 0xcb, 0xb1, 0x2e, 0xbe, 0x27, 0x55, 0xfb, 0xbd, 0x54, 0x0b, 0x1b, 0xbe, 0xb3, 0x8d, 0x69, 0xbe, - 0x09, 0x89, 0x00, 0xbe, 0x5f, 0xd6, 0x18, 0xbe, 0xea, 0xd5, 0x22, 0xbe, 0x6b, 0x14, 0x85, 0xbe, - 0x10, 0x6d, 0x42, 0xbe, 0x33, 0x0c, 0xd0, 0xbd, 0xf6, 0x7f, 0x72, 0xbe, 0xb0, 0xc1, 0xba, 0xbd, - 0xa3, 0xa2, 0x6f, 0xbe, 0xe4, 0x7b, 0x2f, 0xbe, 0x20, 0xb2, 0x6a, 0xbe, 0x0b, 0x60, 0x08, 0xbe, - 0xc6, 0xa3, 0x41, 0xbe, 0xc0, 0x1b, 0x61, 0xbe, 0x07, 0xb6, 0x89, 0xbe, 0x1a, 0xe5, 0x42, 0xbe, - 0xdb, 0x52, 0x76, 0xbd, 0x88, 0xa3, 0xe8, 0xbb, 0xa2, 0x1c, 0x0a, 0xbd, 0x8c, 0x24, 0xc0, 0xbd, - 0x4e, 0xc0, 0x3a, 0xbd, 0x50, 0x3b, 0x17, 0xbd, 0xc6, 0x42, 0xc7, 0x3c, 0x22, 0xa1, 0x41, 0xbd, - 0xd8, 0x82, 0x04, 0xbe, 0x2d, 0x36, 0x2d, 0xbd, 0x8f, 0x72, 0x08, 0xbe, 0xe5, 0xc0, 0xd7, 0x3c, - 0x68, 0xdb, 0x17, 0xbd, 0xb7, 0x32, 0x0d, 0xbe, 0x1a, 0x3a, 0x79, 0xbd, 0x7e, 0x7a, 0xbd, 0xbd, - 0x8c, 0x7b, 0xbe, 0xbd, 0xe7, 0xed, 0xc1, 0xbd, 0x1c, 0xc9, 0x6d, 0xbd, 0x04, 0x4f, 0xd5, 0xbd, - 0x1a, 0x25, 0xd7, 0xbd, 0x02, 0x08, 0x2f, 0xbd, 0xc3, 0x3f, 0x88, 0xbd, 0xea, 0x20, 0xf9, 0xbc, - 0x2d, 0x3a, 0x51, 0xbd, 0x5e, 0xdb, 0x2d, 0xbd, 0x64, 0x70, 0x18, 0xbe, 0xda, 0x32, 0x30, 0xbd, - 0x52, 0xc2, 0xf7, 0xbd, 0xe7, 0xdb, 0xd7, 0xbd, 0x79, 0xa8, 0x36, 0xbe, 0x6e, 0x8b, 0x4d, 0xbd, - 0x01, 0xed, 0x84, 0xbd, 
0x22, 0xd4, 0x8d, 0xbd, 0xc0, 0x14, 0x2e, 0xbd, 0x68, 0x9c, 0x2d, 0xbd, - 0xe7, 0x33, 0xd0, 0xbd, 0xc9, 0x3e, 0x47, 0xbd, 0x78, 0x5b, 0x70, 0xbd, 0x37, 0x60, 0x13, 0xbd, - 0x79, 0xe8, 0x91, 0xbd, 0xd1, 0x7f, 0xee, 0xbd, 0x58, 0x7c, 0x5c, 0xbd, 0x76, 0xfd, 0xe5, 0xbc, - 0x89, 0x5f, 0xae, 0xbd, 0xda, 0xef, 0x91, 0xbd, 0x57, 0x1f, 0x88, 0xbd, 0x92, 0x55, 0xd8, 0xbd, - 0xbc, 0x33, 0x82, 0xbd, 0x5b, 0xfb, 0x9d, 0xbd, 0xba, 0x31, 0xa4, 0xbd, 0xfa, 0x28, 0xcc, 0xbd, - 0x5d, 0xfc, 0x9d, 0xbd, 0x8e, 0x5e, 0xaf, 0xbd, 0xfa, 0xb1, 0xb5, 0xbd, 0x4c, 0x7f, 0x51, 0xbd, - 0x04, 0xc2, 0x02, 0xbe, 0x5e, 0x97, 0x91, 0xbd, 0x09, 0x3d, 0x8e, 0xbd, 0xe2, 0x41, 0xc2, 0xbd, - 0x96, 0x8d, 0xc5, 0xbd, 0x12, 0x27, 0xbb, 0xbd, 0xe8, 0xf2, 0xb0, 0xbd, 0xe3, 0xce, 0xd1, 0xbd, - 0xca, 0xa1, 0x92, 0xbc, 0xaf, 0xcf, 0x09, 0xbc, 0xf6, 0x07, 0x9e, 0x3b, 0xb0, 0x89, 0xad, 0xba, - 0xcb, 0x42, 0x6f, 0xbd, 0xe1, 0x48, 0x0a, 0xbd, 0xb8, 0x7b, 0x2e, 0xbc, 0x56, 0x29, 0xa6, 0x3c, - 0x75, 0x7a, 0x8b, 0xbd, 0xb2, 0xdc, 0x07, 0xbd, 0x6f, 0x30, 0xb1, 0xbc, 0xff, 0xc4, 0x26, 0x3c, - 0x62, 0x92, 0x96, 0xbc, 0xfe, 0x77, 0xa1, 0xbd, 0x28, 0x8a, 0xad, 0xbc, 0x3a, 0x0a, 0x1b, 0xbd, - 0xf2, 0xd0, 0x41, 0xbd, 0xde, 0x37, 0x4c, 0xbd, 0x5e, 0x38, 0xf3, 0xbc, 0x9a, 0x9e, 0xb2, 0xbc, - 0xb4, 0x43, 0x0e, 0xbd, 0xec, 0x34, 0x66, 0xbd, 0x90, 0x3c, 0x6a, 0xbb, 0xac, 0x0b, 0xaa, 0xbc, - 0xe0, 0x5f, 0x14, 0xbd, 0x12, 0x25, 0xd9, 0xbb, 0x32, 0xed, 0xef, 0xbc, 0x1c, 0xdd, 0x4b, 0xbd, - 0xf3, 0x95, 0x7f, 0xbd, 0x8a, 0xf2, 0x0b, 0xbd, 0xa6, 0x5e, 0x28, 0xbd, 0x48, 0x99, 0x04, 0xbd, - 0x77, 0xc6, 0x04, 0xc0, 0x4d, 0x11, 0x95, 0xbf, 0x14, 0x06, 0xdc, 0xbf, 0xa1, 0x64, 0x3e, 0xc0, - 0x01, 0xfc, 0x95, 0xbf, 0xf2, 0xb6, 0x3f, 0xbf, 0x50, 0xab, 0x1c, 0xbe, 0x2c, 0x01, 0x19, 0xc0, - 0x3a, 0x14, 0x16, 0xc0, 0xc3, 0x6c, 0x07, 0xc0, 0x75, 0x27, 0x61, 0xc0, 0xf8, 0xe8, 0x36, 0xbe, - 0x6b, 0x7b, 0xef, 0xbf, 0x7f, 0x82, 0x0e, 0xc0, 0x09, 0x5b, 0x01, 0xc0, 0x34, 0x1f, 0x3f, 0xc0, - 0xa3, 0x7f, 0xfa, 0xbf, 0xa0, 0x95, 0x0a, 0xc0, 0x5a, 0x0c, 
0xf9, 0xbf, 0x24, 0xf5, 0x68, 0xc0, - 0x0b, 0x8a, 0x37, 0xc0, 0x8a, 0x7d, 0x61, 0xbf, 0x28, 0x87, 0x45, 0xc0, 0x7a, 0x8e, 0x83, 0xbf, - 0x24, 0x03, 0x1c, 0xc0, 0x3c, 0x5e, 0x06, 0xc0, 0x53, 0x6f, 0x7c, 0xc0, 0xf7, 0x90, 0x9f, 0xbf, - 0x18, 0x23, 0x31, 0xc0, 0x36, 0x91, 0x49, 0xc0, 0x7d, 0x4c, 0x93, 0xc0, 0xcf, 0xf0, 0x04, 0xc0, - 0xf9, 0xc8, 0xb3, 0x3f, 0x24, 0x2f, 0xd5, 0x3f, 0xd3, 0x7f, 0x76, 0x3f, 0xd1, 0x13, 0x6e, 0x3f, - 0x51, 0xae, 0xca, 0x3f, 0x74, 0xa1, 0x9c, 0x3f, 0x10, 0x0f, 0x9e, 0x3f, 0x2e, 0xc9, 0x8b, 0x3f, - 0x9e, 0x1c, 0xe7, 0x3f, 0x06, 0x7e, 0xa9, 0x3f, 0xad, 0x58, 0x8f, 0x3f, 0xcc, 0x89, 0x2f, 0x3f, - 0xb3, 0xab, 0x76, 0x3f, 0x89, 0x1e, 0xd0, 0x3f, 0x66, 0xd0, 0x97, 0x3f, 0xa7, 0x3e, 0xa3, 0x3f, - 0x58, 0xf4, 0x9f, 0x3f, 0xd8, 0x73, 0xd1, 0x3f, 0xcc, 0x2b, 0xd9, 0x3f, 0x74, 0xf4, 0x9e, 0x3f, - 0xf3, 0x78, 0x8d, 0x3f, 0xd2, 0xf1, 0x82, 0x3f, 0x47, 0x2c, 0xba, 0x3f, 0xf3, 0xb0, 0x89, 0x3f, - 0x93, 0xc6, 0xc8, 0x3f, 0x0a, 0x3c, 0x00, 0x40, 0xb1, 0x8d, 0x6d, 0x3f, 0x9c, 0x6f, 0xad, 0x3f, - 0xaf, 0x1f, 0xb8, 0x3f, 0x39, 0x2e, 0x39, 0x3f, 0xb5, 0xef, 0x73, 0x3f, 0xec, 0x08, 0xb4, 0x3f, - 0xa8, 0x20, 0x58, 0xbf, 0x2d, 0x3e, 0x6c, 0xbf, 0xab, 0xd5, 0x44, 0xbf, 0xd0, 0x8c, 0x62, 0xbf, - 0x5c, 0x0b, 0x56, 0xbf, 0x08, 0x9d, 0x46, 0xbf, 0xbc, 0xa1, 0x56, 0xbf, 0xd1, 0x72, 0x6f, 0xbf, - 0x10, 0xc5, 0x77, 0xbf, 0xe6, 0xf5, 0x90, 0xbf, 0xf8, 0x3c, 0x76, 0xbf, 0xc3, 0x3a, 0x3b, 0xbf, - 0x8e, 0x4c, 0x75, 0xbf, 0xc0, 0x94, 0x71, 0xbf, 0x40, 0x7e, 0x46, 0xbf, 0x9a, 0x67, 0x86, 0xbf, - 0x32, 0xcd, 0x45, 0xbf, 0xf1, 0xe1, 0x64, 0xbf, 0x4d, 0xc3, 0x77, 0xbf, 0xa0, 0x77, 0x5b, 0xbf, - 0xff, 0x35, 0x87, 0xbf, 0x76, 0xfb, 0x19, 0xbf, 0x4c, 0x89, 0x84, 0xbf, 0xb2, 0x47, 0x4a, 0xbf, - 0x91, 0x84, 0x73, 0xbf, 0x06, 0x90, 0x5c, 0xbf, 0x96, 0xf7, 0x64, 0xbf, 0xb8, 0x69, 0x8b, 0xbf, - 0x8f, 0x72, 0x89, 0xbf, 0xc6, 0x35, 0x2a, 0xbf, 0x4b, 0x9c, 0x4e, 0xbf, 0x52, 0x8a, 0x53, 0xbf, - 0xc0, 0x12, 0xcd, 0xbe, 0x6c, 0x28, 0x9a, 0xbe, 0x20, 0x4d, 0xba, 0xbd, 0x18, 0x85, 0xe2, 0xbd, - 
0x67, 0xd1, 0x73, 0xbe, 0x00, 0xc4, 0x9f, 0xbe, 0x7c, 0x70, 0x52, 0xbe, 0xa7, 0x6d, 0x50, 0xbe, - 0xe2, 0xaf, 0xea, 0xbe, 0x3c, 0x82, 0x6c, 0xbe, 0xf1, 0x27, 0x6f, 0xbe, 0xa8, 0x4a, 0x40, 0x3d, - 0x94, 0x65, 0x8d, 0xbd, 0xee, 0xcc, 0xe9, 0xbe, 0xcb, 0xcf, 0x01, 0xbe, 0x54, 0x4e, 0xa9, 0xbe, - 0xb6, 0x9b, 0x8c, 0xbe, 0x00, 0x9d, 0xe6, 0xbe, 0x39, 0xa3, 0x1d, 0xbe, 0x2a, 0x00, 0xa3, 0xbd, - 0xf2, 0x27, 0x64, 0xbe, 0x5b, 0x40, 0x7a, 0xbe, 0xa6, 0x8b, 0x64, 0xbe, 0x49, 0xd9, 0x74, 0xbe, - 0x23, 0x15, 0x30, 0xbe, 0x40, 0xba, 0x1b, 0xbf, 0x98, 0x92, 0x1c, 0xbf, 0x13, 0xf2, 0xa1, 0xbe, - 0xf7, 0x18, 0xda, 0xbe, 0x03, 0x5d, 0x23, 0xbe, 0x24, 0x13, 0xad, 0xbe, 0x41, 0x8b, 0xaa, 0xbe, - 0x7c, 0x7b, 0xcc, 0x3d, 0xf3, 0x9f, 0xc8, 0x3d, 0x92, 0xc8, 0xb3, 0x3d, 0xe3, 0xb2, 0xdb, 0x3d, - 0x78, 0x23, 0xad, 0x3d, 0xbb, 0xc0, 0xba, 0x3d, 0xd4, 0x4d, 0xc2, 0x3d, 0xea, 0xa0, 0xe9, 0x3d, - 0x62, 0x00, 0xdd, 0x3d, 0x84, 0x94, 0x0c, 0x3e, 0x32, 0xf3, 0xf2, 0x3d, 0xd6, 0x0e, 0xad, 0x3d, - 0x5b, 0xa6, 0xeb, 0x3d, 0x11, 0xa9, 0xe1, 0x3d, 0x99, 0x34, 0xaa, 0x3d, 0x2e, 0xaf, 0x06, 0x3e, - 0xa8, 0x2c, 0xb4, 0x3d, 0xe2, 0x12, 0xd1, 0x3d, 0x30, 0x41, 0xc5, 0x3d, 0x1e, 0xbc, 0xba, 0x3d, - 0x57, 0xef, 0x07, 0x3e, 0x7c, 0xf7, 0x8b, 0x3d, 0x45, 0xa6, 0xf1, 0x3d, 0x1e, 0xc9, 0xc1, 0x3d, - 0xd2, 0xbf, 0xca, 0x3d, 0xc1, 0x2e, 0xbe, 0x3d, 0x61, 0x05, 0x09, 0x3e, 0x93, 0x3d, 0x09, 0x3e, - 0xa7, 0xa5, 0x09, 0x3e, 0xfa, 0x34, 0xab, 0x3d, 0x76, 0x7f, 0xd9, 0x3d, 0x48, 0xf0, 0xbf, 0x3d, - 0x20, 0x38, 0x63, 0x3d, 0x5a, 0x96, 0x12, 0x3d, 0x98, 0xc8, 0xa3, 0x3c, 0x5c, 0x71, 0xf4, 0x3c, - 0xa5, 0xaa, 0xca, 0x3c, 0x69, 0x89, 0x37, 0x3d, 0x7b, 0x5f, 0x01, 0x3d, 0xc9, 0x82, 0x27, 0x3d, - 0x2e, 0x52, 0x69, 0x3d, 0xe4, 0x0c, 0x41, 0x3d, 0xd2, 0x94, 0x3b, 0x3d, 0x48, 0x8e, 0xda, 0x3b, - 0x7c, 0x33, 0xd6, 0x3c, 0xe0, 0xd4, 0x7b, 0x3d, 0x3c, 0x7e, 0x95, 0x3c, 0x43, 0x16, 0x74, 0x3d, - 0x5f, 0x5a, 0x1c, 0x3d, 0x0e, 0x6d, 0x6c, 0x3d, 0x20, 0x93, 0x74, 0x3c, 0x42, 0xb0, 0x5b, 0x3c, - 0xc1, 0x53, 0x4a, 0x3d, 0x93, 0xa9, 
0x05, 0x3d, 0xd6, 0x99, 0x16, 0x3d, 0xd7, 0xdc, 0x21, 0x3d, - 0x2d, 0x9b, 0xac, 0x3c, 0x89, 0x50, 0x8a, 0x3d, 0x5e, 0x20, 0xd8, 0x3d, 0xa7, 0x65, 0x68, 0x3d, - 0xff, 0x9d, 0x8e, 0x3d, 0x4f, 0x6c, 0x07, 0x3d, 0xf6, 0xe0, 0x72, 0x3d, 0x0c, 0xd9, 0x35, 0x3d, - 0xf9, 0x0d, 0x4d, 0x3d, 0x34, 0x07, 0x70, 0x3d, 0x28, 0x2d, 0x33, 0x3d, 0x1e, 0xc1, 0x41, 0x3d, - 0x0c, 0x55, 0x60, 0x3d, 0x54, 0xba, 0x3a, 0x3d, 0x1e, 0x8e, 0x4a, 0x3d, 0xd6, 0xcc, 0x4f, 0x3d, - 0xe9, 0x73, 0x78, 0x3d, 0xe9, 0x85, 0x7c, 0x3d, 0x8b, 0x3c, 0x54, 0x3d, 0x34, 0x8d, 0x21, 0x3d, - 0x61, 0x6c, 0x51, 0x3d, 0xc7, 0x90, 0x68, 0x3d, 0xa4, 0x92, 0x41, 0x3d, 0x3d, 0x6b, 0x67, 0x3d, - 0xdd, 0xed, 0x3d, 0x3d, 0xea, 0xf9, 0x61, 0x3d, 0xdb, 0x4c, 0x80, 0x3d, 0x9d, 0x61, 0x54, 0x3d, - 0x1c, 0x4c, 0x62, 0x3d, 0xc4, 0xb6, 0x15, 0x3d, 0x98, 0xe8, 0x76, 0x3d, 0x98, 0x8f, 0x36, 0x3d, - 0x0e, 0x63, 0x74, 0x3d, 0xbe, 0xf9, 0x6d, 0x3d, 0xd1, 0x69, 0x29, 0x3d, 0xa6, 0x99, 0x73, 0x3d, - 0x9c, 0x53, 0x71, 0x3d, 0xe0, 0x56, 0x0f, 0x3d, 0x62, 0xb5, 0x2b, 0x3d, 0x6d, 0xe3, 0x4d, 0x3d, - 0x19, 0x5d, 0xe4, 0x3c, 0xdc, 0x64, 0xe1, 0x3c, 0xf4, 0x5a, 0x21, 0x3c, 0x48, 0xaa, 0x0c, 0x3c, - 0x7c, 0xb8, 0xcb, 0x3c, 0x9c, 0x7c, 0xb7, 0x3c, 0x0e, 0xe0, 0x91, 0x3c, 0x42, 0xb5, 0x67, 0x3c, - 0x9d, 0x1c, 0x10, 0x3d, 0xa6, 0xc6, 0x87, 0x3c, 0x09, 0x4f, 0x7c, 0x3c, 0xa0, 0xd0, 0x09, 0x3a, - 0x28, 0x97, 0xda, 0x3b, 0x26, 0xd2, 0x04, 0x3d, 0x66, 0xf6, 0x72, 0x3c, 0x04, 0x30, 0xa8, 0x3c, - 0xc8, 0xdc, 0xaf, 0x3c, 0x90, 0x1d, 0x07, 0x3d, 0xd4, 0xd5, 0xb3, 0x3c, 0x1c, 0x6e, 0x54, 0x3c, - 0x34, 0x5d, 0x5e, 0x3c, 0x22, 0x28, 0x99, 0x3c, 0x7c, 0x1e, 0xa1, 0x3c, 0xca, 0x69, 0x8b, 0x3c, - 0x0f, 0x94, 0xa9, 0x3c, 0x86, 0x1b, 0x3a, 0x3d, 0xda, 0xfe, 0xde, 0x3c, 0xa8, 0xa2, 0xaa, 0x3c, - 0x9a, 0x4c, 0xd9, 0x3c, 0x3a, 0x60, 0x1f, 0x3c, 0xd0, 0x4c, 0x97, 0x3c, 0x56, 0xbf, 0xd1, 0x3c, - 0x50, 0xb6, 0xb2, 0x3f, 0x8a, 0x4d, 0x8b, 0x3f, 0x9e, 0xc6, 0x78, 0x3f, 0x3a, 0xaf, 0xa7, 0x3f, - 0xb5, 0x62, 0x55, 0x3f, 0x75, 0xad, 0x9c, 0x3f, 0x86, 0x96, 0x8e, 0x3f, 
0xd4, 0x36, 0xbc, 0x3f, - 0x02, 0xa4, 0xb3, 0x3f, 0xe0, 0x2f, 0xdf, 0x3f, 0x25, 0xef, 0xc8, 0x3f, 0xec, 0xaf, 0x5a, 0x3f, - 0x8e, 0x82, 0xac, 0x3f, 0x14, 0x2c, 0xc3, 0x3f, 0x25, 0x00, 0x58, 0x3f, 0x82, 0x66, 0xea, 0x3f, - 0x16, 0xaa, 0x8d, 0x3f, 0xa7, 0xc2, 0xb1, 0x3f, 0xa2, 0xcb, 0x53, 0x3f, 0x4c, 0xbc, 0x5d, 0x3f, - 0xc7, 0xeb, 0xe2, 0x3f, 0x58, 0x56, 0x61, 0x3f, 0xec, 0x4f, 0xb0, 0x3f, 0xd0, 0xbc, 0x9e, 0x3f, - 0x04, 0x69, 0x75, 0x3f, 0x7b, 0xdb, 0xa6, 0x3f, 0x4e, 0x7a, 0x1d, 0x40, 0x6c, 0x56, 0xe7, 0x3f, - 0x1e, 0xd0, 0xf7, 0x3f, 0x8c, 0xd7, 0x90, 0x3f, 0xbe, 0xd2, 0xcf, 0x3f, 0x06, 0xe1, 0x99, 0x3f, - 0x50, 0xfd, 0xeb, 0xbc, 0x3f, 0x1b, 0xb3, 0x3e, 0xc8, 0x7c, 0xe3, 0x3e, 0x86, 0xb3, 0x3a, 0x3f, - 0x72, 0xa0, 0x5d, 0x3f, 0x42, 0xe7, 0x23, 0x3f, 0x97, 0xf6, 0xb1, 0x3e, 0xe0, 0x6d, 0xdc, 0x3c, - 0xd8, 0x6b, 0x6c, 0x3e, 0xcb, 0x21, 0xf9, 0x3e, 0x64, 0x64, 0x03, 0x3f, 0x22, 0xfb, 0x4b, 0x3e, - 0xed, 0x96, 0x94, 0x3d, 0xf6, 0xfe, 0x55, 0x3e, 0x69, 0x78, 0x80, 0x3f, 0xbc, 0x6a, 0x07, 0x3f, - 0xe0, 0x5c, 0xde, 0x3b, 0x30, 0x13, 0xe5, 0x3e, 0xbe, 0x44, 0x2b, 0x3f, 0x6a, 0x28, 0x04, 0x3f, - 0x70, 0x30, 0x2f, 0x3f, 0xff, 0x5a, 0x81, 0x3f, 0xb7, 0xff, 0x06, 0x3f, 0x60, 0xfa, 0xd5, 0xbb, - 0x3c, 0x15, 0xbd, 0x3d, 0xed, 0x6f, 0x33, 0x3f, 0x6a, 0xef, 0x32, 0xbe, 0x74, 0x09, 0x6b, 0x3f, - 0x4e, 0xb1, 0x28, 0x3f, 0x14, 0x6c, 0xf3, 0x3e, 0x57, 0xec, 0x31, 0x3e, 0xac, 0xa2, 0x63, 0x3f, - 0x56, 0x08, 0xfe, 0xbd, 0x12, 0xc3, 0x59, 0xbe, 0xf4, 0xcf, 0xc3, 0xbe, 0x06, 0x74, 0xb2, 0xbe, - 0x8c, 0xdc, 0xe0, 0xbe, 0x8a, 0x6b, 0xa9, 0xbe, 0x8a, 0x35, 0xdf, 0xbd, 0x46, 0x6f, 0x8c, 0xbe, - 0x56, 0x37, 0xf3, 0xbd, 0xff, 0xba, 0xc9, 0xbe, 0xf8, 0xd1, 0xba, 0xbe, 0x00, 0x7d, 0x0a, 0xbe, - 0xbd, 0x06, 0x57, 0xbe, 0x29, 0xb5, 0x2a, 0xbe, 0xda, 0x91, 0x96, 0xbe, 0x8f, 0x19, 0x90, 0xbe, - 0x00, 0x40, 0x84, 0xbe, 0xcc, 0x1f, 0x26, 0xbe, 0x5d, 0xba, 0xbd, 0xbe, 0x9c, 0x29, 0xdc, 0xbe, - 0x97, 0x9f, 0xc0, 0xbe, 0x64, 0x6c, 0xb0, 0xbe, 0x8b, 0x82, 0xcd, 0xbe, 0x19, 0xc3, 0x83, 0x3d, - 0xff, 0xc2, 
0x83, 0xbe, 0xce, 0x56, 0xd0, 0xbe, 0x6e, 0x5c, 0x67, 0xbd, 0x72, 0xe5, 0xc5, 0xbe, - 0x35, 0xda, 0x11, 0xbf, 0x3e, 0xaa, 0xa4, 0xbe, 0x00, 0x36, 0xb5, 0xbe, 0x03, 0x6c, 0x04, 0xbf, - 0x02, 0xdf, 0x04, 0xbe, 0xb7, 0xac, 0x3e, 0x3d, 0x39, 0x4e, 0xbb, 0xbd, 0x9a, 0xc9, 0x41, 0xbe, - 0xa8, 0x83, 0x25, 0xbe, 0x75, 0x34, 0xbd, 0xbd, 0xf4, 0xdb, 0xa0, 0x3d, 0x3c, 0x79, 0xcc, 0x3c, - 0xbe, 0xad, 0xa3, 0xbe, 0x3b, 0xc2, 0xb1, 0xbd, 0x2f, 0xed, 0xcc, 0xbd, 0xee, 0x8c, 0xf0, 0x3d, - 0x74, 0x80, 0x5a, 0x3d, 0xf2, 0x58, 0x1d, 0xbe, 0x4a, 0xce, 0x8f, 0xbe, 0xee, 0xf1, 0x9b, 0xbc, - 0xda, 0x2a, 0x5d, 0xbd, 0xab, 0x36, 0x3b, 0xbd, 0xf4, 0x96, 0x2c, 0xbe, 0xec, 0x70, 0x9d, 0xbe, - 0x30, 0x8a, 0x65, 0xbe, 0x43, 0x0a, 0x05, 0xbe, 0x8c, 0x02, 0x04, 0xbe, 0xaf, 0x45, 0x85, 0xbc, - 0x8b, 0xd0, 0x31, 0x3e, 0x88, 0x43, 0x21, 0xbe, 0xfe, 0x9d, 0x29, 0xbe, 0x26, 0x45, 0x5c, 0xbe, - 0x3e, 0xe5, 0x38, 0xbe, 0xec, 0xc2, 0xd8, 0xbe, 0x23, 0x37, 0xc9, 0xbd, 0x59, 0xba, 0x41, 0xbe, - 0x2e, 0x6f, 0xd5, 0x3c, 0x38, 0xb9, 0x92, 0x3c, 0xc3, 0x0a, 0x41, 0x3d, 0xce, 0x53, 0x19, 0x3d, - 0x5b, 0xbc, 0x38, 0x3d, 0x39, 0xcf, 0x06, 0x3d, 0xa0, 0xe0, 0x5a, 0x3a, 0x67, 0xbc, 0x1f, 0x3d, - 0xb4, 0xd3, 0xd5, 0x3c, 0xcf, 0x6a, 0x41, 0x3d, 0xdf, 0x99, 0x2e, 0x3d, 0x3b, 0xb9, 0xfe, 0x3b, - 0x96, 0x5e, 0xd8, 0x3c, 0xfc, 0xb8, 0xd2, 0x3c, 0x88, 0x57, 0xca, 0x3c, 0x30, 0x31, 0xd1, 0x3c, - 0xf4, 0xd4, 0x28, 0x3d, 0x53, 0xcb, 0x3f, 0x3c, 0x2a, 0xd9, 0x2a, 0x3d, 0xf9, 0x57, 0x80, 0x3d, - 0x2e, 0x9f, 0x37, 0x3d, 0x84, 0xac, 0xc8, 0x3c, 0xb7, 0x03, 0x49, 0x3d, 0x94, 0x38, 0x0d, 0xbc, - 0x26, 0xe4, 0xdb, 0x3c, 0x50, 0x8f, 0x3a, 0x3d, 0xa3, 0xf8, 0xb8, 0x3c, 0x32, 0x2d, 0x1c, 0x3d, - 0x18, 0x87, 0x94, 0x3d, 0xd6, 0x63, 0x5b, 0x3d, 0x01, 0xda, 0x55, 0x3d, 0x12, 0x27, 0x6b, 0x3d, - 0x40, 0x1e, 0xda, 0x3c, 0xf9, 0x3d, 0x03, 0xbc, 0xe2, 0xb4, 0x93, 0x3c, 0xd9, 0x28, 0xb0, 0x3c, - 0xc2, 0x1a, 0x8e, 0x3c, 0x21, 0x75, 0x14, 0x3c, 0x94, 0xc9, 0x92, 0xbc, 0x7b, 0xa8, 0x14, 0x3c, - 0x94, 0x36, 0x3d, 0x3d, 0x51, 0x2b, 0x87, 0x3c, 
0xac, 0x42, 0x84, 0x3c, 0x76, 0x54, 0x8f, 0xbc, - 0x00, 0xe7, 0xa5, 0x38, 0xd0, 0x08, 0xc8, 0x3c, 0x0a, 0x85, 0xbf, 0x3c, 0x40, 0xbd, 0x38, 0xba, - 0x14, 0x4c, 0xaa, 0x3c, 0x40, 0x82, 0x61, 0x38, 0x1c, 0x34, 0xb0, 0x3c, 0x98, 0x82, 0x4e, 0x3d, - 0x91, 0xad, 0xf2, 0x3c, 0xcc, 0xa0, 0x46, 0x3b, 0x32, 0xd2, 0xb2, 0x3c, 0x98, 0x82, 0x12, 0xba, - 0xe4, 0x2d, 0x85, 0xbc, 0xc3, 0xc9, 0xa8, 0x3c, 0x46, 0x32, 0x09, 0x3d, 0x64, 0xef, 0xab, 0x3c, - 0x59, 0xa6, 0x09, 0x3d, 0x97, 0x88, 0x82, 0x3d, 0x76, 0x40, 0xda, 0x3c, 0x96, 0x89, 0xc9, 0x3c, - 0xaf, 0x13, 0x27, 0x3b, 0xe1, 0xb6, 0x6b, 0x3c, 0x12, 0xc9, 0xa8, 0x3c, 0x66, 0x44, 0xbb, 0x3c, - 0x01, 0x91, 0xec, 0x3c, 0x5f, 0x9b, 0xb3, 0x3c, 0x60, 0xf6, 0x31, 0x3c, 0x28, 0xfb, 0x34, 0x3c, - 0xfe, 0x13, 0x8d, 0x3b, 0xe6, 0x95, 0xb2, 0x3c, 0xee, 0xeb, 0xab, 0x3c, 0xfe, 0x8a, 0x21, 0x3c, - 0xb9, 0xc0, 0x1f, 0x3c, 0x4c, 0x05, 0x00, 0x3c, 0xa0, 0x9d, 0xc6, 0x3c, 0xa3, 0xbb, 0x9c, 0x3c, - 0xa1, 0x7e, 0x15, 0x3c, 0xb7, 0x7c, 0x54, 0x3c, 0xc9, 0xeb, 0xbb, 0x3c, 0xf0, 0x17, 0xab, 0x3c, - 0xe7, 0x0d, 0xba, 0x3c, 0x78, 0x87, 0xe6, 0x3c, 0xb8, 0x1a, 0xb6, 0x3c, 0x08, 0x93, 0x32, 0xbb, - 0x57, 0xe3, 0x5a, 0x3c, 0xb1, 0x93, 0xcc, 0x3c, 0xc7, 0xbd, 0x3b, 0xbb, 0x55, 0x10, 0xde, 0x3c, - 0xb0, 0x80, 0xf6, 0x3c, 0xf6, 0xb7, 0x70, 0x3c, 0x1c, 0xc7, 0x74, 0x3c, 0x34, 0x78, 0x02, 0x3d, - 0xf9, 0xcf, 0x39, 0x3b, 0x69, 0xf5, 0xba, 0x3a, 0x02, 0x63, 0xc6, 0x3b, 0x0c, 0x0a, 0x74, 0x3c, - 0x0e, 0xc4, 0x75, 0x3c, 0xde, 0xd2, 0x23, 0x3c, 0x35, 0xae, 0xc2, 0x3a, 0xd0, 0x52, 0x80, 0xbb, - 0x73, 0x23, 0x6b, 0x3c, 0xfe, 0x02, 0xd3, 0x3b, 0x92, 0xcd, 0xfd, 0x3b, 0x1b, 0xdf, 0x35, 0xbb, - 0xd6, 0xdf, 0x5f, 0xbb, 0x40, 0xaa, 0xea, 0x3b, 0x60, 0x34, 0xc1, 0x3c, 0x58, 0x00, 0xc5, 0x3b, - 0x98, 0xf9, 0x8e, 0xba, 0x9e, 0x0f, 0xea, 0x3b, 0x5b, 0x59, 0x52, 0x3c, 0x2b, 0xd5, 0x71, 0x3c, - 0x02, 0x8b, 0x77, 0x3c, 0x44, 0x9b, 0x8e, 0x3c, 0x34, 0x00, 0x0d, 0x3c, 0x98, 0x1b, 0xab, 0x3a, - 0xb3, 0x2e, 0x06, 0xbc, 0x85, 0xd2, 0x4c, 0x3c, 0xdc, 0xf1, 0x29, 0x3b, 0xd5, 0xda, 
0x97, 0x3c, - 0xb4, 0x31, 0x2e, 0x3c, 0x53, 0x09, 0xa2, 0x3c, 0x43, 0x4e, 0x0d, 0x3b, 0x42, 0x1c, 0x7d, 0x3c, - 0xb3, 0x8f, 0x19, 0x3f, 0x20, 0xfa, 0xc3, 0x3c, 0x8f, 0x39, 0x1f, 0x3f, 0xce, 0x20, 0xf6, 0x3e, - 0x76, 0xe3, 0xf8, 0x3e, 0xf4, 0xd0, 0x9f, 0x3e, 0x32, 0xe4, 0x8e, 0xbe, 0x3a, 0xb9, 0x08, 0x3f, - 0x0f, 0x70, 0x41, 0x3f, 0x1a, 0x60, 0x18, 0x3f, 0x88, 0x6a, 0x08, 0x3f, 0x70, 0x00, 0x23, 0xbe, - 0xb9, 0x4d, 0x8f, 0x3e, 0x32, 0x08, 0x03, 0x3f, 0xad, 0x76, 0x94, 0x3e, 0x1e, 0xa4, 0x16, 0x3e, - 0x4c, 0x25, 0x32, 0x3f, 0x68, 0x7f, 0xa7, 0x3c, 0xd4, 0x41, 0x0b, 0x3f, 0xb7, 0x52, 0x92, 0x3f, - 0xd5, 0xa0, 0x2a, 0x3f, 0x7c, 0x9b, 0x69, 0x3d, 0x94, 0x74, 0x2a, 0x3f, 0x20, 0x20, 0xd0, 0xbd, - 0x4c, 0x9b, 0xc4, 0x3d, 0xb7, 0xa7, 0x11, 0x3f, 0xb9, 0xad, 0x2c, 0x3f, 0x25, 0xff, 0xde, 0x3e, - 0x4e, 0xbd, 0x83, 0x3f, 0x62, 0xb2, 0x99, 0x3f, 0x2c, 0xf9, 0x57, 0x3f, 0x2c, 0x49, 0x33, 0x3f, - 0x67, 0x60, 0x28, 0xbe, 0x1a, 0x06, 0x60, 0xbe, 0x57, 0x61, 0x1c, 0xbe, 0xea, 0x00, 0x2c, 0xbe, - 0x9e, 0x9a, 0x2e, 0xbe, 0x52, 0xd7, 0x23, 0xbe, 0xd0, 0x86, 0x1b, 0xbe, 0x4c, 0x77, 0x31, 0xbe, - 0x54, 0x25, 0x4a, 0xbe, 0x01, 0x73, 0x03, 0xbe, 0xb2, 0x5c, 0x2a, 0xbe, 0xa2, 0x9a, 0xce, 0xbd, - 0x69, 0x5a, 0xb0, 0xbd, 0xca, 0x2a, 0x24, 0xbe, 0x47, 0x7f, 0x37, 0xbe, 0xc1, 0x43, 0x0a, 0xbe, - 0x1c, 0xa7, 0xf7, 0xbd, 0xbe, 0x31, 0x43, 0xbe, 0x66, 0x16, 0x63, 0xbe, 0xd0, 0xee, 0x1a, 0xbe, - 0x80, 0x5b, 0x0f, 0xbe, 0xf8, 0xed, 0xd6, 0xbd, 0x1d, 0xdd, 0x4d, 0xbe, 0xd6, 0xe2, 0xec, 0xbd, - 0x28, 0x92, 0x0c, 0xbe, 0xa4, 0xf0, 0x96, 0xbe, 0x54, 0xd2, 0xb6, 0xbd, 0x85, 0xb2, 0x1a, 0xbe, - 0xf4, 0x4c, 0x1c, 0xbe, 0xc1, 0xce, 0x83, 0xbd, 0x9c, 0x39, 0xc1, 0xbd, 0x06, 0xc1, 0x2f, 0xbe, - 0xd4, 0x18, 0xbc, 0x3d, 0x56, 0xf9, 0xdf, 0x3d, 0xd4, 0xa3, 0xd2, 0x3d, 0x94, 0xc0, 0xe6, 0x3d, - 0xa4, 0x54, 0xcc, 0x3d, 0x70, 0x69, 0xe1, 0x3d, 0x06, 0x86, 0xd2, 0x3d, 0xfb, 0x74, 0xf4, 0x3d, - 0x9d, 0x18, 0xe9, 0x3d, 0xc1, 0x1f, 0x00, 0x3e, 0x79, 0x15, 0xf5, 0x3d, 0xd2, 0x44, 0xcc, 0x3d, - 0x52, 0xa9, 0xd5, 0x3d, 
0x72, 0x93, 0xeb, 0x3d, 0x92, 0x7a, 0xb8, 0x3d, 0xb8, 0x2a, 0xde, 0x3d, - 0x7a, 0x98, 0xc0, 0x3d, 0x5a, 0xa9, 0xcc, 0x3d, 0x16, 0x29, 0xf1, 0x3d, 0x4d, 0x57, 0xab, 0x3d, - 0xd0, 0xa1, 0x03, 0x3e, 0x73, 0x8a, 0x8d, 0x3d, 0x0b, 0x72, 0xea, 0x3d, 0x3a, 0x2a, 0xb3, 0x3d, - 0xe7, 0x71, 0xac, 0x3d, 0xec, 0xeb, 0xd3, 0x3d, 0x84, 0x8b, 0xae, 0x3d, 0x5f, 0xec, 0x0b, 0x3e, - 0x83, 0x9f, 0x0b, 0x3e, 0xaf, 0xdf, 0x6e, 0x3d, 0xba, 0x5b, 0x9d, 0x3d, 0x40, 0x54, 0xbd, 0x3d, - 0x8f, 0x1a, 0x69, 0x3d, 0x30, 0x01, 0x33, 0x3d, 0xc7, 0xca, 0x94, 0x3c, 0xc6, 0x7b, 0x82, 0x3c, - 0x1e, 0x00, 0xcd, 0x3c, 0xc3, 0xa9, 0x1d, 0x3d, 0x00, 0xe4, 0xf9, 0x3c, 0x85, 0xa5, 0x03, 0x3d, - 0x64, 0xe4, 0x4a, 0x3d, 0x92, 0x32, 0xf6, 0x3c, 0x78, 0xc3, 0x98, 0x3c, 0x90, 0x9c, 0x87, 0xbb, - 0xb0, 0x69, 0x16, 0x3b, 0x74, 0x89, 0x14, 0x3d, 0x9a, 0xcf, 0xb6, 0x3c, 0xea, 0xb3, 0x05, 0x3d, - 0xda, 0x7e, 0xa3, 0x3c, 0xa6, 0x5f, 0x3b, 0x3d, 0xd2, 0x80, 0x9d, 0x3c, 0x6f, 0xc8, 0x51, 0x3c, - 0x4c, 0x25, 0xc6, 0x3c, 0x01, 0x23, 0xc8, 0x3c, 0xd4, 0x8a, 0x12, 0x3d, 0x1f, 0x84, 0xee, 0x3c, - 0x52, 0xcd, 0xdc, 0x3b, 0x5a, 0x97, 0xc4, 0x3d, 0xe9, 0xaf, 0x99, 0x3d, 0x8c, 0xd7, 0x2c, 0x3d, - 0xb1, 0xc5, 0x2d, 0x3d, 0xee, 0xed, 0xd2, 0x3c, 0x24, 0xba, 0xc3, 0x3c, 0x6a, 0xc4, 0x47, 0x3d, - 0x4a, 0x89, 0x36, 0xbc, 0x1e, 0x17, 0x39, 0xbc, 0x8e, 0x3e, 0x38, 0xbc, 0x41, 0x37, 0x46, 0xbc, - 0x1f, 0x93, 0x2c, 0xbc, 0x1a, 0xa5, 0x56, 0xbc, 0x78, 0xab, 0x42, 0xbc, 0x76, 0x07, 0x61, 0xbc, - 0x93, 0x7b, 0x54, 0xbc, 0x6e, 0x01, 0x83, 0xbc, 0x6a, 0x84, 0x5a, 0xbc, 0x4e, 0xa3, 0x3a, 0xbc, - 0x10, 0x3f, 0x53, 0xbc, 0xbe, 0xd7, 0x60, 0xbc, 0x5d, 0xca, 0x0d, 0xbc, 0x84, 0x99, 0x5b, 0xbc, - 0x6e, 0x6c, 0x35, 0xbc, 0x06, 0xf7, 0x32, 0xbc, 0x93, 0x07, 0x38, 0xbc, 0xaf, 0x77, 0x05, 0xbc, - 0x1c, 0xa3, 0x81, 0xbc, 0x50, 0x8c, 0x04, 0xbc, 0x5c, 0xd9, 0x48, 0xbc, 0x10, 0xae, 0x2f, 0xbc, - 0x98, 0x7b, 0x09, 0xbc, 0x8f, 0xf0, 0x2b, 0xbc, 0x2a, 0x2d, 0x5e, 0xbc, 0x88, 0xf0, 0x8f, 0xbc, - 0xaa, 0x3e, 0x8f, 0xbc, 0xbc, 0x44, 0x01, 0xbc, 0x42, 0xe0, 
0x1c, 0xbc, 0xb3, 0x69, 0x2d, 0xbc, - 0xb4, 0x99, 0xfa, 0xbb, 0xa8, 0x06, 0x9b, 0xbb, 0x9a, 0xf6, 0x35, 0xbb, 0xc8, 0xd7, 0x1e, 0xbb, - 0xea, 0x70, 0x48, 0xbb, 0xea, 0xc2, 0xc2, 0xbb, 0x4a, 0x75, 0x98, 0xbb, 0x90, 0x3e, 0xa4, 0xbb, - 0x48, 0x53, 0xd6, 0xbb, 0x25, 0x81, 0xd4, 0xbb, 0xe9, 0x02, 0x54, 0xbb, 0xbc, 0x89, 0x83, 0xba, - 0x60, 0x4f, 0x30, 0xbb, 0xed, 0x97, 0xbf, 0xbb, 0x3a, 0x28, 0xf6, 0xba, 0xce, 0xb1, 0xbe, 0xbb, - 0x47, 0x2f, 0x70, 0xbb, 0xcc, 0x5e, 0xb4, 0xbb, 0x8c, 0xd0, 0xb4, 0xba, 0xe4, 0xfb, 0x8a, 0xba, - 0x8e, 0x95, 0xb0, 0xbb, 0x04, 0x5f, 0x6d, 0xbb, 0xb4, 0xd1, 0x8f, 0xbb, 0x36, 0x1a, 0xa0, 0xbb, - 0x44, 0x12, 0x29, 0xba, 0xe7, 0xac, 0x1f, 0xbc, 0x7a, 0x7d, 0x4d, 0xbc, 0x7c, 0x23, 0x05, 0xbc, - 0xfb, 0x99, 0x04, 0xbc, 0x4c, 0xcf, 0x97, 0xbb, 0x02, 0x45, 0x8c, 0xbb, 0x88, 0xb6, 0xcb, 0xbb, - 0x41, 0x59, 0xb3, 0xbb, 0x9a, 0xf6, 0xeb, 0xbb, 0xe0, 0x58, 0xca, 0xbb, 0x95, 0xa9, 0xdf, 0xbb, - 0x71, 0x78, 0xcd, 0xbb, 0xdb, 0xdc, 0xce, 0xbb, 0x62, 0xc4, 0xc4, 0xbb, 0x76, 0x28, 0xe4, 0xbb, - 0x2c, 0x61, 0xe3, 0xbb, 0x94, 0x8b, 0xd3, 0xbb, 0x9d, 0x45, 0xe6, 0xbb, 0xd2, 0x59, 0xb3, 0xbb, - 0xc7, 0x48, 0xae, 0xbb, 0x36, 0x0c, 0xd6, 0xbb, 0x71, 0x69, 0xc7, 0xbb, 0xc7, 0xab, 0xc1, 0xbb, - 0x60, 0x6c, 0xad, 0xbb, 0xc7, 0x29, 0xd0, 0xbb, 0x9d, 0xf7, 0x00, 0xbc, 0xdf, 0x30, 0xb4, 0xbb, - 0x2f, 0x82, 0xe0, 0xbb, 0x72, 0x3b, 0x84, 0xbb, 0x9b, 0xce, 0xeb, 0xbb, 0x28, 0xbe, 0x9e, 0xbb, - 0x97, 0xad, 0xaf, 0xbb, 0x5e, 0xd6, 0xf8, 0xbb, 0xe6, 0x7b, 0x75, 0xbb, 0x09, 0x21, 0xe9, 0xbb, - 0xe3, 0x87, 0xe9, 0xbb, 0x5b, 0x87, 0x40, 0xbb, 0xc2, 0x27, 0x88, 0xbb, 0x30, 0x5b, 0xbb, 0xbb, - 0xa2, 0x39, 0x74, 0xbb, 0xd0, 0x62, 0x80, 0xbb, 0xde, 0x5f, 0xfe, 0xba, 0x8b, 0x50, 0x02, 0xbb, - 0x1a, 0x03, 0x26, 0xbb, 0x6c, 0x32, 0x33, 0xbb, 0x20, 0xa7, 0x1c, 0xbb, 0x92, 0x80, 0x2a, 0xbb, - 0x7a, 0x1e, 0x74, 0xbb, 0x27, 0x25, 0xdf, 0xba, 0xdd, 0x83, 0x01, 0xbb, 0xba, 0x7b, 0x92, 0xb9, - 0xdc, 0xb1, 0x63, 0xb9, 0xde, 0x00, 0x2a, 0xbb, 0x36, 0xc8, 0x30, 0xbb, 0x55, 0xd6, 0x0a, 0xbb, - 
0x0d, 0x2f, 0xd0, 0xba, 0x4e, 0xc4, 0x6e, 0xbb, 0x71, 0x03, 0x41, 0xbb, 0x59, 0x31, 0x00, 0xbb, - 0xf3, 0x1a, 0xd6, 0xba, 0x8b, 0x1b, 0xec, 0xba, 0x33, 0xbc, 0x55, 0xbb, 0xc9, 0x74, 0xfc, 0xba, - 0x8c, 0x0c, 0xc3, 0xba, 0xb6, 0x92, 0xec, 0xbb, 0x87, 0x1c, 0x54, 0xbb, 0xec, 0xc6, 0x1c, 0xbb, - 0x16, 0x49, 0x1f, 0xbb, 0x5b, 0x9a, 0xab, 0xba, 0x07, 0xa8, 0xc5, 0xba, 0x82, 0x99, 0x67, 0xbb, - 0x77, 0x5a, 0x2e, 0xbe, 0x11, 0xa3, 0x00, 0xbe, 0xfe, 0xdf, 0xf4, 0xbd, 0x1a, 0x95, 0xf8, 0xbd, - 0x46, 0xd9, 0xe1, 0xbd, 0xc8, 0x6e, 0x32, 0xbe, 0x18, 0x98, 0x17, 0xbe, 0x6a, 0x99, 0x2b, 0xbe, - 0xd5, 0x36, 0x2e, 0xbe, 0xaa, 0x12, 0x62, 0xbe, 0x70, 0x52, 0x14, 0xbe, 0x32, 0xee, 0xe9, 0xbd, - 0xfe, 0x7a, 0x1d, 0xbe, 0x68, 0x68, 0x38, 0xbe, 0x01, 0xea, 0x96, 0xbd, 0x82, 0xe1, 0x3a, 0xbe, - 0xa9, 0xd3, 0x0a, 0xbe, 0x94, 0x72, 0x0c, 0xbe, 0x1b, 0x08, 0xb1, 0xbd, 0x2d, 0xa7, 0x86, 0xbd, - 0x13, 0x8b, 0x50, 0xbe, 0x3a, 0x1d, 0xd8, 0xbd, 0x46, 0x51, 0x0c, 0xbe, 0x3f, 0x33, 0x16, 0xbe, - 0xd5, 0xe8, 0x89, 0xbd, 0x65, 0xb2, 0x1d, 0xbe, 0xd8, 0xec, 0x8a, 0xbe, 0x48, 0xad, 0x80, 0xbe, - 0x26, 0x99, 0x7f, 0xbe, 0xba, 0x92, 0xff, 0xbd, 0xbd, 0x76, 0x07, 0xbe, 0x3c, 0x9c, 0x15, 0xbe, - 0x10, 0xca, 0xac, 0xbc, 0x77, 0x98, 0xb2, 0xbd, 0x55, 0x8e, 0xce, 0xbd, 0xff, 0x73, 0x17, 0xbe, - 0x48, 0x38, 0xca, 0xbd, 0x22, 0x32, 0xd0, 0xbd, 0x2a, 0x58, 0x81, 0xbd, 0x0b, 0xc9, 0x88, 0xbd, - 0x16, 0x98, 0x23, 0xbd, 0x2a, 0xba, 0x36, 0xbd, 0xd3, 0x7b, 0xd8, 0xbd, 0x70, 0xf0, 0x55, 0xbd, - 0x80, 0xfb, 0x43, 0x3b, 0x04, 0x92, 0x98, 0xbc, 0x7c, 0xd2, 0x24, 0xbe, 0xb4, 0x57, 0x77, 0xbd, - 0xb0, 0x4e, 0x5c, 0x3b, 0x04, 0xe5, 0x92, 0xbd, 0x4f, 0xb7, 0xf3, 0xbd, 0xa1, 0x43, 0xa1, 0xbd, - 0xa2, 0x82, 0xc8, 0xbd, 0x0c, 0x63, 0xd4, 0xbd, 0x8e, 0xd9, 0xdd, 0xbd, 0x60, 0xe2, 0xf3, 0xbb, - 0x20, 0x87, 0x29, 0x3c, 0xca, 0x7a, 0x28, 0xbe, 0x73, 0x08, 0xac, 0x3c, 0x3b, 0xed, 0xdb, 0xbd, - 0x0e, 0xf5, 0x98, 0xbd, 0x6a, 0x10, 0x22, 0xbd, 0x70, 0xa4, 0x86, 0xbc, 0x83, 0x8f, 0xf5, 0xbd, - 0x8b, 0x64, 0xa2, 0x3c, 0x69, 0x10, 
0x1d, 0x3d, 0x46, 0x7f, 0x83, 0x3d, 0x78, 0xc9, 0x71, 0x3d, - 0x5a, 0x14, 0x76, 0x3d, 0x07, 0x4a, 0x86, 0x3d, 0x65, 0x8e, 0xf4, 0x3c, 0xe0, 0x24, 0x5a, 0x3d, - 0xce, 0xce, 0xec, 0x3c, 0xba, 0x01, 0x52, 0x3d, 0x92, 0xa4, 0x75, 0x3d, 0x9e, 0x5d, 0x24, 0x3d, - 0xb3, 0xb5, 0xf1, 0x3c, 0xc1, 0xe7, 0x18, 0x3d, 0x04, 0xd3, 0x2b, 0x3d, 0xc4, 0x84, 0x01, 0x3d, - 0x3e, 0x82, 0x2e, 0x3d, 0x0a, 0x1f, 0xdb, 0x3c, 0xc8, 0xae, 0x6d, 0x3d, 0x16, 0xaa, 0x27, 0x3d, - 0xbf, 0xf3, 0x77, 0x3d, 0x45, 0xd9, 0x31, 0x3d, 0xc2, 0x2e, 0x51, 0x3d, 0x3a, 0x45, 0x05, 0x3b, - 0xc0, 0xae, 0x72, 0x3c, 0x3e, 0x9d, 0x6d, 0x3d, 0x20, 0x1e, 0xbc, 0x3a, 0xc7, 0xf1, 0x87, 0x3d, - 0x18, 0x03, 0xb0, 0x3d, 0x14, 0x8a, 0xc4, 0x3c, 0xc8, 0x1b, 0x01, 0x3d, 0x98, 0x11, 0x7c, 0x3d, - 0x1e, 0x63, 0xf4, 0x3c, 0x83, 0x71, 0x12, 0x3c, 0x9e, 0x31, 0x95, 0x3c, 0xd4, 0xe7, 0xc2, 0x3c, - 0x86, 0x5b, 0x8e, 0x3c, 0x1c, 0x62, 0x87, 0x3c, 0x55, 0x90, 0x09, 0x3b, 0xbf, 0x34, 0x18, 0x3c, - 0xe2, 0x15, 0x12, 0x3d, 0x14, 0x18, 0x80, 0x3c, 0x2a, 0xa3, 0xad, 0x3b, 0x5b, 0x54, 0x37, 0xbc, - 0x3e, 0xea, 0x22, 0xbc, 0xe0, 0x7b, 0xc3, 0x3b, 0x84, 0x86, 0x1a, 0x3d, 0xa4, 0xc4, 0xc1, 0x3a, - 0x10, 0xf4, 0x09, 0xbb, 0xb9, 0xd5, 0xac, 0x3b, 0x02, 0x77, 0xa9, 0x3c, 0x69, 0xcc, 0x11, 0x3d, - 0xb4, 0x40, 0xc7, 0x3c, 0x3e, 0xbc, 0x54, 0x3c, 0x2e, 0xdb, 0xd7, 0x3c, 0x72, 0xdb, 0xdf, 0x3b, - 0xd2, 0xea, 0xe3, 0xbc, 0x1c, 0x86, 0x50, 0x3d, 0xef, 0x68, 0xf9, 0x3c, 0xc1, 0x64, 0x03, 0x3d, - 0x35, 0x81, 0x92, 0x3c, 0x4c, 0x87, 0x55, 0x3d, 0x00, 0xd5, 0xb4, 0xb8, 0xbf, 0xd2, 0x0c, 0x3d, - 0x8c, 0x8c, 0x76, 0xbb, 0x97, 0xbd, 0x57, 0xbb, 0xce, 0x44, 0xeb, 0xbb, 0x54, 0x93, 0xa7, 0xbb, - 0x94, 0xea, 0xd7, 0xbb, 0x38, 0x55, 0xee, 0xbb, 0xda, 0x60, 0x22, 0xbb, 0x92, 0x0a, 0xcc, 0xbb, - 0xd0, 0xe4, 0x9c, 0xbb, 0x51, 0xe6, 0xe4, 0xbb, 0xce, 0xfe, 0xbb, 0xbb, 0xf8, 0xb5, 0x73, 0xbb, - 0x30, 0x00, 0x84, 0xbb, 0xf8, 0xea, 0xac, 0xbb, 0xc2, 0x60, 0x38, 0xbb, 0xd8, 0xc5, 0x37, 0xbb, - 0x2b, 0xa2, 0xd1, 0xbb, 0x36, 0x15, 0xf7, 0xba, 0x60, 0x21, 0xbd, 0xbb, 
0x55, 0x82, 0xad, 0xbb, - 0x4d, 0x74, 0xe6, 0xbb, 0x83, 0xd6, 0x72, 0xbb, 0xda, 0xde, 0xaf, 0xbb, 0x4a, 0xf0, 0xd0, 0xb9, - 0xbd, 0x17, 0x9a, 0xba, 0xc2, 0xd5, 0xbc, 0xbb, 0x8e, 0xbf, 0x1f, 0xbb, 0xd2, 0xc3, 0x02, 0xbc, - 0x8e, 0x13, 0x39, 0xbc, 0x31, 0x1e, 0xa0, 0xbb, 0x2e, 0xd6, 0x88, 0xbb, 0x9a, 0xd8, 0xe3, 0xbb, - 0x81, 0x12, 0x9c, 0xbb, 0x80, 0x1f, 0xb8, 0xb9, 0x06, 0x1a, 0x29, 0xbb, 0xc2, 0x6e, 0xd0, 0xba, - 0xcf, 0xfd, 0x17, 0xbb, 0x3c, 0x3f, 0x1b, 0xbb, 0x12, 0x15, 0x9c, 0x39, 0x7a, 0xde, 0xef, 0xba, - 0x00, 0x0a, 0xb3, 0xbb, 0x87, 0x43, 0x60, 0xbb, 0x30, 0x84, 0x8d, 0xb9, 0x50, 0x36, 0xb9, 0x3a, - 0x00, 0xf7, 0xae, 0xb8, 0xb3, 0x48, 0x0c, 0xbb, 0x27, 0xb3, 0x1c, 0xbb, 0xf8, 0x4b, 0x5c, 0x39, - 0xe0, 0x83, 0xfc, 0xba, 0x41, 0x82, 0x89, 0x39, 0x4e, 0x8a, 0x05, 0xbb, 0x31, 0x04, 0x9c, 0xbb, - 0x5e, 0x96, 0x5f, 0xbb, 0x1a, 0x7f, 0x2d, 0xba, 0xd4, 0xbb, 0x3d, 0xbb, 0x36, 0x3f, 0x66, 0xba, - 0x4a, 0x5c, 0x4a, 0x3b, 0x0d, 0x90, 0xa5, 0xbb, 0x66, 0xef, 0xae, 0xbb, 0x34, 0x22, 0x95, 0xbb, - 0xa3, 0x22, 0x93, 0xbb, 0x33, 0x53, 0xfc, 0xbb, 0xb0, 0x11, 0x85, 0xba, 0x86, 0xd9, 0x8a, 0xbb, - 0x4e, 0x45, 0x3d, 0xba, 0xd2, 0x2a, 0x36, 0xbb, 0x58, 0x0a, 0x7d, 0xbb, 0xd8, 0x90, 0x8f, 0xbb, - 0x35, 0x43, 0x71, 0xbb, 0xbf, 0x52, 0x81, 0xbb, 0x89, 0xa1, 0x0c, 0xbb, 0x72, 0x3a, 0x44, 0xbb, - 0x93, 0xe6, 0xaa, 0xba, 0x02, 0xec, 0x21, 0xbb, 0x5a, 0x40, 0x81, 0xbb, 0x1b, 0xd6, 0x24, 0xbb, - 0xdc, 0xc8, 0x9a, 0xba, 0xca, 0x65, 0xd6, 0xba, 0xf6, 0xff, 0x77, 0xbb, 0x49, 0x75, 0x0e, 0xbb, - 0x0a, 0x84, 0xcd, 0xba, 0x61, 0xc2, 0x0b, 0xbb, 0x22, 0xa5, 0x7f, 0xbb, 0x4e, 0x72, 0x1d, 0xbb, - 0xeb, 0xc8, 0x6b, 0xbb, 0x71, 0x58, 0x51, 0xbb, 0x6f, 0x3e, 0x5e, 0xbb, 0x60, 0xfb, 0xf0, 0xb8, - 0xad, 0x91, 0x3f, 0xba, 0xa6, 0x49, 0x8c, 0xbb, 0xf3, 0x67, 0x31, 0x3a, 0x1a, 0x26, 0x7e, 0xbb, - 0x14, 0x12, 0x8b, 0xbb, 0x90, 0xe4, 0x6d, 0xba, 0xe2, 0xf9, 0xbe, 0xba, 0xd0, 0x0a, 0x7e, 0xbb, - 0x1d, 0x3b, 0xa0, 0xba, 0xfc, 0x9b, 0xab, 0xba, 0x1b, 0xa5, 0xcc, 0xba, 0x0d, 0xaa, 0x2b, 0xbb, - 0x13, 0xaf, 
0xca, 0xba, 0xc9, 0xdf, 0xc3, 0xba, 0x2c, 0xbf, 0x46, 0xba, 0xfc, 0x9d, 0x50, 0xba, - 0xf2, 0x30, 0xd7, 0xba, 0xd2, 0x9d, 0x34, 0xba, 0x52, 0xb4, 0x9e, 0xba, 0x5e, 0x9d, 0x54, 0x38, - 0x97, 0x7f, 0x4f, 0x3a, 0xd0, 0x88, 0x8d, 0xb8, 0x28, 0x29, 0x6a, 0xbb, 0xf4, 0xbf, 0x2e, 0xba, - 0xb9, 0xfc, 0x2e, 0x3a, 0xdf, 0xc9, 0x8a, 0xba, 0xe9, 0x48, 0x05, 0xbb, 0x2a, 0xf4, 0x0b, 0xbb, - 0x9a, 0x3f, 0xea, 0xba, 0x26, 0x3a, 0xdb, 0xba, 0x7f, 0x3d, 0x0d, 0xbb, 0xe9, 0x05, 0xba, 0xb9, - 0x51, 0x9f, 0xb7, 0x3a, 0xcc, 0xa6, 0x80, 0xbb, 0x86, 0xd6, 0x4a, 0xba, 0xaa, 0xc0, 0x0d, 0xbb, - 0xc0, 0x7e, 0x5c, 0xba, 0x26, 0xae, 0x17, 0xbb, 0x04, 0x4d, 0x01, 0x39, 0xbc, 0x0b, 0x25, 0xbb, - 0xcb, 0x9d, 0xb9, 0xbd, 0xae, 0x27, 0x9d, 0xbc, 0xf1, 0x3b, 0xad, 0xbd, 0x1f, 0x7a, 0x1d, 0xbd, - 0x18, 0xb2, 0x9a, 0xbd, 0x9e, 0x1b, 0xaa, 0xbd, 0xa8, 0x20, 0x0f, 0xbc, 0x47, 0x6f, 0x97, 0xbd, - 0xe7, 0xc0, 0xd7, 0xbd, 0x68, 0xf2, 0xd8, 0xbd, 0x86, 0x90, 0x28, 0xbd, 0xfc, 0x0a, 0x53, 0xbc, - 0x5e, 0x88, 0x40, 0xbd, 0xa4, 0x5d, 0xa1, 0xbd, 0x06, 0xff, 0x95, 0xbc, 0x60, 0xcb, 0x66, 0xbc, - 0xd2, 0x81, 0xc1, 0xbd, 0x00, 0x80, 0xaa, 0xb9, 0x86, 0x00, 0x6e, 0xbd, 0xae, 0x61, 0xbf, 0xbd, - 0xfa, 0x83, 0xbe, 0xbd, 0x4a, 0x19, 0xbe, 0xbc, 0x56, 0xe6, 0x86, 0xbd, 0xa0, 0xdc, 0x4c, 0xbc, - 0x31, 0x08, 0x9b, 0x3c, 0x31, 0xf2, 0xa4, 0xbd, 0x32, 0xea, 0xbd, 0xbd, 0x1e, 0xc4, 0xe9, 0xbd, - 0x4b, 0xbd, 0x22, 0xbe, 0xd6, 0xde, 0x06, 0xbe, 0xd9, 0x60, 0x5e, 0xbd, 0x5e, 0x71, 0xc4, 0xbd, - 0xc2, 0x7f, 0x9a, 0xbd, 0xb9, 0xa9, 0xa8, 0xbd, 0xe2, 0xaf, 0x19, 0xbd, 0xb3, 0xbe, 0xf3, 0xbc, - 0x66, 0x62, 0xd4, 0xbd, 0x04, 0x7c, 0x86, 0xbd, 0xa1, 0xe0, 0x89, 0xbd, 0xc0, 0xac, 0x15, 0xbd, - 0x13, 0xc4, 0xdb, 0xbd, 0xef, 0x17, 0xba, 0xbd, 0xec, 0x99, 0x49, 0xbd, 0xba, 0x20, 0xef, 0xbc, - 0x37, 0xaf, 0x87, 0xbd, 0x29, 0xcc, 0xd7, 0xbd, 0xa6, 0xec, 0x66, 0xbd, 0x19, 0x1e, 0xa9, 0xbd, - 0xef, 0x43, 0xa4, 0xbd, 0x21, 0x46, 0xc1, 0xbd, 0x83, 0xf2, 0xb4, 0xbd, 0xd9, 0x66, 0x90, 0xbd, - 0xda, 0xcf, 0x80, 0xbd, 0x4a, 0x1b, 0x9a, 0xbd, 
0x5c, 0xbf, 0x8e, 0xbd, 0xe5, 0x3b, 0x81, 0xbd, - 0x5a, 0x30, 0xdf, 0xbd, 0x40, 0xb5, 0xb1, 0xbd, 0xf4, 0xd3, 0x6b, 0xbd, 0xb8, 0xfd, 0xb4, 0xbd, - 0xd0, 0xfd, 0xbf, 0xbd, 0x0e, 0xdc, 0x61, 0xbd, 0xb0, 0xc1, 0x7e, 0xbd, 0x80, 0x12, 0xab, 0xbd, - 0x90, 0x48, 0x4f, 0x3d, 0xbe, 0x26, 0x54, 0x3d, 0x60, 0x3e, 0x23, 0x3d, 0x1a, 0xe2, 0x42, 0x3d, - 0x74, 0x6f, 0x4b, 0x3d, 0xc6, 0x08, 0x15, 0x3d, 0xd3, 0x64, 0x35, 0x3d, 0x04, 0x98, 0x48, 0x3d, - 0x46, 0xf8, 0x5a, 0x3d, 0x37, 0xe4, 0x8e, 0x3d, 0xab, 0xd3, 0x58, 0x3d, 0x35, 0xab, 0x0b, 0x3d, - 0xd0, 0xa0, 0x6e, 0x3d, 0x2a, 0xce, 0x4f, 0x3d, 0xa9, 0xdd, 0x3b, 0x3d, 0xcd, 0x0c, 0x8a, 0x3d, - 0x12, 0xac, 0x30, 0x3d, 0x21, 0xa2, 0x57, 0x3d, 0x17, 0xc9, 0x5f, 0x3d, 0x63, 0xb7, 0x75, 0x3d, - 0xea, 0xc1, 0x72, 0x3d, 0x05, 0x45, 0x19, 0x3d, 0x40, 0x9e, 0x83, 0x3d, 0x24, 0xa5, 0x35, 0x3d, - 0x1d, 0x88, 0x8b, 0x3d, 0xce, 0xec, 0x4d, 0x3d, 0xe3, 0x7c, 0x70, 0x3d, 0x63, 0xd4, 0x71, 0x3d, - 0x3c, 0xbc, 0x74, 0x3d, 0x62, 0x68, 0x4b, 0x3d, 0x68, 0xa9, 0x68, 0x3d, 0x8a, 0xed, 0x59, 0x3d, - 0xb2, 0x50, 0x95, 0x3c, 0xe8, 0x6c, 0x3b, 0x3c, 0x34, 0x70, 0x16, 0x3b, 0xa0, 0x48, 0xe0, 0x3b, - 0xc4, 0x7c, 0x85, 0x3c, 0x0e, 0x60, 0x8b, 0x3c, 0xf2, 0xa0, 0xe1, 0x3b, 0x42, 0x88, 0xd5, 0x3b, - 0xbe, 0x47, 0xfc, 0x3c, 0x46, 0x4b, 0x45, 0x3c, 0xe0, 0x1a, 0x95, 0x3c, 0xe8, 0x5d, 0x99, 0xbb, - 0x9c, 0x0f, 0xc3, 0x3b, 0xf6, 0x04, 0x10, 0x3d, 0xae, 0xbe, 0xd8, 0x3b, 0x4f, 0xff, 0xac, 0x3c, - 0x50, 0xe0, 0xb0, 0x3c, 0x1c, 0x69, 0xe8, 0x3c, 0x28, 0xc3, 0x20, 0x3c, 0x34, 0x92, 0xd1, 0x3b, - 0xb6, 0x38, 0x81, 0x3c, 0x3e, 0x76, 0x8a, 0x3c, 0x07, 0xb3, 0x10, 0x3c, 0x1f, 0x8b, 0x4d, 0x3c, - 0x6c, 0x2f, 0x62, 0x3c, 0x28, 0x90, 0xb7, 0x3c, 0xdc, 0x9f, 0x08, 0x3d, 0xe5, 0xaf, 0x8c, 0x3c, - 0xfc, 0x4a, 0xed, 0x3c, 0x45, 0xd4, 0x29, 0x3c, 0x58, 0xbb, 0xdf, 0x3c, 0x5e, 0x2b, 0x7d, 0x3c, - 0x25, 0x34, 0xc4, 0xbb, 0xdc, 0x5e, 0xb7, 0xbb, 0x8d, 0xab, 0x9e, 0xbb, 0x20, 0x93, 0xd4, 0xbb, - 0x5b, 0x8c, 0x9d, 0xbb, 0xbb, 0xfd, 0x86, 0xbb, 0x5f, 0x6b, 0x9b, 0xbb, 0xd5, 0x79, 
0xd3, 0xbb, - 0xe8, 0x02, 0xc4, 0xbb, 0xa4, 0xb5, 0x03, 0xbc, 0xbb, 0x40, 0xec, 0xbb, 0xec, 0x0c, 0x80, 0xbb, - 0xe8, 0xbd, 0xdf, 0xbb, 0xd6, 0xe1, 0xbf, 0xbb, 0x4a, 0xb2, 0xae, 0xbb, 0x99, 0xdc, 0x09, 0xbc, - 0x0d, 0xc6, 0x9f, 0xbb, 0xa5, 0x29, 0xca, 0xbb, 0x15, 0x1d, 0xbc, 0xbb, 0xed, 0x52, 0xe4, 0xbb, - 0x1f, 0x74, 0xf8, 0xbb, 0xfc, 0x6c, 0x81, 0xbb, 0xb4, 0x38, 0xfe, 0xbb, 0x8a, 0x9a, 0xa9, 0xbb, - 0xe6, 0x75, 0xee, 0xbb, 0x88, 0x0b, 0xbd, 0xbb, 0x82, 0x2c, 0x0c, 0xbc, 0xed, 0xcd, 0xdd, 0xbb, - 0xd9, 0xbe, 0xee, 0xbb, 0xcd, 0xd7, 0xc8, 0xbb, 0xb0, 0x00, 0x00, 0xbc, 0x85, 0x97, 0xc3, 0xbb, - 0xbe, 0x34, 0x34, 0xbb, 0xcc, 0x89, 0xcc, 0xba, 0x74, 0x63, 0x8e, 0xba, 0x69, 0x31, 0x1e, 0xbb, - 0x96, 0xc7, 0xc0, 0xba, 0x11, 0x07, 0x0f, 0xbb, 0x04, 0x78, 0x83, 0xba, 0x4a, 0xb5, 0x10, 0xbb, - 0xc3, 0x7e, 0x73, 0xbb, 0xc6, 0x80, 0x14, 0xbb, 0xe4, 0xdd, 0x74, 0xbb, 0x40, 0x91, 0xfa, 0xb7, - 0xab, 0x61, 0xd1, 0xba, 0x79, 0xda, 0x8c, 0xbb, 0x1d, 0x50, 0xb8, 0xba, 0xad, 0x15, 0x78, 0xbb, - 0x71, 0x5f, 0x32, 0xbb, 0x92, 0x3d, 0x75, 0xbb, 0x60, 0xf3, 0xa4, 0xba, 0xf0, 0xd4, 0xd2, 0xba, - 0x58, 0xdf, 0x53, 0xbb, 0x78, 0xc7, 0xf8, 0xba, 0x07, 0x25, 0x11, 0xbb, 0xe7, 0xca, 0x02, 0xbb, - 0x96, 0xc9, 0xf1, 0xba, 0xe6, 0x75, 0x43, 0xbb, 0xcc, 0x41, 0xc5, 0xbb, 0x26, 0x3b, 0x29, 0xbb, - 0xda, 0xdf, 0x89, 0xbb, 0x4c, 0x2a, 0x10, 0xbb, 0x62, 0x5e, 0x9f, 0xbb, 0x52, 0xc4, 0x0f, 0xbb, - 0x8a, 0xd6, 0x3f, 0xbb, 0x84, 0xdf, 0x4f, 0xbb, 0x86, 0x0c, 0x09, 0xbb, 0x79, 0xd2, 0x0f, 0xbb, - 0xf2, 0xfb, 0x5d, 0xbb, 0x44, 0x35, 0x13, 0xbb, 0xf1, 0xba, 0x30, 0xbb, 0x68, 0xee, 0x1a, 0xbb, - 0xc4, 0x5c, 0x5f, 0xbb, 0x6e, 0x6d, 0x82, 0xbb, 0x28, 0xf2, 0x28, 0xbb, 0xce, 0x09, 0xef, 0xba, - 0x59, 0xaa, 0x52, 0xbb, 0xa9, 0xf8, 0x52, 0xbb, 0x78, 0xe1, 0x28, 0xbb, 0x6a, 0x8e, 0x6e, 0xbb, - 0x82, 0xec, 0x2f, 0xbb, 0xe8, 0x37, 0x51, 0xbb, 0x08, 0xbd, 0x5e, 0xbb, 0x6e, 0x34, 0x5b, 0xbb, - 0x9a, 0x6e, 0x49, 0xbb, 0x46, 0x4e, 0x20, 0xbb, 0xbd, 0xd6, 0x62, 0xbb, 0x78, 0xa2, 0x27, 0xbb, - 0xd8, 0x94, 0x89, 0xbb, 
0xef, 0xf9, 0x47, 0xbb, 0xf3, 0xda, 0x33, 0xbb, 0xc2, 0x8e, 0x63, 0xbb, - 0xe1, 0x81, 0x61, 0xbb, 0x4c, 0xc8, 0x2d, 0xbb, 0xb1, 0x28, 0x39, 0xbb, 0x3a, 0x56, 0x51, 0xbb, - 0x89, 0x79, 0xab, 0xba, 0x37, 0x46, 0x97, 0xba, 0x60, 0x9d, 0x5f, 0xb9, 0xd4, 0x80, 0x25, 0xb9, - 0xf8, 0x42, 0xe1, 0xba, 0x24, 0x76, 0xa7, 0xba, 0x4d, 0xb6, 0x58, 0xba, 0x74, 0xf3, 0x7c, 0xb9, - 0x3e, 0x19, 0x15, 0xbb, 0xf0, 0x5f, 0x8f, 0xba, 0x81, 0x81, 0x5c, 0xba, 0x84, 0xae, 0xf1, 0x38, - 0x6c, 0x13, 0x1d, 0xba, 0x4d, 0xf1, 0x1f, 0xbb, 0xd6, 0x0c, 0x21, 0xba, 0x1d, 0xc0, 0xad, 0xba, - 0x5c, 0xac, 0xd2, 0xba, 0x0c, 0xad, 0x01, 0xbb, 0x98, 0xb9, 0x97, 0xba, 0xf6, 0x5a, 0x2e, 0xba, - 0x9a, 0xa8, 0x6b, 0xba, 0x50, 0x26, 0xba, 0xba, 0x2f, 0xc2, 0x34, 0xba, 0x8c, 0xb5, 0x7d, 0xba, - 0x61, 0x4c, 0xc2, 0xba, 0xb4, 0x52, 0xd9, 0xba, 0x73, 0x87, 0xc1, 0xba, 0xd0, 0xbc, 0xb4, 0xba, - 0xc6, 0x45, 0xf9, 0xba, 0x94, 0x16, 0x34, 0xba, 0x6a, 0x0b, 0xb1, 0xba, 0xba, 0x41, 0xab, 0xba, - 0x44, 0xaf, 0xa2, 0xbd, 0xfb, 0x70, 0x75, 0xbd, 0x3c, 0xc4, 0x6a, 0xbd, 0xe6, 0xd8, 0xbd, 0xbd, - 0x58, 0x22, 0x37, 0xbd, 0xc5, 0x27, 0x5f, 0xbd, 0xdd, 0xd6, 0x43, 0xbd, 0xce, 0xdb, 0xb4, 0xbd, - 0x96, 0x7b, 0xa8, 0xbd, 0xb8, 0xe6, 0xbe, 0xbd, 0x47, 0x00, 0xe3, 0xbd, 0x71, 0x8e, 0x17, 0xbd, - 0xe5, 0x33, 0xa0, 0xbd, 0x5c, 0x01, 0xb4, 0xbd, 0xb9, 0xbb, 0x7c, 0xbd, 0x50, 0xce, 0xee, 0xbd, - 0xd3, 0xfb, 0x85, 0xbd, 0x64, 0x0b, 0xb4, 0xbd, 0x34, 0x0d, 0x68, 0xbd, 0x06, 0x80, 0xa1, 0xbd, - 0x34, 0xfa, 0xd8, 0xbd, 0xbd, 0x69, 0x40, 0xbd, 0x0c, 0x17, 0xc2, 0xbd, 0x04, 0xed, 0x84, 0xbd, - 0x3e, 0x5f, 0x9a, 0xbd, 0x3e, 0x36, 0x9d, 0xbd, 0x2e, 0x2b, 0x19, 0xbe, 0x28, 0x34, 0xaa, 0xbd, - 0x21, 0xd0, 0xd8, 0xbd, 0x19, 0xab, 0xa3, 0xbd, 0x8f, 0x1b, 0x02, 0xbe, 0xee, 0x4c, 0x91, 0xbd, - 0x98, 0x13, 0x77, 0xbb, 0x6b, 0xbf, 0x87, 0xbc, 0x22, 0xee, 0x3d, 0xbc, 0x07, 0x8b, 0xa1, 0xbc, - 0xec, 0xe3, 0x8a, 0xbd, 0x85, 0x7b, 0x15, 0xbd, 0x98, 0x17, 0xbc, 0xbc, 0x98, 0xf1, 0x88, 0x3c, - 0x14, 0x13, 0xea, 0xbc, 0xf3, 0xa6, 0x48, 0xbd, 0xb4, 0xb8, 
0x9a, 0xbc, 0x9a, 0x6f, 0x9f, 0xbb, - 0x4f, 0xc8, 0xb1, 0xbc, 0x3f, 0x67, 0x0d, 0xbd, 0x11, 0x93, 0x41, 0xbd, 0x89, 0x60, 0x39, 0xbd, - 0x9b, 0xfc, 0x93, 0xbc, 0xca, 0x0d, 0x0f, 0xbd, 0x8a, 0x6f, 0x17, 0xbd, 0xb4, 0x33, 0x0c, 0xbd, - 0x55, 0x6b, 0x2b, 0xbd, 0xd4, 0xd5, 0x98, 0xbd, 0x16, 0x3c, 0xbf, 0xbc, 0x6e, 0x4c, 0x13, 0xbc, - 0x7f, 0x0b, 0x10, 0xbd, 0x0f, 0x3b, 0xb1, 0xbc, 0x37, 0xf8, 0x1f, 0xbb, 0xd0, 0x41, 0x88, 0xbd, - 0xc5, 0x56, 0x60, 0xbd, 0x82, 0x15, 0x2f, 0xbd, 0x60, 0x62, 0xbe, 0xbc, 0x76, 0x32, 0x6c, 0xbd, - 0x7c, 0x23, 0x6f, 0x3c, 0x3a, 0x6b, 0x85, 0x3c, 0xa4, 0x33, 0xa8, 0x3c, 0x82, 0x0c, 0xaa, 0x3c, - 0xfb, 0x49, 0xf4, 0x3c, 0x76, 0xa5, 0x67, 0x3c, 0x96, 0x09, 0x00, 0x3c, 0xf1, 0x6a, 0x82, 0x3c, - 0x71, 0x4d, 0x34, 0x3c, 0xcb, 0x89, 0x06, 0x3d, 0xf5, 0x20, 0xbd, 0x3c, 0x8e, 0xa9, 0x8c, 0x3b, - 0xd6, 0xc9, 0xaa, 0x3c, 0x63, 0xba, 0x42, 0x3c, 0x35, 0x99, 0xb2, 0x3c, 0x86, 0x62, 0xed, 0x3c, - 0x6f, 0xa9, 0x8e, 0x3c, 0x14, 0x81, 0x83, 0x3c, 0xca, 0xee, 0xca, 0x3c, 0x22, 0x35, 0x1e, 0x3d, - 0x52, 0xf0, 0xce, 0x3c, 0x69, 0x4e, 0xc9, 0x3c, 0xd5, 0x61, 0x05, 0x3d, 0x6c, 0xa2, 0xd4, 0x3a, - 0x2c, 0xf4, 0x08, 0x3d, 0x06, 0x9f, 0xe1, 0x3c, 0xaf, 0xc3, 0x89, 0x3c, 0xd6, 0xda, 0xc3, 0x3c, - 0x4e, 0x80, 0x0d, 0x3d, 0x0e, 0x1b, 0x05, 0x3d, 0x92, 0xdb, 0x0b, 0x3d, 0x17, 0xa2, 0x1a, 0x3d, - 0xca, 0x9e, 0x99, 0x3b, 0x6d, 0xb9, 0xc2, 0xbb, 0x90, 0x0b, 0x18, 0x3b, 0x5c, 0x90, 0x30, 0x3c, - 0xf2, 0x54, 0x4c, 0x3c, 0x69, 0x9d, 0xc5, 0x3b, 0xfd, 0xc2, 0xef, 0xbb, 0x10, 0xd9, 0xa0, 0xbb, - 0x3b, 0x79, 0xc3, 0x3c, 0x90, 0x61, 0x9e, 0x3b, 0x0f, 0xdb, 0x3c, 0x3c, 0xff, 0x71, 0x06, 0xbc, - 0x20, 0xa2, 0x2e, 0xb9, 0xf6, 0xef, 0xa3, 0x3c, 0xc8, 0x9c, 0x6a, 0x3c, 0x24, 0x66, 0xb5, 0x3b, - 0x45, 0x03, 0x2d, 0x3c, 0x5e, 0x48, 0x05, 0x3c, 0x5a, 0xb9, 0x2c, 0x3c, 0xda, 0xbe, 0x91, 0x3c, - 0x68, 0xc6, 0x81, 0x3c, 0x9f, 0x27, 0x37, 0x3c, 0x12, 0xf1, 0x86, 0x3b, 0x06, 0xef, 0xb7, 0x3a, - 0xfc, 0xba, 0x63, 0xbb, 0x40, 0x37, 0x0c, 0xb9, 0xb9, 0x28, 0x35, 0x3c, 0x31, 0x7a, 0x46, 0x3c, - 
0xe6, 0xc5, 0x88, 0x3c, 0x75, 0xfa, 0xc0, 0x3c, 0xb7, 0x07, 0x7b, 0x3c, 0x09, 0x48, 0x07, 0x3c, - 0xfc, 0x9b, 0x16, 0xbb, 0x92, 0xdc, 0xd0, 0xba, 0xcc, 0x1e, 0x38, 0xbb, 0x13, 0x4b, 0x44, 0xbb, - 0x0d, 0xdd, 0x39, 0xbb, 0xf3, 0x0c, 0x9e, 0xba, 0x90, 0x0e, 0x2c, 0xb8, 0x50, 0xef, 0x30, 0xbb, - 0xf2, 0xe6, 0xf7, 0xba, 0x28, 0xb7, 0x67, 0xbb, 0x71, 0x77, 0x5f, 0xbb, 0x50, 0xc7, 0x95, 0xb8, - 0xe2, 0xfd, 0x1c, 0xbb, 0x4d, 0xfe, 0xd5, 0xba, 0x3c, 0x00, 0x1e, 0xbb, 0x4a, 0x22, 0x4c, 0xbb, - 0x6a, 0x55, 0x26, 0xbb, 0x17, 0x55, 0xd8, 0xba, 0x66, 0x4c, 0x45, 0xbb, 0x68, 0x9c, 0xb5, 0xbb, - 0x1a, 0xd0, 0x50, 0xbb, 0x34, 0x59, 0xe6, 0xba, 0xc6, 0x4e, 0x8d, 0xbb, 0x70, 0xf7, 0x67, 0xb7, - 0x88, 0x7a, 0x70, 0xbb, 0xb5, 0x0b, 0x5f, 0xbb, 0x7e, 0x37, 0x44, 0xbb, 0x5f, 0x61, 0x01, 0xbb, - 0x12, 0x55, 0x89, 0xbb, 0x28, 0xa2, 0x97, 0xbb, 0x35, 0x8c, 0xa7, 0xbb, 0x1f, 0x1a, 0x88, 0xbb, - 0x19, 0x5b, 0xab, 0xba, 0x22, 0xff, 0x24, 0x3a, 0xe2, 0x86, 0x81, 0xba, 0x36, 0xe1, 0x02, 0xbb, - 0x7e, 0x54, 0x90, 0xba, 0x2b, 0x1e, 0xcf, 0xb9, 0x36, 0xf5, 0xc0, 0x3a, 0x66, 0xb9, 0x21, 0xba, - 0x7b, 0x35, 0x50, 0xbb, 0x96, 0x85, 0x40, 0xba, 0xb3, 0xb4, 0x13, 0xbb, 0xc3, 0x75, 0x9a, 0x3a, - 0xfc, 0x27, 0x9a, 0xb9, 0x5e, 0x41, 0x20, 0xbb, 0x2a, 0xef, 0xd9, 0xba, 0x01, 0x06, 0x4a, 0xba, - 0x8d, 0xd1, 0xf2, 0xba, 0x87, 0x1a, 0x61, 0xba, 0x6a, 0x15, 0xd0, 0xba, 0xaf, 0xaf, 0x62, 0xbb, - 0xf9, 0x14, 0x13, 0xbb, 0xef, 0x20, 0xdb, 0xb9, 0x75, 0x62, 0xc0, 0xba, 0x40, 0x67, 0x07, 0x37, - 0xc0, 0xd1, 0xb5, 0x37, 0x94, 0xb0, 0x26, 0xba, 0x09, 0x78, 0x1e, 0xbb, 0x86, 0x59, 0x50, 0xba, - 0x61, 0xae, 0x1d, 0xbb, 0x31, 0xae, 0x74, 0xbb, 0x30, 0xbc, 0x53, 0xbb, 0xa0, 0xce, 0x9d, 0xba, - 0xc9, 0x97, 0x10, 0xba, 0x4e, 0xf9, 0x7b, 0xba, 0x4b, 0xe3, 0x74, 0xba, 0xf4, 0xe6, 0x7e, 0xba, - 0x1b, 0x25, 0x09, 0xbb, 0x50, 0x56, 0x8b, 0xba, 0x4d, 0x1d, 0x49, 0xba, 0x4d, 0x19, 0xc7, 0xb9, - 0xf3, 0xd4, 0x1a, 0xba, 0x4a, 0x45, 0x02, 0xbb, 0xca, 0xd9, 0x87, 0xba, 0x18, 0xb1, 0xb4, 0xb9, - 0x92, 0x27, 0x96, 0xba, 0x94, 0x17, 
0x4a, 0xba, 0x05, 0xf0, 0xba, 0xba, 0x8f, 0x3a, 0xe8, 0xba, - 0x98, 0x84, 0x57, 0xba, 0xa2, 0xde, 0x8d, 0xba, 0xc3, 0x40, 0xb9, 0xba, 0x6e, 0x79, 0xeb, 0xba, - 0xc0, 0xa1, 0xbd, 0xba, 0x0d, 0xbf, 0x04, 0xbb, 0xb3, 0x4e, 0xcc, 0xba, 0xf4, 0x83, 0x4a, 0xb9, - 0x9d, 0xdc, 0xf6, 0xba, 0x73, 0xda, 0xb6, 0xba, 0x06, 0xc0, 0x0b, 0xba, 0x8d, 0x01, 0xf3, 0xba, - 0x10, 0x0c, 0x03, 0xbb, 0x61, 0x82, 0xd6, 0xba, 0xd1, 0x7e, 0xc1, 0xba, 0x72, 0x00, 0x15, 0xbb, - 0xa0, 0xae, 0x6b, 0xb8, 0x51, 0x8b, 0x1d, 0x39, 0x98, 0x92, 0xc5, 0xb7, 0x75, 0x26, 0xf8, 0xb9, - 0x68, 0x97, 0xa0, 0xba, 0x38, 0x67, 0x2c, 0xba, 0xa8, 0xe8, 0x31, 0xb7, 0x48, 0x75, 0x2d, 0x3a, - 0x7c, 0xac, 0xa1, 0xba, 0xa6, 0xe9, 0x19, 0xba, 0x31, 0x5c, 0xf0, 0xb9, 0x58, 0xf3, 0x92, 0x39, - 0x02, 0xeb, 0xc6, 0xb8, 0x01, 0x4e, 0x9a, 0xba, 0x5d, 0xe7, 0x89, 0xba, 0x8b, 0x33, 0x1d, 0xba, - 0xa0, 0x56, 0xfb, 0xb9, 0x6f, 0xf5, 0x33, 0xba, 0x24, 0xfe, 0x37, 0xba, 0x9a, 0xe0, 0x45, 0xba, - 0x40, 0xcd, 0x7f, 0xba, 0x9f, 0xb5, 0xb1, 0xba, 0x14, 0x13, 0x0f, 0xb9, 0xfe, 0x08, 0x3f, 0xb9, - 0x1c, 0xce, 0x1e, 0xb8, 0x90, 0x71, 0x3d, 0xb7, 0x38, 0x82, 0x80, 0xb9, 0x8f, 0xb6, 0xa5, 0xba, - 0x5e, 0x1c, 0x91, 0xba, 0x42, 0xec, 0x9b, 0xba, 0x2c, 0x45, 0x0c, 0xba, 0xea, 0x67, 0x51, 0xba, - 0x65, 0x47, 0x22, 0xbd, 0x27, 0xa6, 0xdb, 0xbb, 0x30, 0x20, 0x23, 0xbd, 0xc2, 0xd9, 0x51, 0xbd, - 0x56, 0xf6, 0xdf, 0xbc, 0x4b, 0x97, 0x11, 0xbc, 0x7e, 0xea, 0xb2, 0x3c, 0x0e, 0xf4, 0x29, 0xbd, - 0x68, 0x4a, 0x4c, 0xbd, 0x86, 0x9a, 0x12, 0xbd, 0xf2, 0xa3, 0x71, 0xbd, 0x89, 0xf9, 0x5f, 0x3c, - 0xf8, 0x6b, 0xcb, 0xbc, 0x8d, 0x8f, 0x18, 0xbd, 0x7b, 0x51, 0x0b, 0xbd, 0x08, 0xb3, 0x04, 0xbd, - 0x26, 0xd2, 0x37, 0xbd, 0x42, 0xb8, 0x9f, 0xbc, 0x5c, 0x05, 0x2e, 0xbd, 0x46, 0xdd, 0xbd, 0xbd, - 0x3c, 0x62, 0x4f, 0xbd, 0xb7, 0x96, 0xbe, 0xbb, 0xe0, 0x71, 0x72, 0xbd, 0x18, 0x0e, 0x0f, 0x3b, - 0x99, 0x13, 0x04, 0xbd, 0xe3, 0xf9, 0x23, 0xbd, 0x89, 0xe3, 0x76, 0xbd, 0x2c, 0xdd, 0x6d, 0xbc, - 0x8c, 0xe8, 0x77, 0xbd, 0x91, 0xc0, 0xaa, 0xbd, 0x98, 0x58, 0xb4, 0xbd, 
0xc2, 0xd2, 0x3b, 0xbd, - 0x5a, 0x21, 0xe0, 0xbf, 0xf4, 0x00, 0x45, 0xc0, 0x4f, 0xe2, 0x2f, 0xc0, 0x39, 0x98, 0x5f, 0xc0, - 0xe1, 0xeb, 0x0c, 0xc0, 0x75, 0xb9, 0x21, 0xc0, 0xf2, 0x7a, 0x02, 0xc0, 0xfe, 0xc7, 0x30, 0xc0, - 0xda, 0x1a, 0x02, 0xc0, 0x09, 0x4a, 0x9e, 0xbf, 0xde, 0xcc, 0x36, 0xc0, 0x74, 0x06, 0xd1, 0xbf, - 0xb7, 0x7d, 0xf0, 0xbe, 0xa4, 0x6b, 0xa7, 0xbf, 0x59, 0x71, 0x5d, 0xc0, 0x22, 0xf9, 0xc9, 0xbf, - 0xa8, 0x34, 0x52, 0xbf, 0xbf, 0xd4, 0x17, 0xc0, 0x9c, 0x95, 0x55, 0xc0, 0x56, 0x74, 0x08, 0xc0, - 0x94, 0x7c, 0x0e, 0xc0, 0x57, 0x62, 0xcd, 0xbf, 0xb8, 0xdc, 0x49, 0xc0, 0xb1, 0xba, 0x85, 0xbf, - 0x8f, 0x84, 0x28, 0xbf, 0x26, 0x92, 0x9b, 0xc0, 0x0c, 0x12, 0xeb, 0xbe, 0xf0, 0x02, 0x0b, 0xc0, - 0xc4, 0x1b, 0xeb, 0xbf, 0xdb, 0xe7, 0x28, 0xbf, 0xff, 0x18, 0x51, 0xbf, 0x63, 0xae, 0x29, 0xc0, - 0x68, 0xe4, 0x72, 0x3f, 0xd6, 0x97, 0xae, 0x3f, 0xc9, 0x39, 0xd2, 0x3f, 0x6a, 0x40, 0xd3, 0x3f, - 0x47, 0x7b, 0xb8, 0x3f, 0x32, 0xf9, 0xe6, 0x3f, 0xde, 0xbb, 0xa3, 0x3f, 0x0a, 0x4f, 0xd7, 0x3f, - 0x0e, 0xb7, 0xa9, 0x3f, 0xdc, 0x20, 0xc0, 0x3f, 0x0a, 0x8b, 0xd9, 0x3f, 0xa4, 0x5a, 0xb9, 0x3f, - 0xfc, 0xcf, 0x90, 0x3f, 0x1a, 0xee, 0xb9, 0x3f, 0x21, 0xd3, 0x96, 0x3f, 0xca, 0x2b, 0x8d, 0x3f, - 0x0c, 0x63, 0xa3, 0x3f, 0x55, 0xd2, 0x8d, 0x3f, 0x44, 0x04, 0xd1, 0x3f, 0xd2, 0x49, 0x67, 0x3f, - 0x3d, 0x83, 0xe1, 0x3f, 0x0a, 0x7a, 0x7a, 0x3f, 0x60, 0xe9, 0xb3, 0x3f, 0xc1, 0x62, 0x4c, 0x3f, - 0xf7, 0x29, 0x14, 0x3f, 0x98, 0x48, 0xbc, 0x3f, 0x76, 0x36, 0x16, 0x3f, 0x8e, 0x23, 0xf8, 0x3f, - 0x1e, 0x83, 0x06, 0x40, 0x82, 0x54, 0xfd, 0x3e, 0x8e, 0x2c, 0x41, 0x3f, 0xda, 0xaa, 0xa6, 0x3f, - 0x84, 0x46, 0x67, 0x3f, 0xa8, 0x44, 0x19, 0x3f, 0x7f, 0x4f, 0xd1, 0x3e, 0xd5, 0xe8, 0xbd, 0x3e, - 0xff, 0x25, 0xac, 0x3e, 0xba, 0x9e, 0x05, 0x3f, 0xba, 0xa3, 0xc9, 0x3e, 0xa3, 0x9b, 0xf9, 0x3e, - 0x86, 0xc7, 0x31, 0x3f, 0xba, 0xe0, 0xe4, 0x3e, 0x34, 0x38, 0xdf, 0x3d, 0x34, 0x5b, 0xda, 0xbd, - 0xd6, 0x88, 0x01, 0xbe, 0x62, 0xef, 0x33, 0x3e, 0x1a, 0x4b, 0x18, 0x3f, 0x8a, 0x83, 0x7f, 0x3e, - 0x60, 0xb8, 
0xae, 0x3c, 0x8e, 0x52, 0xc9, 0x3e, 0x84, 0xb2, 0xb6, 0x3e, 0x80, 0x10, 0xdf, 0x3e, - 0xd6, 0xec, 0xc8, 0x3e, 0x10, 0x9d, 0x8f, 0x3e, 0x3a, 0xf6, 0x2a, 0x3f, 0x8b, 0x02, 0xb2, 0x3e, - 0xbd, 0x45, 0xa1, 0xbe, 0x94, 0x88, 0xd0, 0x3f, 0x74, 0xd7, 0x7f, 0x3f, 0xca, 0x30, 0x37, 0x3f, - 0x1b, 0x43, 0xe6, 0x3e, 0xd6, 0x23, 0x3b, 0x3f, 0x80, 0xc4, 0xf4, 0x3c, 0x8c, 0x06, 0x5b, 0x3f, - 0xeb, 0x38, 0x04, 0xbe, 0xc2, 0x18, 0x07, 0xbe, 0xaa, 0x65, 0x33, 0xbe, 0x10, 0x3e, 0x19, 0xbe, - 0x44, 0x6f, 0x23, 0xbe, 0x9a, 0xa3, 0x59, 0xbe, 0x1c, 0x0e, 0x13, 0xbe, 0x47, 0x0c, 0x3d, 0xbe, - 0x1d, 0xd3, 0x29, 0xbe, 0x3a, 0x3b, 0x53, 0xbe, 0x86, 0x23, 0x29, 0xbe, 0xd6, 0x39, 0x21, 0xbe, - 0xf5, 0x43, 0x17, 0xbe, 0xa6, 0x77, 0x3b, 0xbe, 0xec, 0xb0, 0xba, 0xbd, 0x4a, 0x52, 0x00, 0xbe, - 0xfe, 0x20, 0x29, 0xbe, 0xeb, 0xae, 0xda, 0xbd, 0x94, 0x2d, 0x1b, 0xbe, 0x86, 0x9b, 0xb0, 0xbd, - 0xfe, 0xb6, 0x56, 0xbe, 0xae, 0xc1, 0xdb, 0xbd, 0x2f, 0x6d, 0x0e, 0xbe, 0xe9, 0x8a, 0xd1, 0xbd, - 0x52, 0x35, 0x49, 0xbd, 0x52, 0x69, 0x0f, 0xbe, 0xcb, 0x3f, 0xfb, 0xbd, 0xf6, 0x21, 0x82, 0xbe, - 0x76, 0x94, 0x8d, 0xbe, 0x03, 0xd0, 0xb5, 0xbd, 0xec, 0x1b, 0xb3, 0xbd, 0x31, 0x4f, 0x19, 0xbe, - 0xba, 0x26, 0xff, 0xbd, 0x03, 0xeb, 0x62, 0xbd, 0x88, 0x50, 0x54, 0xbd, 0xc2, 0xc8, 0xb1, 0xbc, - 0xc4, 0x1d, 0x49, 0xbd, 0xdf, 0x9d, 0xac, 0xbd, 0x70, 0x95, 0x61, 0xbd, 0xf4, 0x71, 0x85, 0xbd, - 0x29, 0x54, 0xd2, 0xbd, 0xc6, 0x9b, 0xce, 0xbd, 0xd0, 0x7c, 0xc8, 0xbb, 0x20, 0x97, 0x01, 0xbb, - 0x48, 0xb2, 0xb3, 0xbc, 0x0a, 0xde, 0x62, 0xbd, 0x95, 0x69, 0x06, 0xbd, 0x73, 0xbe, 0x23, 0xbd, - 0xd4, 0x69, 0x22, 0xbd, 0x6a, 0x98, 0x10, 0xbd, 0x90, 0x08, 0xc4, 0xbc, 0xf0, 0x9a, 0x21, 0xbd, - 0xbd, 0xfa, 0x94, 0xbd, 0x26, 0xa4, 0x19, 0xbd, 0x8a, 0xc3, 0x85, 0xbd, 0xd9, 0x79, 0x6a, 0xbd, - 0x38, 0xdf, 0x24, 0x3d, 0x82, 0x9c, 0x1f, 0xbe, 0x4b, 0xe0, 0x27, 0xbe, 0xcc, 0x07, 0x07, 0xbe, - 0x4f, 0xfc, 0xe3, 0xbd, 0x47, 0x31, 0xe6, 0xbd, 0x20, 0x83, 0x75, 0xbc, 0xdc, 0x2b, 0xd7, 0xbd, - 0x31, 0x04, 0x5b, 0xbd, 0x69, 0x7f, 0xc2, 0xbd, 
0x88, 0x79, 0xd1, 0xbd, 0x88, 0x81, 0xec, 0xbd, - 0x56, 0x3d, 0xb1, 0xbd, 0xa0, 0x79, 0xd3, 0xbd, 0x71, 0xbf, 0x9d, 0xbd, 0xf8, 0xfc, 0xd2, 0xbd, - 0xbc, 0x70, 0x99, 0xbd, 0x28, 0x0b, 0x92, 0xbd, 0xa7, 0x3a, 0xe1, 0xbd, 0x95, 0xae, 0xa9, 0xbd, - 0x2f, 0x51, 0x54, 0xbd, 0x83, 0xb4, 0x97, 0xbd, 0x4a, 0x5e, 0xc1, 0xbd, 0x9f, 0x2c, 0x84, 0xbd, - 0xf2, 0x06, 0x7b, 0xbd, 0xdf, 0x00, 0x9c, 0xbd, 0xd3, 0x2f, 0xe6, 0xbd, 0x4d, 0x02, 0x83, 0xbd, - 0x13, 0x41, 0xc9, 0xbd, 0x7f, 0x76, 0x75, 0xbd, 0xb9, 0x82, 0xc6, 0xbd, 0x1a, 0x27, 0x30, 0xbd, - 0xb4, 0xf6, 0x15, 0xbd, 0xaa, 0x34, 0xed, 0xbd, 0xa6, 0x9a, 0x8c, 0xbc, 0x27, 0xb4, 0xcc, 0xbd, - 0xdc, 0x98, 0xd4, 0xbd, 0xa0, 0x39, 0xa7, 0xbc, 0x4e, 0x22, 0x2a, 0xbd, 0x32, 0x98, 0xa8, 0xbd, - 0x15, 0xb9, 0x51, 0xbd, 0xcf, 0x42, 0x68, 0xbd, 0xff, 0x4f, 0x26, 0xbd, 0x1f, 0xf9, 0x52, 0xbd, - 0x07, 0x2b, 0x00, 0xbd, 0xe9, 0x49, 0x20, 0xbd, 0x62, 0x2d, 0x06, 0xbd, 0x55, 0x53, 0x31, 0xbd, - 0x67, 0x8f, 0x31, 0xbd, 0x41, 0x77, 0x98, 0xbc, 0x1d, 0x6c, 0xf9, 0xbc, 0xae, 0xb1, 0xa7, 0xbb, - 0xa3, 0x28, 0x35, 0x3c, 0xf9, 0xa2, 0x27, 0xbc, 0x39, 0xa0, 0x85, 0xbd, 0xc6, 0x27, 0xb3, 0xbc, - 0xe0, 0xc3, 0xc0, 0x3a, 0x08, 0x9f, 0x25, 0xbd, 0xa2, 0x06, 0x47, 0xbd, 0x8c, 0x36, 0x26, 0xbd, - 0xcf, 0x1d, 0xf4, 0xbc, 0xca, 0x0d, 0xcd, 0xbc, 0x9e, 0xee, 0x75, 0xbd, 0x3a, 0xb2, 0xa7, 0xbc, - 0x45, 0x4e, 0x04, 0x3c, 0xda, 0x67, 0xfd, 0xbd, 0x1e, 0xce, 0x1a, 0xbd, 0x0e, 0xf0, 0x1e, 0xbd, - 0x5c, 0xb8, 0xad, 0xbc, 0x14, 0xfe, 0x03, 0xbd, 0x30, 0xb6, 0xad, 0xbb, 0xde, 0xbd, 0x75, 0xbd, - 0x77, 0xa7, 0x1a, 0xc0, 0xf6, 0x40, 0xab, 0xbf, 0x1e, 0xfa, 0xee, 0xbf, 0x74, 0xf3, 0x86, 0xbf, - 0xfd, 0x12, 0xe6, 0xbf, 0x2b, 0x48, 0x2d, 0xc0, 0xe6, 0xdc, 0xdc, 0xbf, 0xb2, 0xa4, 0x07, 0xc0, - 0x50, 0xd3, 0x20, 0xc0, 0x27, 0x5f, 0x49, 0xc0, 0x02, 0x43, 0x9b, 0xbf, 0xab, 0x50, 0xb1, 0xbf, - 0x0c, 0x46, 0xe5, 0xbf, 0xb4, 0xc3, 0x16, 0xc0, 0xaf, 0x99, 0x07, 0xbf, 0x64, 0x86, 0xbb, 0xbf, - 0x82, 0x54, 0x06, 0xc0, 0x32, 0xae, 0x80, 0xbf, 0xcb, 0xe2, 0x91, 0xbf, 0x50, 0x87, 
0x61, 0xbf, - 0xac, 0xb9, 0x27, 0xc0, 0x4d, 0x2f, 0x9d, 0xbf, 0xaa, 0x68, 0xc1, 0xbf, 0xd6, 0x55, 0xc4, 0xbf, - 0xc9, 0xa4, 0x84, 0x3d, 0x05, 0x47, 0x07, 0xc0, 0x92, 0x05, 0x47, 0xc0, 0x81, 0x30, 0x73, 0xc0, - 0x01, 0xdc, 0x79, 0xc0, 0xf6, 0x49, 0x06, 0xc0, 0x11, 0xc1, 0x72, 0xbf, 0x81, 0x05, 0x0d, 0xc0, - 0x43, 0x26, 0x52, 0xc0, 0x24, 0x39, 0x84, 0xbf, 0xe1, 0xe6, 0x04, 0xc0, 0xd1, 0x83, 0xce, 0xbf, - 0x4a, 0x9d, 0xdc, 0xbf, 0x79, 0x7b, 0x22, 0xc0, 0xa3, 0xf6, 0x2b, 0xc0, 0x54, 0x01, 0x88, 0xbf, - 0xc2, 0xd2, 0x72, 0xbf, 0xc5, 0xb6, 0x24, 0xc0, 0x05, 0x6d, 0xf4, 0xbf, 0x11, 0x24, 0xd7, 0xbf, - 0xcb, 0x9e, 0x23, 0xbf, 0x94, 0x87, 0xfa, 0xbf, 0xe3, 0xd4, 0xda, 0xbf, 0x59, 0x8c, 0x1a, 0xc0, - 0xfb, 0x68, 0x1a, 0xc0, 0x40, 0xe7, 0x8a, 0xbf, 0x9e, 0x64, 0xc0, 0xbf, 0x97, 0x9b, 0x04, 0xc0, - 0xae, 0x3d, 0x04, 0xc0, 0x2e, 0xd6, 0x0e, 0xc0, 0xa3, 0x58, 0x00, 0xc0, 0x8e, 0xa2, 0x7b, 0xbf, - 0x97, 0x21, 0xb2, 0xbe, 0xf8, 0xc6, 0x08, 0xc0, 0x65, 0x84, 0x9a, 0xbf, 0x78, 0xb5, 0x28, 0xbf, - 0x6a, 0x3e, 0x04, 0xc0, 0xee, 0xa6, 0x4e, 0xc0, 0xbf, 0x65, 0xfd, 0xbf, 0x3b, 0x3b, 0x2b, 0xc0, - 0x82, 0x5a, 0xcf, 0x3f, 0x8b, 0x8b, 0x30, 0x3f, 0x9c, 0x02, 0xc0, 0x3f, 0xa0, 0xa1, 0x91, 0x3f, - 0x0e, 0x62, 0xaa, 0x3f, 0xe9, 0xc0, 0x9e, 0x3f, 0x23, 0xe3, 0x8c, 0x3f, 0x60, 0xbc, 0xc8, 0x3f, - 0x9e, 0xd7, 0xa6, 0x3f, 0xd6, 0x9a, 0xf7, 0x3f, 0xc1, 0xfb, 0xd4, 0x3f, 0x36, 0xe4, 0x96, 0x3f, - 0xb2, 0x44, 0xc4, 0x3e, 0xb4, 0x7a, 0xdd, 0x3f, 0x7f, 0x8a, 0xa5, 0x3f, 0x06, 0x5c, 0xa6, 0x3f, - 0xe6, 0x06, 0xa8, 0x3f, 0xd8, 0x5d, 0x8d, 0x3f, 0x4e, 0x44, 0x0c, 0x3f, 0xe1, 0x6f, 0x9b, 0x3f, - 0x34, 0x87, 0x99, 0x3f, 0x71, 0x67, 0xbd, 0x3f, 0x38, 0xdd, 0xa0, 0x3f, 0xfe, 0xd8, 0x5b, 0x3f, - 0xad, 0x53, 0x58, 0x3f, 0xa4, 0x3b, 0x78, 0x3f, 0xa0, 0x8f, 0x92, 0x3f, 0xe2, 0x5f, 0x9d, 0x3f, - 0x59, 0x10, 0xac, 0x3f, 0x83, 0x18, 0xa7, 0x3f, 0x10, 0x2f, 0x02, 0x40, 0x50, 0xde, 0xf3, 0x3f, - 0x8a, 0x5a, 0xd9, 0x3e, 0x31, 0xae, 0x56, 0x3e, 0x2c, 0xff, 0xdd, 0x3d, 0x74, 0x6c, 0x06, 0xbd, - 0x59, 0x9e, 0xd3, 0x3e, 
0x1f, 0xb9, 0x0d, 0x3f, 0x0a, 0x25, 0x4a, 0x3f, 0xca, 0x15, 0xb1, 0x3e, - 0xce, 0x18, 0xa8, 0x3e, 0x30, 0xfa, 0xd8, 0x3c, 0x22, 0x4e, 0xcd, 0x3e, 0x2c, 0x14, 0xa0, 0x3e, - 0x45, 0x26, 0x81, 0x3e, 0x7d, 0x81, 0xb4, 0x3e, 0xf2, 0x1a, 0x7c, 0x3f, 0xe3, 0xb1, 0x8d, 0x3e, - 0x6c, 0xed, 0xae, 0x3e, 0x4a, 0x3a, 0xca, 0x3e, 0x45, 0xca, 0xc8, 0xbe, 0x65, 0x13, 0x20, 0x3f, - 0x74, 0xc8, 0x2a, 0x3e, 0x54, 0x0b, 0x47, 0x3e, 0xb0, 0xc8, 0x97, 0x3c, 0x28, 0xba, 0x2f, 0xbd, - 0xe2, 0x95, 0xf6, 0x3e, 0xc6, 0x55, 0x0d, 0x3f, 0xa2, 0x07, 0x6a, 0x3e, 0x00, 0xba, 0xf1, 0xbd, - 0x4a, 0x4b, 0x39, 0x3e, 0x12, 0xce, 0xdc, 0x3e, 0x26, 0x37, 0x0c, 0x3f, 0xc6, 0x5f, 0xf0, 0x3e, - 0xdd, 0x7a, 0x1e, 0xbe, 0x99, 0xcd, 0xa2, 0xbd, 0xc6, 0x14, 0x25, 0xbe, 0x6a, 0x03, 0xec, 0xbd, - 0x7d, 0xca, 0x26, 0xbe, 0x10, 0x31, 0x04, 0xbe, 0x42, 0x8f, 0xeb, 0xbd, 0xd0, 0x52, 0x5e, 0xbe, - 0xda, 0xa4, 0x38, 0xbe, 0x6d, 0xa2, 0x52, 0xbe, 0x2e, 0xee, 0x52, 0xbe, 0xeb, 0xb4, 0x0b, 0xbe, - 0xeb, 0xea, 0x47, 0xbd, 0x93, 0x04, 0x59, 0xbe, 0x14, 0xb7, 0x3e, 0xbe, 0x08, 0x60, 0x03, 0xbe, - 0xe6, 0xc5, 0x08, 0xbe, 0x60, 0xd2, 0x18, 0xbe, 0x90, 0x6b, 0x4c, 0xbc, 0xc1, 0xd0, 0x13, 0xbe, - 0x22, 0x7d, 0xf4, 0xbd, 0xbc, 0x0e, 0x21, 0xbe, 0x8e, 0x11, 0xfb, 0xbd, 0x41, 0x52, 0xc0, 0xbd, - 0x4a, 0x94, 0x0e, 0xbe, 0x94, 0x84, 0xd0, 0xbd, 0xc2, 0x5e, 0x12, 0xbe, 0xd0, 0x4f, 0x20, 0xbe, - 0x17, 0x36, 0x11, 0xbe, 0x0c, 0xc3, 0xe0, 0xbd, 0xb0, 0x74, 0x88, 0xbe, 0x29, 0x5b, 0x61, 0xbe, - 0x72, 0x45, 0x10, 0xbd, 0x9a, 0x01, 0x01, 0xbd, 0x2c, 0x95, 0xb4, 0xbc, 0x80, 0xf3, 0xed, 0x3a, - 0xd5, 0x9d, 0x91, 0xbd, 0x24, 0x83, 0x77, 0xbd, 0x12, 0xdb, 0xab, 0xbd, 0xbe, 0x4d, 0xc2, 0xbd, - 0x3d, 0x94, 0xa9, 0xbd, 0x80, 0x34, 0x74, 0xbc, 0x9c, 0x35, 0xa3, 0xbd, 0x89, 0x09, 0x4c, 0xbd, - 0xfa, 0x38, 0x12, 0xbd, 0x04, 0xdf, 0x97, 0xbd, 0xdc, 0x1f, 0x1f, 0xbe, 0x1e, 0x03, 0xd5, 0xbc, - 0x0e, 0x59, 0x15, 0xbd, 0x1e, 0x59, 0xa0, 0xbd, 0x1b, 0xfa, 0xa6, 0x3d, 0x04, 0xfe, 0xae, 0xbd, - 0x6d, 0x00, 0x84, 0xbc, 0x84, 0x82, 0xeb, 0xbc, 0xc0, 0x3e, 
0xa1, 0x3a, 0xe0, 0x04, 0x75, 0xbb, - 0x30, 0xbb, 0xd2, 0xbd, 0xc6, 0xff, 0x75, 0xbd, 0xe4, 0x18, 0x52, 0xbd, 0x42, 0x9b, 0xb2, 0xbc, - 0x8a, 0xb7, 0xd0, 0xbc, 0x3c, 0xec, 0xb6, 0xbc, 0x45, 0x68, 0xf1, 0xbd, 0x22, 0xac, 0x9c, 0xbd, - 0xc9, 0x9f, 0xe1, 0xbd, 0x4b, 0xb6, 0x25, 0xbd, 0xe9, 0x4f, 0xb7, 0xbd, 0x23, 0x4a, 0x8f, 0xbd, - 0xba, 0x7a, 0x96, 0xbd, 0xb0, 0xbc, 0xa7, 0xbd, 0x1e, 0x08, 0x9c, 0xbd, 0xf6, 0xda, 0x93, 0xbd, - 0x65, 0x27, 0x78, 0xbd, 0x78, 0xbb, 0xeb, 0xbd, 0x7c, 0xb4, 0xb6, 0xbd, 0xde, 0xe6, 0x8b, 0xbd, - 0xfc, 0x31, 0xb3, 0xbc, 0x92, 0xab, 0xbe, 0xbd, 0x1b, 0x78, 0x84, 0xbd, 0x1b, 0x28, 0xaf, 0xbd, - 0xfe, 0x6b, 0xae, 0xbd, 0xb8, 0x7f, 0x5f, 0xbd, 0x46, 0xd9, 0x4e, 0xbd, 0x92, 0xfa, 0x93, 0xbd, - 0x34, 0x60, 0x9e, 0xbd, 0x8c, 0xd8, 0xb8, 0xbd, 0xd9, 0x7e, 0xa4, 0xbd, 0x34, 0x35, 0x49, 0xbd, - 0x6e, 0xe3, 0x00, 0xbd, 0xc4, 0x48, 0x85, 0xbd, 0xb7, 0x94, 0x76, 0xbd, 0xb4, 0xe1, 0x6f, 0xbd, - 0xd0, 0x22, 0xa9, 0xbd, 0x80, 0x6e, 0xc7, 0xbd, 0x8e, 0xc5, 0xd1, 0xbd, 0x30, 0xa0, 0xe1, 0xbd, - 0x93, 0x6c, 0x4e, 0xbd, 0xfe, 0xd2, 0x86, 0xbc, 0x21, 0x37, 0xa3, 0xbc, 0x35, 0x64, 0x3b, 0xbc, - 0x9c, 0xfc, 0xe1, 0xbc, 0xd0, 0xd8, 0x42, 0xbd, 0x0c, 0x5c, 0x78, 0xbd, 0x1e, 0x78, 0x35, 0xbc, - 0xb7, 0xc7, 0x41, 0xbc, 0x4a, 0x71, 0xa1, 0xbc, 0x70, 0x84, 0xd5, 0xbc, 0x57, 0xd1, 0xcc, 0xbc, - 0x18, 0x00, 0x7b, 0xbc, 0x04, 0x5a, 0xc6, 0xbc, 0xc6, 0xc1, 0x49, 0xbd, 0xee, 0xd0, 0x0c, 0xbd, - 0xb6, 0xf8, 0x15, 0xbd, 0x2b, 0x68, 0x9c, 0xbc, 0x90, 0x1b, 0x65, 0xbb, 0xa6, 0x51, 0x2f, 0xbd, - 0xc4, 0x86, 0xd2, 0xbc, 0x7f, 0xe8, 0xd4, 0xbc, 0x50, 0xce, 0x96, 0xbc, 0xd0, 0x58, 0x84, 0xbb, - 0x2a, 0xd8, 0x59, 0xbc, 0xd6, 0x0c, 0x35, 0xbd, 0x8e, 0x8e, 0x6b, 0xbc, 0x72, 0x5c, 0x10, 0x3c, - 0xa8, 0x0e, 0xc8, 0xbc, 0x04, 0xd5, 0x5b, 0xbd, 0x46, 0x41, 0xe6, 0xbc, 0x30, 0x36, 0x1d, 0xbd, - 0x0e, 0x82, 0xa8, 0xbf, 0xad, 0x89, 0x7d, 0xbf, 0xff, 0x6f, 0xc5, 0xbf, 0x6b, 0x68, 0x5f, 0xbf, - 0x33, 0xe4, 0x0c, 0xc0, 0x5d, 0xab, 0xbf, 0xbf, 0x72, 0x7b, 0xc8, 0xbf, 0x64, 0xa1, 0x4f, 0xc0, - 
0xdb, 0x44, 0x2e, 0xc0, 0xa4, 0x4f, 0xeb, 0xbf, 0xb0, 0x22, 0x2f, 0xc0, 0xc0, 0x68, 0xd6, 0xbf, - 0x8a, 0x79, 0x4a, 0xbf, 0xac, 0xaa, 0x2e, 0xc0, 0x85, 0xa0, 0x5a, 0xc0, 0x6b, 0xea, 0x90, 0xbf, - 0x1d, 0xa3, 0xa7, 0xbf, 0x58, 0x76, 0x12, 0xc0, 0x8e, 0x89, 0x93, 0x3f, 0x2f, 0x4d, 0x04, 0xc0, - 0x38, 0xfd, 0x80, 0xbf, 0x93, 0x77, 0xc4, 0xbf, 0x0a, 0x2d, 0x5d, 0xbf, 0x7f, 0x4f, 0x5a, 0xbf, - 0x06, 0xc5, 0x28, 0xc0, 0xa7, 0xb5, 0xa4, 0xbf, 0xe8, 0x10, 0xf1, 0xbf, 0xd8, 0xbe, 0xeb, 0xbf, - 0x1f, 0xc5, 0xae, 0xbf, 0x67, 0xba, 0x2d, 0xbf, 0x08, 0x16, 0x75, 0xc0, 0xce, 0xb9, 0x2a, 0xc0, - 0x2c, 0x68, 0xae, 0x3f, 0xb7, 0x64, 0xa5, 0x3f, 0x2d, 0x39, 0xa3, 0x3f, 0x14, 0xdf, 0x41, 0x3f, - 0xc8, 0x4a, 0x8e, 0x3f, 0xb5, 0xcc, 0xb4, 0x3f, 0x10, 0xaa, 0xc7, 0x3f, 0x78, 0x4b, 0x71, 0x3f, - 0xf2, 0x3a, 0xad, 0x3f, 0x94, 0x60, 0xbf, 0x3f, 0xdd, 0x36, 0x9b, 0x3f, 0x12, 0x42, 0xa3, 0x3f, - 0x97, 0xca, 0x73, 0x3f, 0x56, 0x1b, 0xc4, 0x3f, 0x48, 0xc8, 0x78, 0x3f, 0x3d, 0xe7, 0xe4, 0x3f, - 0xe3, 0x15, 0xa1, 0x3f, 0x56, 0x0a, 0xe1, 0x3f, 0xc6, 0xdd, 0x94, 0x3f, 0xf5, 0x42, 0x9f, 0x3f, - 0x8a, 0xd7, 0x43, 0x3f, 0x43, 0x9a, 0xc7, 0x3f, 0x84, 0x3b, 0xcf, 0x3f, 0xd3, 0x4d, 0x9a, 0x3f, - 0x2a, 0x9b, 0x89, 0x3f, 0xd0, 0x78, 0x12, 0x40, 0x32, 0xea, 0x68, 0x3f, 0xfa, 0x7c, 0x51, 0x3f, - 0x87, 0x4f, 0x9f, 0x3f, 0xfc, 0x37, 0xc9, 0x3f, 0xd9, 0x05, 0xa4, 0x3f, 0xf3, 0xc5, 0x80, 0x3f, - 0x53, 0xf3, 0x49, 0xbf, 0x32, 0xa5, 0x50, 0xbf, 0x08, 0x65, 0x7a, 0xbf, 0x80, 0xc3, 0x65, 0xbf, - 0x80, 0x7c, 0x44, 0xbf, 0x06, 0x58, 0x36, 0xbf, 0x78, 0xec, 0x4c, 0xbf, 0x56, 0x1c, 0x57, 0xbf, - 0xf0, 0x02, 0x8e, 0xbf, 0x6b, 0x0f, 0x80, 0xbf, 0x62, 0xd8, 0x76, 0xbf, 0x25, 0xcb, 0x43, 0xbf, - 0xe5, 0xe3, 0x36, 0xbf, 0x68, 0xc3, 0x7b, 0xbf, 0x93, 0x9b, 0x63, 0xbf, 0x4a, 0x14, 0x80, 0xbf, - 0x19, 0xc3, 0x6d, 0xbf, 0x20, 0x30, 0x73, 0xbf, 0x9c, 0x62, 0x18, 0xbf, 0x9d, 0x99, 0x39, 0xbf, - 0x33, 0x05, 0x54, 0xbf, 0xa5, 0x12, 0x86, 0xbf, 0xd4, 0xd0, 0x60, 0xbf, 0xdb, 0x27, 0x5f, 0xbf, - 0x56, 0x68, 0x41, 0xbf, 0xa0, 0xc0, 
0x6e, 0xbf, 0xbc, 0xa5, 0x61, 0xbf, 0x20, 0x59, 0x4e, 0xbf, - 0x02, 0x96, 0x61, 0xbf, 0x54, 0xa2, 0x78, 0xbf, 0x54, 0x4f, 0x6d, 0xbf, 0xd9, 0x9c, 0x3a, 0xbf, - 0x6c, 0x01, 0x86, 0xbe, 0xdf, 0xea, 0xaa, 0xbe, 0xe2, 0x2a, 0x3f, 0xbe, 0x3a, 0x0e, 0x33, 0xbe, - 0x02, 0x5d, 0xaf, 0xbe, 0x10, 0xf7, 0x96, 0xbe, 0x94, 0x6b, 0x03, 0xbf, 0x82, 0x9e, 0xee, 0xbd, - 0x5e, 0x36, 0xce, 0xbe, 0x8c, 0x56, 0x6c, 0xbe, 0xc9, 0x8b, 0xb7, 0xbe, 0x9b, 0x0e, 0x34, 0xbe, - 0x10, 0x9a, 0x8f, 0xbd, 0xe8, 0xea, 0x07, 0xbf, 0x40, 0x9f, 0x78, 0xbe, 0x59, 0x7e, 0xd9, 0xbe, - 0x3a, 0x1c, 0x58, 0xbe, 0x3a, 0xa8, 0x1d, 0xbf, 0x72, 0x71, 0x4c, 0xbe, 0x93, 0x14, 0xa3, 0xbe, - 0xa8, 0x34, 0xca, 0xbc, 0xc1, 0x0d, 0xfe, 0xbe, 0xae, 0x17, 0xb1, 0xbe, 0x7e, 0x53, 0xec, 0xbe, - 0x40, 0x98, 0x84, 0xbe, 0x5a, 0x35, 0x0c, 0xbf, 0xbe, 0x79, 0x1e, 0xbe, 0xd6, 0x08, 0xa6, 0x3d, - 0x74, 0x67, 0x42, 0xbe, 0x4e, 0xec, 0xd1, 0xbe, 0x18, 0xe9, 0x96, 0xbd, 0x08, 0x73, 0x67, 0xbe, - 0xea, 0x1f, 0xb0, 0x3d, 0x18, 0x59, 0xc4, 0x3d, 0xb4, 0x89, 0xe8, 0x3d, 0x46, 0xec, 0xf1, 0x3d, - 0x96, 0xf7, 0xc2, 0x3d, 0x44, 0x94, 0x98, 0x3d, 0x29, 0xfd, 0xbf, 0x3d, 0xf0, 0xb7, 0xcd, 0x3d, - 0x1d, 0xd7, 0x10, 0x3e, 0x18, 0xe9, 0xe4, 0x3d, 0x7d, 0x24, 0xfa, 0x3d, 0x10, 0xe3, 0xa5, 0x3d, - 0x27, 0xa3, 0xa1, 0x3d, 0x8a, 0xe4, 0xfb, 0x3d, 0x45, 0xa6, 0xe7, 0x3d, 0xa4, 0xce, 0xe4, 0x3d, - 0x68, 0x03, 0xdd, 0x3d, 0xef, 0xdd, 0xea, 0x3d, 0x86, 0xd2, 0x77, 0x3d, 0x8a, 0x65, 0xaa, 0x3d, - 0x38, 0xba, 0xcc, 0x3d, 0xbe, 0x10, 0x05, 0x3e, 0x64, 0xac, 0xc2, 0x3d, 0xaf, 0xc1, 0xe8, 0x3d, - 0xd4, 0x37, 0xb9, 0x3d, 0x7d, 0x59, 0xba, 0x3d, 0x8a, 0x83, 0xe0, 0x3d, 0xda, 0x73, 0xb7, 0x3d, - 0x8b, 0x2d, 0xcd, 0x3d, 0x66, 0x07, 0xe9, 0x3d, 0xab, 0xd6, 0xcc, 0x3d, 0xae, 0x66, 0xb2, 0x3d, - 0x40, 0x55, 0x08, 0x3d, 0xf9, 0x98, 0x43, 0x3d, 0x4e, 0xe9, 0x0c, 0x3d, 0xa2, 0xd8, 0x3b, 0x3d, - 0xc6, 0xd8, 0x57, 0x3d, 0x25, 0x2e, 0x06, 0x3d, 0xa5, 0x2f, 0x85, 0x3d, 0xac, 0xf7, 0xe5, 0x3c, - 0x78, 0xf2, 0x90, 0x3d, 0x42, 0x90, 0x0e, 0x3d, 0x66, 0x2b, 0x7c, 0x3d, 
0xde, 0x0d, 0xb5, 0x3c, - 0x88, 0x7b, 0x6d, 0x3c, 0x57, 0x37, 0x9f, 0x3d, 0x52, 0x93, 0x46, 0x3d, 0x6e, 0xc1, 0x5d, 0x3d, - 0x23, 0x0d, 0x14, 0x3d, 0x36, 0x71, 0xa6, 0x3d, 0x0e, 0x91, 0xab, 0x3c, 0x28, 0x4c, 0x2e, 0x3d, - 0x4e, 0xaa, 0xa0, 0x3c, 0x5c, 0x3c, 0x99, 0x3d, 0xcb, 0x50, 0x2b, 0x3d, 0x33, 0xc3, 0x94, 0x3d, - 0x53, 0x0f, 0x27, 0x3d, 0x57, 0x07, 0x57, 0x3d, 0xcc, 0x61, 0x17, 0x3d, 0xc0, 0xa1, 0x62, 0x3a, - 0x1b, 0x5c, 0xfe, 0x3c, 0x84, 0x2e, 0x6c, 0x3d, 0x48, 0xa7, 0x70, 0x3c, 0x38, 0xcd, 0x16, 0x3d, - 0x69, 0xc9, 0x48, 0x3d, 0x25, 0x4e, 0x44, 0x3d, 0x1b, 0xbb, 0x63, 0x3d, 0xbf, 0x9a, 0x34, 0x3d, - 0x9e, 0x3e, 0x30, 0x3d, 0x7c, 0xbc, 0x3e, 0x3d, 0x66, 0x28, 0x4b, 0x3d, 0x3b, 0x72, 0x3b, 0x3d, - 0x98, 0x2e, 0x72, 0x3d, 0x5a, 0xec, 0x73, 0x3d, 0x94, 0x88, 0x54, 0x3d, 0x54, 0x8b, 0x43, 0x3d, - 0xb6, 0x27, 0x2b, 0x3d, 0xfb, 0x76, 0x64, 0x3d, 0xb9, 0x79, 0x3e, 0x3d, 0x0a, 0x07, 0x7e, 0x3d, - 0x98, 0xc3, 0x59, 0x3d, 0x25, 0x67, 0x6a, 0x3d, 0xb4, 0x9a, 0x20, 0x3d, 0xd3, 0xfd, 0x33, 0x3d, - 0x45, 0x2c, 0x32, 0x3d, 0xce, 0xef, 0x71, 0x3d, 0x47, 0xb4, 0x63, 0x3d, 0x67, 0x8e, 0x40, 0x3d, - 0xcc, 0x2c, 0x30, 0x3d, 0x27, 0xb9, 0x87, 0x3d, 0x46, 0x93, 0x3d, 0x3d, 0xc2, 0xd2, 0x37, 0x3d, - 0xd9, 0x98, 0x52, 0x3d, 0x3a, 0x81, 0x6b, 0x3d, 0x8f, 0x17, 0x62, 0x3d, 0xff, 0x12, 0x29, 0x3d, - 0x3e, 0xa8, 0xb9, 0x3c, 0x14, 0x83, 0xc3, 0x3c, 0x1c, 0x4c, 0x83, 0x3c, 0x27, 0x6b, 0x02, 0x3c, - 0xd6, 0x66, 0xb2, 0x3c, 0x96, 0x7e, 0xd2, 0x3c, 0xe2, 0x5a, 0x10, 0x3d, 0xda, 0xe0, 0x1c, 0x3c, - 0xad, 0xe5, 0xc2, 0x3c, 0xc9, 0x96, 0xac, 0x3c, 0x94, 0xf3, 0xb0, 0x3c, 0x77, 0xd7, 0x95, 0x3c, - 0xfe, 0xcf, 0x1c, 0x3c, 0xc7, 0xc9, 0x07, 0x3d, 0x75, 0x74, 0x68, 0x3c, 0x1e, 0x19, 0x08, 0x3d, - 0x82, 0x8d, 0x8d, 0x3c, 0x6d, 0xfa, 0x25, 0x3d, 0x6a, 0x2b, 0x9f, 0x3c, 0xb9, 0x7c, 0xc1, 0x3c, - 0x00, 0xca, 0x59, 0x3b, 0xad, 0x0a, 0x01, 0x3d, 0x8f, 0x60, 0xed, 0x3c, 0xd6, 0x1f, 0xd9, 0x3c, - 0x42, 0xf9, 0x94, 0x3c, 0x7c, 0x9c, 0x40, 0x3d, 0x89, 0x02, 0x23, 0x3c, 0xf8, 0x0a, 0x09, 0x3a, - 0xaa, 0x04, 
0x8a, 0x3c, 0x2c, 0x22, 0xf1, 0x3c, 0x34, 0x57, 0x4d, 0x3c, 0xda, 0x25, 0x84, 0x3c, - 0x5e, 0x2a, 0x80, 0x3f, 0x3e, 0x79, 0xa5, 0x3f, 0x8c, 0x3c, 0xac, 0x3f, 0x67, 0xe8, 0xd3, 0x3f, - 0x99, 0x65, 0xb1, 0x3f, 0x96, 0x82, 0x5a, 0x3f, 0xfd, 0xb8, 0xb2, 0x3f, 0x6f, 0xe8, 0x9a, 0x3f, - 0x20, 0x25, 0x03, 0x40, 0x76, 0x3a, 0xa3, 0x3f, 0x32, 0xbb, 0xe1, 0x3f, 0x25, 0x4d, 0x57, 0x3f, - 0x80, 0x19, 0x50, 0x3f, 0xc4, 0x08, 0xf0, 0x3f, 0xab, 0x97, 0xc9, 0x3f, 0x7c, 0x9d, 0xb5, 0x3f, - 0x8a, 0x2d, 0xa7, 0x3f, 0x10, 0xe9, 0xe1, 0x3f, 0x22, 0x22, 0x20, 0x3f, 0x6e, 0x45, 0x8d, 0x3f, - 0xee, 0xb5, 0x93, 0x3f, 0x6c, 0xf7, 0xf4, 0x3f, 0xbd, 0x4c, 0x91, 0x3f, 0xfa, 0xad, 0xe5, 0x3f, - 0xda, 0x8f, 0x9a, 0x3f, 0x18, 0x63, 0x85, 0x3f, 0x98, 0xf7, 0xb5, 0x3f, 0x12, 0x19, 0x50, 0x3f, - 0x54, 0x26, 0x95, 0x3f, 0x8e, 0x93, 0xc4, 0x3f, 0x5c, 0x58, 0x78, 0x3f, 0xdb, 0x42, 0x92, 0x3f, - 0x85, 0x95, 0x87, 0x3f, 0x00, 0x26, 0xca, 0x3e, 0x00, 0x41, 0x6a, 0xbe, 0x81, 0xf9, 0x17, 0x3f, - 0xe6, 0x4d, 0x58, 0x3f, 0xeb, 0x87, 0x97, 0x3e, 0x6c, 0x7e, 0xf4, 0x3e, 0x42, 0x57, 0x94, 0x3e, - 0x40, 0x75, 0xfe, 0xbd, 0x1b, 0xe6, 0x29, 0x3f, 0x45, 0xa9, 0xdc, 0x3e, 0x08, 0xd0, 0x78, 0x3f, - 0x50, 0xbf, 0x10, 0x3e, 0x3e, 0x2d, 0x46, 0x3f, 0x9a, 0xfe, 0xf9, 0x3e, 0x21, 0x02, 0x19, 0x3f, - 0xf6, 0xee, 0xfe, 0x3e, 0xab, 0x17, 0xa4, 0x3e, 0x50, 0xf4, 0xd8, 0x3e, 0x4d, 0x9b, 0x50, 0x3f, - 0x91, 0x49, 0xe0, 0x3e, 0x18, 0x7d, 0xa3, 0x3e, 0x34, 0xa0, 0x1c, 0x3f, 0xa0, 0x3b, 0x87, 0xbd, - 0xdc, 0x35, 0x3c, 0xbe, 0xc0, 0x0b, 0x13, 0x3e, 0x26, 0xc0, 0xbe, 0x3e, 0x1d, 0x5b, 0xe6, 0x3e, - 0xc0, 0x12, 0x40, 0x3e, 0xe4, 0x95, 0x49, 0x3f, 0xbd, 0x32, 0x0a, 0x3f, 0x84, 0x1e, 0x9f, 0x3e, - 0x80, 0x9a, 0x6c, 0xbe, 0xb6, 0x82, 0x50, 0xbe, 0x58, 0xa5, 0xac, 0x3d, 0x2d, 0xdd, 0xca, 0xbe, - 0xf6, 0x81, 0x0c, 0xbf, 0x59, 0x1c, 0x5a, 0xbe, 0xf7, 0x69, 0x20, 0xbe, 0x57, 0x26, 0xb7, 0xbe, - 0x3a, 0xa6, 0x3f, 0xbe, 0xb6, 0x30, 0xd0, 0xbe, 0xb9, 0x13, 0xf6, 0xbe, 0x42, 0xb0, 0x9a, 0xbe, - 0xc6, 0x9c, 0xcf, 0x3d, 0x30, 0xe1, 0x21, 0xbf, 
0x00, 0x07, 0x85, 0xbe, 0x08, 0xcf, 0xb4, 0xbe, - 0x2c, 0x67, 0x49, 0xbe, 0x5f, 0xdc, 0x38, 0xbe, 0x50, 0x28, 0x30, 0xbe, 0x11, 0x6d, 0xcf, 0xbe, - 0x5d, 0x19, 0xe5, 0xbe, 0xe3, 0x78, 0xe0, 0xbd, 0xac, 0xd6, 0xd0, 0xbe, 0x96, 0x15, 0x45, 0xbe, - 0x42, 0x56, 0x5d, 0xbe, 0xf5, 0xa0, 0x08, 0xbe, 0x22, 0xfc, 0xa6, 0xbe, 0x5b, 0x73, 0x80, 0xbe, - 0x1b, 0x9b, 0x26, 0xbe, 0x79, 0xea, 0xa4, 0xbe, 0x5e, 0xab, 0xfa, 0xbe, 0xcb, 0x79, 0x0a, 0xbf, - 0x4e, 0x0d, 0x2e, 0xbd, 0x18, 0x18, 0xbd, 0xbd, 0x44, 0xe5, 0x45, 0x3e, 0x1d, 0x70, 0x1a, 0x3e, - 0xc2, 0x64, 0x8b, 0xbe, 0x0c, 0xf8, 0x8a, 0xbe, 0x92, 0x2a, 0xf4, 0xbd, 0xa7, 0x37, 0x98, 0xbd, - 0x7c, 0xe9, 0x1d, 0x3d, 0xa6, 0x54, 0x0f, 0xbd, 0xea, 0x01, 0x08, 0xbe, 0xfd, 0x82, 0x4d, 0xbe, - 0xc6, 0xf3, 0x90, 0x3d, 0x94, 0x19, 0x5f, 0xbe, 0xbc, 0x05, 0x93, 0xbe, 0xb6, 0x9f, 0xfd, 0xbd, - 0xd0, 0x36, 0x41, 0xbe, 0x4e, 0xc2, 0x43, 0xbe, 0x3a, 0x9d, 0xaa, 0xbc, 0x6a, 0xdc, 0x0f, 0xbe, - 0x88, 0x0a, 0x7a, 0x3c, 0x19, 0x83, 0xd2, 0x3d, 0x59, 0xb4, 0x89, 0xbd, 0xb9, 0xad, 0xdf, 0x3d, - 0x94, 0x1d, 0x12, 0x3d, 0x6a, 0xfa, 0x8f, 0xbe, 0xc0, 0xf1, 0x48, 0x3c, 0x38, 0x55, 0x12, 0x3d, - 0x60, 0xd1, 0x3d, 0xbb, 0x18, 0xda, 0xca, 0xbe, 0x33, 0x00, 0xf1, 0xbd, 0x8c, 0x4b, 0x76, 0x3d, - 0xa0, 0x55, 0xc1, 0x3a, 0x13, 0x38, 0xb4, 0x3c, 0x5c, 0x8e, 0x70, 0xbc, 0xfd, 0x7e, 0x03, 0x3d, - 0x8a, 0xce, 0x8a, 0x3d, 0xb7, 0x0b, 0x12, 0x3d, 0xd4, 0xdd, 0x5d, 0x3c, 0xa2, 0x71, 0x43, 0x3d, - 0xa0, 0x1d, 0xf8, 0x3c, 0xd4, 0x61, 0x26, 0x3d, 0x52, 0xd3, 0x83, 0x3d, 0x96, 0x0e, 0xbc, 0x3c, - 0x3c, 0x28, 0xc0, 0xbc, 0xec, 0xb7, 0xa3, 0x3d, 0xcc, 0xc6, 0x17, 0x3d, 0x79, 0xba, 0x20, 0x3d, - 0x38, 0x01, 0xb7, 0x3c, 0xe2, 0x9e, 0xd4, 0x3c, 0xe0, 0x6b, 0x50, 0x3c, 0x34, 0x6d, 0x26, 0x3d, - 0xc2, 0xfb, 0x53, 0x3d, 0x10, 0x62, 0x2c, 0x3a, 0xcc, 0xd7, 0x34, 0x3d, 0xe8, 0xcb, 0xd1, 0x3c, - 0x53, 0x97, 0x16, 0x3d, 0xbe, 0xec, 0xef, 0x3c, 0xd4, 0x00, 0x13, 0x3d, 0xac, 0xca, 0xaa, 0x3c, - 0xd6, 0xe4, 0x95, 0x3c, 0x10, 0x87, 0x2b, 0x3d, 0x9b, 0x32, 0x7b, 0x3d, 0x32, 0xcc, 
0x8a, 0x3d, - 0x95, 0x63, 0x8e, 0xbc, 0xae, 0x18, 0x32, 0x3c, 0x42, 0x89, 0xd1, 0xbc, 0x01, 0x24, 0xb9, 0xbc, - 0xf0, 0x50, 0x24, 0x3d, 0x04, 0xfa, 0x29, 0x3d, 0x8b, 0x89, 0x20, 0x3c, 0x31, 0xbc, 0x9e, 0x3c, - 0xfc, 0xda, 0xfc, 0x3b, 0x98, 0x9b, 0x62, 0x3b, 0x78, 0x62, 0xf2, 0x3c, 0xea, 0xed, 0x51, 0x3c, - 0xd6, 0x17, 0xa7, 0xbc, 0x5f, 0xab, 0x1d, 0x3d, 0xfd, 0xf8, 0x22, 0x3d, 0xc7, 0x9c, 0x85, 0x3c, - 0x22, 0x74, 0xb0, 0x3c, 0xd4, 0x56, 0xdd, 0x3c, 0x21, 0x3b, 0x1d, 0xbb, 0xdb, 0x1e, 0x68, 0x3c, - 0x00, 0xa0, 0xb3, 0x3b, 0x54, 0x9b, 0xa8, 0xbc, 0x85, 0x40, 0x25, 0x3c, 0x9a, 0x5e, 0x95, 0xbb, - 0x74, 0xc7, 0x3d, 0x3c, 0x52, 0x7e, 0x34, 0x3d, 0x7c, 0x44, 0x06, 0x3b, 0xc6, 0xfb, 0xff, 0xbb, - 0xee, 0xda, 0x17, 0x3b, 0x5c, 0xe0, 0x49, 0x3d, 0x80, 0xb9, 0xc5, 0x3c, 0x88, 0x3c, 0xfb, 0x3b, - 0xb5, 0x40, 0xd3, 0x3c, 0x6b, 0x02, 0x56, 0x3c, 0x5e, 0x5d, 0x95, 0xbb, 0x1f, 0x37, 0xda, 0x3c, - 0xe0, 0x0b, 0x00, 0x3d, 0x6a, 0x18, 0x1c, 0x3c, 0x7b, 0x3a, 0x4a, 0x3c, 0x28, 0xd6, 0x8d, 0x3c, - 0x38, 0xea, 0xb1, 0x3b, 0x92, 0xf1, 0xd3, 0x3c, 0x73, 0xe3, 0xc0, 0x3c, 0x46, 0xf6, 0xcc, 0x3c, - 0x00, 0x07, 0x13, 0xba, 0xe7, 0xd1, 0x0a, 0x3d, 0x7e, 0x27, 0x6a, 0x3c, 0x92, 0x68, 0xb1, 0x3c, - 0xdb, 0x29, 0x5a, 0x3c, 0xda, 0x47, 0x1f, 0x3c, 0x5c, 0x23, 0x59, 0x3c, 0x8e, 0x53, 0xdd, 0x3c, - 0x0a, 0xd5, 0xc6, 0x3c, 0x3a, 0x96, 0x2f, 0x3c, 0xa0, 0xf3, 0xc9, 0x3c, 0xc0, 0x2c, 0xf5, 0x3b, - 0x46, 0xd6, 0xb3, 0x3b, 0x8f, 0x64, 0x7e, 0x3b, 0x4d, 0x50, 0x98, 0x3c, 0x2c, 0x28, 0x8e, 0x3c, - 0x02, 0x97, 0x16, 0x3c, 0x35, 0xe4, 0xa5, 0x3c, 0xac, 0x0e, 0xd4, 0x3c, 0x94, 0xc2, 0xd6, 0x3c, - 0x3d, 0x04, 0x86, 0x3c, 0x5e, 0xad, 0xf5, 0x3b, 0xc3, 0xf0, 0x23, 0xbc, 0xd6, 0x3b, 0x3b, 0xba, - 0x9e, 0xcb, 0x8e, 0x3c, 0x03, 0xf5, 0x4b, 0x3c, 0x57, 0x90, 0x2b, 0x3c, 0xef, 0x14, 0x4e, 0x3b, - 0x89, 0xe3, 0xbf, 0xbb, 0xda, 0xe5, 0xee, 0x3b, 0x26, 0xb8, 0xc9, 0x3b, 0x76, 0x6a, 0xa3, 0x3c, - 0x68, 0xff, 0x7d, 0x3a, 0xa8, 0xbe, 0x57, 0x3c, 0xef, 0x45, 0x80, 0x3c, 0xf8, 0xf8, 0x26, 0x3c, - 0xc5, 0x9c, 0x53, 0x3c, 
0xcd, 0xff, 0x27, 0x3c, 0x10, 0x8c, 0xba, 0x3b, 0x25, 0xdc, 0x61, 0x3c, - 0x7a, 0xd4, 0x94, 0x3a, 0xa0, 0xea, 0xcb, 0x38, 0xd6, 0xaf, 0xf9, 0x3b, 0x5e, 0x1a, 0xf9, 0xbb, - 0x92, 0xbf, 0xe8, 0xbb, 0x0a, 0xa9, 0x38, 0x3c, 0xf6, 0x3b, 0xdb, 0x3a, 0xf2, 0x1a, 0x46, 0x3b, - 0x54, 0x6c, 0x9d, 0x3a, 0x81, 0x3d, 0xc4, 0x3c, 0x46, 0xe3, 0xee, 0x3b, 0xf4, 0x71, 0x7b, 0xbb, - 0x87, 0x34, 0xe3, 0xbe, 0x9a, 0xca, 0x8a, 0x3e, 0x6e, 0x9e, 0xca, 0xbe, 0xf0, 0xfb, 0x29, 0xbd, - 0xf0, 0x53, 0x80, 0x3f, 0x0a, 0x43, 0x4b, 0x3f, 0x78, 0xa1, 0x12, 0x3e, 0x27, 0xaf, 0x2e, 0x3f, - 0xa6, 0xce, 0xea, 0x3e, 0x32, 0xf1, 0xa8, 0x3e, 0xa1, 0x02, 0x73, 0x3f, 0xc2, 0x3f, 0x23, 0x3e, - 0x7b, 0x58, 0x07, 0xbf, 0xe9, 0xc5, 0x91, 0x3f, 0x06, 0x0f, 0x3e, 0x3f, 0x65, 0x77, 0xf3, 0x3e, - 0xc5, 0x02, 0xbd, 0x3e, 0x6c, 0x0d, 0x04, 0x3f, 0xc0, 0xe4, 0x26, 0x3c, 0xc3, 0x56, 0xd3, 0x3e, - 0x62, 0xcd, 0x08, 0x3f, 0xd6, 0xa2, 0x9b, 0xbe, 0x0b, 0x18, 0xea, 0x3e, 0x78, 0xb5, 0x81, 0x3e, - 0xd3, 0x01, 0x17, 0x3f, 0x50, 0x11, 0x4a, 0x3f, 0x26, 0xd6, 0xaa, 0x3e, 0xbb, 0x03, 0x0c, 0x3d, - 0x98, 0x76, 0x3d, 0x3e, 0x3f, 0x27, 0x54, 0x3f, 0x1c, 0x96, 0x53, 0x3f, 0x28, 0x60, 0x45, 0x3f, - 0x36, 0x71, 0x17, 0xbe, 0x6c, 0x6d, 0x37, 0xbe, 0x74, 0x8c, 0xe2, 0xbd, 0x54, 0x4b, 0xb7, 0xbd, - 0x4e, 0x2e, 0x1b, 0xbe, 0x96, 0x74, 0x11, 0xbe, 0x2e, 0x55, 0x2c, 0xbe, 0x93, 0xaf, 0xf6, 0xbd, - 0x70, 0x23, 0x29, 0xbe, 0xf8, 0xc8, 0x2f, 0xbe, 0xa0, 0x24, 0x0f, 0xbe, 0x24, 0x4d, 0x3c, 0xbe, - 0x36, 0xb2, 0x03, 0xbe, 0xd4, 0x27, 0x50, 0xbe, 0xdb, 0xd3, 0xe7, 0xbd, 0x20, 0xdc, 0x5e, 0xbe, - 0xa3, 0x71, 0x08, 0xbe, 0xb6, 0x6a, 0x79, 0xbe, 0xba, 0x07, 0x15, 0xbe, 0x6e, 0x1c, 0x22, 0xbe, - 0x92, 0xcf, 0x8a, 0xbd, 0x43, 0xb1, 0x33, 0xbe, 0x84, 0x8d, 0x53, 0xbe, 0x8f, 0x3b, 0x14, 0xbe, - 0x65, 0xd7, 0x0f, 0xbe, 0xf2, 0xf7, 0x8e, 0xbe, 0x1f, 0x3d, 0xeb, 0xbd, 0xcf, 0x77, 0xfc, 0xbd, - 0x16, 0x3f, 0x02, 0xbe, 0xaa, 0x90, 0x2b, 0xbe, 0xf8, 0xe4, 0x1c, 0xbe, 0x9c, 0x3b, 0x9e, 0xbd, - 0x90, 0x69, 0x9b, 0x3d, 0x8f, 0x25, 0xe0, 0x3d, 0x94, 0x6f, 
0xbd, 0x3d, 0x82, 0x9e, 0xee, 0x3d, - 0xa0, 0x1d, 0xc9, 0x3d, 0x83, 0xf4, 0x9b, 0x3d, 0x5b, 0xcf, 0xba, 0x3d, 0x24, 0xb4, 0xbb, 0x3d, - 0x47, 0x81, 0x07, 0x3e, 0x3d, 0xed, 0xd8, 0x3d, 0xab, 0xd6, 0xe7, 0x3d, 0x1d, 0x6a, 0xb9, 0x3d, - 0x24, 0xeb, 0xb6, 0x3d, 0xda, 0xbe, 0xf8, 0x3d, 0x3a, 0x72, 0xd3, 0x3d, 0xb0, 0xd3, 0xfe, 0x3d, - 0x28, 0xd7, 0xd7, 0x3d, 0x8c, 0xea, 0xea, 0x3d, 0xd0, 0x12, 0xa3, 0x3d, 0x37, 0x4e, 0xb5, 0x3d, - 0xcf, 0x5c, 0xda, 0x3d, 0x9f, 0xfb, 0xe7, 0x3d, 0x9e, 0xc0, 0xe1, 0x3d, 0x86, 0xde, 0xe4, 0x3d, - 0x1c, 0x89, 0xc4, 0x3d, 0x69, 0xf8, 0xe9, 0x3d, 0x0a, 0x02, 0xe2, 0x3d, 0xf3, 0x5f, 0xbd, 0x3d, - 0xac, 0x98, 0xc3, 0x3d, 0xf2, 0x39, 0xf2, 0x3d, 0x03, 0x0b, 0xc4, 0x3d, 0x2c, 0x43, 0x95, 0x3d, - 0xba, 0x18, 0xdf, 0x3c, 0xe6, 0x62, 0x43, 0x3d, 0x5c, 0x49, 0x71, 0x3c, 0x98, 0x0d, 0xa2, 0x3c, - 0x03, 0xc3, 0x51, 0x3d, 0xf8, 0x28, 0x21, 0x3d, 0xc2, 0x03, 0x68, 0x3d, 0xd0, 0x56, 0x1e, 0x3c, - 0xc3, 0xbb, 0x42, 0x3d, 0x64, 0x5a, 0x11, 0x3d, 0x6e, 0xb5, 0x40, 0x3d, 0xd3, 0xe5, 0xd4, 0x3c, - 0x98, 0x58, 0xbb, 0xba, 0xfa, 0x57, 0xa1, 0x3d, 0x5a, 0x0d, 0x93, 0x3c, 0xbf, 0x91, 0x78, 0x3d, - 0xc6, 0x52, 0xf3, 0x3c, 0x00, 0xcd, 0xb5, 0x3d, 0xac, 0x79, 0x34, 0x3d, 0x4a, 0x18, 0x0b, 0x3d, - 0xd0, 0xb2, 0x2b, 0xbb, 0x14, 0xa3, 0x80, 0x3d, 0x70, 0xc0, 0x5f, 0x3d, 0x61, 0x8a, 0x85, 0x3d, - 0x7f, 0x1a, 0xac, 0x3c, 0x3c, 0x5b, 0x9f, 0x3d, 0x6a, 0x7a, 0x75, 0x3c, 0x2b, 0x0c, 0x24, 0xbc, - 0x41, 0x0f, 0xb9, 0x3c, 0x64, 0x15, 0x89, 0x3d, 0xb0, 0x10, 0xad, 0xba, 0x04, 0xd2, 0x79, 0x3c, - 0x0a, 0x09, 0x00, 0xbc, 0xb1, 0x39, 0x52, 0xbc, 0x72, 0xf9, 0x32, 0xbc, 0x7a, 0xab, 0x7d, 0xbc, - 0x31, 0x87, 0x48, 0xbc, 0x4b, 0xcd, 0x0d, 0xbc, 0x04, 0xaa, 0x32, 0xbc, 0xe4, 0x60, 0x27, 0xbc, - 0xca, 0xf9, 0x88, 0xbc, 0x72, 0xa1, 0x43, 0xbc, 0xb0, 0x67, 0x70, 0xbc, 0x1c, 0x5c, 0x0f, 0xbc, - 0xe6, 0x17, 0x14, 0xbc, 0x0a, 0x04, 0x7c, 0xbc, 0xb6, 0x9e, 0x4e, 0xbc, 0xc8, 0xa5, 0x6c, 0xbc, - 0x20, 0x92, 0x52, 0xbc, 0xf0, 0x98, 0x5d, 0xbc, 0xed, 0x4c, 0x18, 0xbc, 0x28, 0xed, 0x1e, 0xbc, - 
0x62, 0xdf, 0x5e, 0xbc, 0x92, 0xff, 0x69, 0xbc, 0x38, 0xd3, 0x4a, 0xbc, 0x7b, 0x00, 0x79, 0xbc, - 0x70, 0x56, 0x30, 0xbc, 0xd8, 0x01, 0x40, 0xbc, 0xce, 0xc0, 0x5c, 0xbc, 0xfc, 0xb8, 0x17, 0xbc, - 0xc1, 0xc1, 0x37, 0xbc, 0xf0, 0x00, 0x7e, 0xbc, 0x31, 0x7e, 0x16, 0xbc, 0x14, 0x69, 0x15, 0xbc, - 0x88, 0x4c, 0x41, 0xbb, 0x02, 0x20, 0xda, 0xbb, 0xa9, 0x37, 0x4e, 0xbb, 0xc9, 0x5e, 0xbe, 0xbb, - 0x96, 0x14, 0xf7, 0xbb, 0xf2, 0x00, 0xa3, 0xbb, 0x46, 0x0b, 0xf4, 0xbb, 0xa3, 0x32, 0x04, 0xbb, - 0x72, 0x09, 0x07, 0xbc, 0x06, 0x76, 0xa0, 0xbb, 0x87, 0x04, 0x04, 0xbc, 0x55, 0x6e, 0x16, 0xbb, - 0x40, 0x51, 0x15, 0x38, 0x24, 0x18, 0x36, 0xbc, 0x33, 0xbf, 0x85, 0xbb, 0xba, 0x36, 0x04, 0xbc, - 0x4f, 0x20, 0xad, 0xbb, 0xe7, 0x1a, 0x33, 0xbc, 0x48, 0x10, 0xbc, 0xbb, 0x33, 0x0a, 0x8b, 0xbb, - 0x74, 0x8f, 0x2f, 0xbb, 0x55, 0x52, 0x17, 0xbc, 0x99, 0x72, 0xdf, 0xbb, 0x2a, 0xbd, 0x2c, 0xbc, - 0x9a, 0x1e, 0x56, 0xbb, 0xe7, 0x50, 0x04, 0xbc, 0x3a, 0xfd, 0x80, 0xbb, 0xbe, 0x74, 0x81, 0x3a, - 0x56, 0x95, 0x80, 0xbb, 0xb2, 0xe3, 0x29, 0xbc, 0xa0, 0x13, 0x08, 0x3a, 0x2c, 0xee, 0x5b, 0xbb, - 0x9e, 0xdc, 0xa2, 0xbb, 0x01, 0xc7, 0xd4, 0xbb, 0xd6, 0x0a, 0xa8, 0xbb, 0x82, 0xf2, 0xb7, 0xbb, - 0xd6, 0x74, 0xb6, 0xbb, 0x86, 0x08, 0x9b, 0xbb, 0x9a, 0xcb, 0xb4, 0xbb, 0x55, 0x72, 0xae, 0xbb, - 0xfe, 0x56, 0xe9, 0xbb, 0x6c, 0x5f, 0xd1, 0xbb, 0xdc, 0x2c, 0xc4, 0xbb, 0xfd, 0x07, 0xc9, 0xbb, - 0x13, 0x93, 0xb4, 0xbb, 0xb1, 0xfe, 0xe3, 0xbb, 0xf6, 0x55, 0xb5, 0xbb, 0xd8, 0xe5, 0xf6, 0xbb, - 0xb2, 0x1b, 0xbe, 0xbb, 0xa3, 0x9c, 0xed, 0xbb, 0x82, 0x47, 0x9f, 0xbb, 0x44, 0x55, 0xb5, 0xbb, - 0x5b, 0x1b, 0xa9, 0xbb, 0x5e, 0x4c, 0xd1, 0xbb, 0x89, 0xe7, 0xe1, 0xbb, 0xbd, 0xab, 0xbd, 0xbb, - 0x16, 0x08, 0xba, 0xbb, 0x5f, 0x81, 0x02, 0xbc, 0xbb, 0x32, 0xc0, 0xbb, 0x06, 0x0d, 0xb9, 0xbb, - 0xd1, 0x91, 0xb1, 0xbb, 0xa5, 0x6b, 0xd1, 0xbb, 0xfa, 0xd7, 0xc9, 0xbb, 0xc4, 0x03, 0x7b, 0xbb, - 0x6b, 0x4d, 0x26, 0xbb, 0xa2, 0x3a, 0x5f, 0xbb, 0x3a, 0x7d, 0xa2, 0xba, 0x66, 0xb3, 0x4b, 0xba, - 0xe1, 0xef, 0x52, 0xbb, 0x68, 0x77, 
0x3d, 0xbb, 0x71, 0x4e, 0x78, 0xbb, 0x6b, 0x78, 0xa0, 0xba, - 0xb4, 0xd9, 0x3c, 0xbb, 0xf9, 0xf1, 0x3b, 0xbb, 0x64, 0x93, 0x2f, 0xbb, 0xea, 0x8e, 0x3e, 0xbb, - 0xd7, 0x2e, 0x7f, 0xba, 0xcb, 0x12, 0x9e, 0xbb, 0x6a, 0x5b, 0xa6, 0xba, 0xca, 0x76, 0x8e, 0xbb, - 0x73, 0x33, 0x04, 0xbb, 0x9b, 0x1e, 0xc3, 0xbb, 0x74, 0x05, 0x4a, 0xbb, 0x6b, 0xda, 0x37, 0xbb, - 0x9c, 0x02, 0x9e, 0x39, 0x42, 0x3e, 0x7d, 0xbb, 0x9d, 0xcd, 0x86, 0xbb, 0xd8, 0xd0, 0x62, 0xbb, - 0x30, 0xe5, 0xfc, 0xba, 0xce, 0x51, 0xc9, 0xbb, 0x28, 0xc2, 0x8f, 0xba, 0x93, 0x2b, 0x09, 0xba, - 0xef, 0x6a, 0xe8, 0xba, 0xce, 0x9c, 0x7a, 0xbb, 0x3b, 0x62, 0xab, 0xba, 0xc5, 0xa2, 0x78, 0xba, - 0x02, 0x49, 0xac, 0xbd, 0x76, 0xa4, 0x32, 0xbe, 0x9e, 0xc5, 0x05, 0xbe, 0x7a, 0x60, 0x5e, 0xbe, - 0xb8, 0x10, 0x3d, 0xbe, 0x10, 0x4b, 0xf1, 0xbd, 0x36, 0x56, 0x28, 0xbe, 0x28, 0x2e, 0xdc, 0xbd, - 0x0c, 0x49, 0x75, 0xbe, 0xc4, 0x04, 0x16, 0xbe, 0xee, 0x12, 0x62, 0xbe, 0xc0, 0x55, 0xa0, 0xbd, - 0x87, 0x96, 0x8d, 0xbd, 0xdb, 0x31, 0x7d, 0xbe, 0x54, 0x52, 0x23, 0xbe, 0x02, 0xa4, 0x4b, 0xbe, - 0xb2, 0xcf, 0x2e, 0xbe, 0xc6, 0x63, 0x5b, 0xbe, 0xe1, 0xf1, 0x07, 0xbe, 0x75, 0x32, 0xf0, 0xbd, - 0x56, 0x37, 0x2a, 0xbe, 0x46, 0x01, 0x62, 0xbe, 0x6c, 0x47, 0x28, 0xbe, 0x3a, 0xbd, 0x81, 0xbe, - 0x9e, 0x54, 0xfb, 0xbd, 0x34, 0x8e, 0x1c, 0xbe, 0xf3, 0x28, 0x2b, 0xbe, 0x52, 0xf9, 0x7e, 0xbd, - 0xf6, 0xd8, 0x0d, 0xbe, 0x97, 0x28, 0x7f, 0xbe, 0x7d, 0x72, 0x72, 0xbd, 0x98, 0xde, 0xf7, 0xbd, - 0x95, 0xc4, 0xf0, 0xbd, 0xb2, 0x15, 0xb7, 0xbd, 0x44, 0xeb, 0x22, 0x3d, 0xde, 0xc6, 0x95, 0xbd, - 0xe2, 0xbc, 0xff, 0xbd, 0xa8, 0xf9, 0xf8, 0xbc, 0x0a, 0x00, 0x76, 0xbd, 0x00, 0xf9, 0x61, 0xbd, - 0x88, 0xa6, 0x8e, 0xbc, 0x1a, 0x49, 0xb5, 0xbd, 0x5c, 0x6a, 0x7c, 0xbd, 0x4c, 0x38, 0x1d, 0xbe, - 0x08, 0x9e, 0x42, 0xbd, 0x75, 0x8a, 0x02, 0xbe, 0xab, 0xcc, 0x84, 0xbd, 0x95, 0xe1, 0xc9, 0xbd, - 0xae, 0x3f, 0x6a, 0xbd, 0xf2, 0x67, 0xcc, 0xbd, 0xf2, 0xa7, 0x92, 0xbd, 0xf8, 0x49, 0xec, 0xbd, - 0x54, 0xb4, 0x0f, 0xbd, 0x14, 0xd5, 0x52, 0xbd, 0x93, 0xd5, 0xd8, 0xbd, 
0xf0, 0x38, 0x90, 0xbc, - 0xb0, 0xf7, 0x6e, 0xbc, 0xbc, 0x92, 0x84, 0xbd, 0x0c, 0x6a, 0x7a, 0xbd, 0x12, 0x05, 0xb1, 0xbd, - 0xb0, 0x0a, 0xb0, 0xbc, 0xde, 0x72, 0xb6, 0xbd, 0x2b, 0xe9, 0xa1, 0xbd, 0x46, 0x7e, 0xfd, 0xbb, - 0xd4, 0xb0, 0xa4, 0x3c, 0x7e, 0xdd, 0x45, 0x3d, 0xd7, 0x27, 0x61, 0xbc, 0x2d, 0xf4, 0x87, 0x3d, - 0x32, 0x55, 0x9c, 0x3d, 0x0d, 0xcf, 0xd6, 0x3c, 0xb0, 0xd0, 0xdb, 0x3c, 0x01, 0xc0, 0x31, 0x3d, - 0x2e, 0x3e, 0x24, 0x3d, 0xe6, 0x9c, 0x3e, 0x3d, 0x0e, 0xcb, 0x84, 0x3d, 0x68, 0x4f, 0x35, 0x3d, - 0x90, 0x85, 0xfd, 0x3b, 0xae, 0xd6, 0xb0, 0x3d, 0x54, 0x24, 0x25, 0x3d, 0x7a, 0x72, 0x74, 0x3d, - 0x9f, 0xd2, 0x03, 0x3d, 0x07, 0xad, 0x1a, 0x3d, 0x2e, 0xcb, 0x18, 0x3d, 0x80, 0x97, 0x67, 0x3d, - 0xcb, 0x62, 0x8c, 0x3d, 0x84, 0xa7, 0x9f, 0x3c, 0x6b, 0x6f, 0x81, 0x3d, 0xe9, 0x85, 0x33, 0x3d, - 0x65, 0x27, 0x2c, 0x3d, 0x99, 0x96, 0x0c, 0x3d, 0x6a, 0x91, 0x60, 0x3d, 0x62, 0x5a, 0x17, 0x3d, - 0x88, 0xcb, 0xc1, 0x3c, 0x26, 0x92, 0x5a, 0x3d, 0xfb, 0xec, 0x54, 0x3d, 0x74, 0x81, 0x5d, 0x3d, - 0x16, 0xfe, 0xad, 0x3b, 0x3c, 0xf1, 0xc0, 0x3c, 0x76, 0x9b, 0xbe, 0xbc, 0xc5, 0x7b, 0x51, 0xbc, - 0x6b, 0xfc, 0x34, 0x3d, 0x28, 0x90, 0x17, 0x3d, 0xb2, 0x26, 0x8d, 0x3c, 0x7c, 0x6f, 0xb2, 0x3b, - 0x8c, 0x8e, 0x5f, 0x3b, 0x46, 0x4d, 0x82, 0x3c, 0xbc, 0x8c, 0xc8, 0x3c, 0xef, 0x42, 0xe9, 0x3c, - 0xf1, 0x42, 0x7e, 0xbc, 0x89, 0x75, 0x42, 0x3d, 0xea, 0x63, 0xb7, 0x3c, 0x3a, 0x99, 0xfa, 0x3c, - 0xa4, 0x01, 0xe1, 0x3c, 0x5c, 0xa7, 0x3d, 0x3d, 0x1b, 0x75, 0xd6, 0x3c, 0x9b, 0x59, 0x84, 0x3c, - 0x54, 0xbd, 0xd8, 0xbb, 0xac, 0xdb, 0x3e, 0x3b, 0xb9, 0x3c, 0xdb, 0x3c, 0x48, 0x23, 0x05, 0x3c, - 0xd5, 0x8a, 0x0a, 0xbc, 0xd6, 0x8f, 0x51, 0x3d, 0xd0, 0x6a, 0xe1, 0xba, 0x96, 0x81, 0xb3, 0xbb, - 0x9a, 0x45, 0x7f, 0x3b, 0x4e, 0x41, 0x86, 0x3d, 0x7c, 0xc1, 0x49, 0x3b, 0x40, 0x04, 0x5b, 0xbc, - 0xf4, 0x29, 0x27, 0x3a, 0xaa, 0x61, 0xb2, 0xbb, 0x6d, 0xe4, 0xf8, 0x3a, 0xb6, 0x75, 0xe1, 0xbb, - 0x8e, 0x99, 0x1b, 0xbc, 0x57, 0x50, 0x9c, 0xbb, 0xe0, 0x09, 0x3d, 0xbb, 0x62, 0xf6, 0xa1, 0xbb, - 0x0a, 0xa9, 
0xb7, 0xbb, 0x36, 0x1c, 0x9e, 0xbb, 0x5b, 0xc0, 0x11, 0xbc, 0xea, 0x0b, 0x41, 0xbb, - 0xef, 0x2e, 0x9d, 0x3a, 0xf2, 0x79, 0x35, 0xbc, 0x9e, 0x05, 0xa4, 0xbb, 0x50, 0xc9, 0xeb, 0xbb, - 0xe4, 0xb8, 0x8c, 0xbb, 0xbe, 0x74, 0x98, 0xbb, 0x01, 0x07, 0x94, 0xbb, 0x02, 0x5c, 0xb2, 0xbb, - 0x4e, 0xe2, 0x0f, 0xbc, 0x51, 0x5e, 0xb1, 0xba, 0xd8, 0xd8, 0xee, 0xbb, 0x08, 0x83, 0xd1, 0xbb, - 0x9c, 0xcc, 0xb0, 0xbb, 0x32, 0xd3, 0xb5, 0xbb, 0x04, 0xe3, 0xc7, 0xbb, 0xc2, 0x79, 0x1c, 0xbb, - 0x8a, 0x5f, 0x46, 0xbb, 0x0c, 0x31, 0x08, 0xbc, 0xcb, 0x33, 0xae, 0xbb, 0x94, 0x74, 0xeb, 0xbb, - 0xbd, 0xbe, 0x0a, 0x3b, 0x55, 0x88, 0x42, 0xbb, 0xa4, 0xde, 0x3a, 0x3b, 0x18, 0xda, 0x87, 0x3a, - 0x20, 0xdb, 0xcd, 0xbb, 0x0b, 0xa4, 0xbf, 0xbb, 0xf6, 0x36, 0xfc, 0xba, 0x82, 0x6c, 0x96, 0xba, - 0x7f, 0xc6, 0xfd, 0xba, 0x69, 0xf6, 0xe6, 0xba, 0xeb, 0x40, 0x9f, 0xbb, 0x69, 0x1e, 0xb3, 0xba, - 0x28, 0x00, 0x67, 0x3b, 0x09, 0x94, 0xeb, 0xbb, 0x73, 0x87, 0x52, 0xbb, 0xd2, 0x77, 0x8c, 0xbb, - 0x81, 0x88, 0x7a, 0xbb, 0x34, 0x70, 0xb4, 0xbb, 0x9b, 0x26, 0x5f, 0xbb, 0x95, 0x1a, 0xb8, 0xba, - 0x0c, 0xe8, 0xa5, 0xba, 0xb2, 0xa0, 0x9f, 0x39, 0x21, 0xe4, 0x6e, 0xbb, 0xb6, 0x0a, 0x39, 0xbb, - 0x18, 0xe2, 0xb5, 0xb9, 0xfc, 0x00, 0xed, 0xbb, 0x08, 0x8d, 0xe9, 0xb9, 0x5c, 0x2e, 0xf3, 0x3a, - 0x8b, 0xba, 0x89, 0xba, 0xd5, 0x2a, 0x1c, 0xbc, 0x4c, 0x31, 0xdf, 0xb9, 0x10, 0x29, 0xe7, 0xb9, - 0x8b, 0x77, 0x2d, 0xbb, 0xf2, 0x11, 0x45, 0xbb, 0x79, 0xa1, 0x68, 0x3a, 0xed, 0x7f, 0x7d, 0xbb, - 0xd8, 0xe5, 0x8f, 0xbb, 0x47, 0xa7, 0x86, 0xba, 0xe5, 0xdb, 0xea, 0xba, 0xd1, 0x72, 0x22, 0xbb, - 0xee, 0x8a, 0xe5, 0xba, 0xef, 0x05, 0x46, 0xbb, 0x8e, 0x9b, 0x4e, 0xbb, 0x45, 0x29, 0x7d, 0xbb, - 0x14, 0x40, 0xa7, 0xba, 0x6f, 0x47, 0x9c, 0xbb, 0x90, 0x92, 0x17, 0xbb, 0x47, 0x47, 0x65, 0xbb, - 0x61, 0x1e, 0xee, 0xba, 0x2e, 0xe4, 0x23, 0xbb, 0x70, 0x72, 0x14, 0xbb, 0xa8, 0xbd, 0x7b, 0xbb, - 0xe7, 0x33, 0x54, 0xbb, 0x0f, 0x77, 0xcb, 0xba, 0x24, 0x0b, 0x79, 0xbb, 0xc0, 0x92, 0xf0, 0xba, - 0x1d, 0x4d, 0xfc, 0xba, 0xe6, 0x4a, 0xe2, 0xba, 
0x42, 0xed, 0x4a, 0xbb, 0xc7, 0xb5, 0x3d, 0xbb, - 0xf6, 0xd6, 0x9c, 0xba, 0x19, 0xbf, 0x2e, 0xbb, 0xfe, 0x95, 0x53, 0xbb, 0xd4, 0x4d, 0x18, 0xbb, - 0xd6, 0xfa, 0xf9, 0xba, 0xe3, 0xe8, 0xe7, 0xba, 0x2a, 0xbd, 0xb2, 0x3a, 0xd4, 0x89, 0xbd, 0x38, - 0xb7, 0x73, 0x36, 0xbb, 0xb0, 0x4e, 0xcd, 0xba, 0x7f, 0xed, 0xab, 0xba, 0x3e, 0x5e, 0x18, 0xba, - 0x84, 0x3e, 0xae, 0x38, 0xa5, 0x4e, 0xc3, 0xba, 0x86, 0xb7, 0x94, 0xba, 0xeb, 0x6a, 0x49, 0xbb, - 0x38, 0x76, 0xed, 0xb8, 0x01, 0x2f, 0x39, 0xbb, 0x55, 0xa1, 0xb9, 0xba, 0xc9, 0xf5, 0x05, 0xbb, - 0x19, 0x35, 0xcf, 0xba, 0xa6, 0xdf, 0x3f, 0xbb, 0x78, 0xfd, 0xdf, 0xba, 0x99, 0xd0, 0xee, 0xba, - 0xc9, 0x5a, 0x08, 0x3a, 0xa0, 0xa7, 0x3d, 0xba, 0x5c, 0xa4, 0x01, 0xbb, 0xd0, 0xec, 0x52, 0xb8, - 0xb2, 0x1f, 0x1d, 0x3a, 0x3d, 0x53, 0x28, 0xbb, 0x7f, 0x18, 0x8d, 0xb9, 0x3b, 0xb5, 0x4a, 0xba, - 0x96, 0xa5, 0x5a, 0xb9, 0xab, 0xb2, 0x56, 0xbb, 0x22, 0x55, 0x4d, 0xba, 0x5f, 0x68, 0x89, 0x3a, - 0xc7, 0x52, 0x77, 0x3d, 0x2d, 0x0b, 0x94, 0xbd, 0x09, 0x2a, 0x32, 0x3d, 0x6c, 0x67, 0x42, 0xbd, - 0x64, 0xcf, 0x15, 0xbe, 0x3a, 0x18, 0xe5, 0xbd, 0x42, 0x16, 0x20, 0xbd, 0x5f, 0x26, 0x60, 0xbd, - 0x57, 0xd4, 0xa4, 0xbd, 0xc0, 0xe5, 0x54, 0xbd, 0x6d, 0xb1, 0x0e, 0xbe, 0x14, 0xf3, 0x25, 0xbc, - 0x0f, 0x2b, 0x85, 0x3d, 0x2c, 0xc9, 0x31, 0xbe, 0x67, 0x24, 0x9c, 0xbd, 0x66, 0x15, 0xd6, 0xbd, - 0x08, 0xbe, 0x9d, 0xbd, 0xfd, 0xf6, 0xb6, 0xbd, 0x82, 0xde, 0x91, 0xbd, 0xbe, 0x65, 0x45, 0xbd, - 0x7e, 0x89, 0xd5, 0xbd, 0x80, 0x37, 0x8c, 0x3a, 0x46, 0xb7, 0xc5, 0xbd, 0x56, 0x5a, 0xcb, 0xbd, - 0xa0, 0xac, 0x79, 0xbd, 0x40, 0xac, 0x02, 0xbe, 0x9f, 0x03, 0x6f, 0xbd, 0xe2, 0xec, 0x7f, 0x3c, - 0x6b, 0xe5, 0x25, 0xbd, 0x3f, 0xc2, 0x36, 0xbe, 0x15, 0xa2, 0x37, 0xbd, 0x8d, 0xc0, 0xab, 0xbd, - 0x68, 0x9b, 0xbd, 0xbd, 0xbb, 0xc6, 0x78, 0xbd, 0xde, 0xab, 0xae, 0xbd, 0xb7, 0x11, 0x44, 0xbd, - 0x94, 0x26, 0x75, 0xbd, 0x7b, 0x8f, 0xbb, 0xbd, 0xa8, 0x49, 0xc7, 0xbd, 0x9a, 0x7a, 0x4a, 0xbd, - 0xce, 0x7e, 0x8b, 0xbd, 0x14, 0xaf, 0xb9, 0xbd, 0x98, 0x46, 0x93, 0xbd, 0xca, 0x71, 
0x82, 0xbd, - 0x67, 0xab, 0x36, 0xbd, 0x80, 0xa7, 0xa4, 0xbd, 0xea, 0xfa, 0x70, 0xbd, 0x84, 0x77, 0xcc, 0xbd, - 0xb5, 0x41, 0xa6, 0xbd, 0x4a, 0xdc, 0xa3, 0xbd, 0xe2, 0xc6, 0x81, 0xbd, 0xa0, 0x6b, 0x91, 0xbd, - 0x29, 0x22, 0x6c, 0xbd, 0x44, 0xc4, 0xbb, 0xbd, 0xd4, 0x20, 0xb1, 0xbd, 0xf6, 0xa5, 0x7f, 0xbd, - 0xe6, 0x49, 0x46, 0xbd, 0xbc, 0x09, 0xf5, 0xbd, 0x3f, 0x15, 0x4b, 0xbd, 0x78, 0xa5, 0x13, 0xbd, - 0x89, 0x50, 0xa1, 0xbd, 0x80, 0xc9, 0xd2, 0xbd, 0xa3, 0x2a, 0x98, 0xbd, 0xa3, 0xa1, 0xa0, 0xbd, - 0xbe, 0xed, 0x5b, 0x3d, 0x20, 0xd7, 0x21, 0x3d, 0xac, 0x7a, 0x80, 0x3d, 0xea, 0x3d, 0x43, 0x3d, - 0x45, 0xa9, 0x34, 0x3d, 0x5a, 0x2b, 0x37, 0x3d, 0xca, 0x12, 0x3e, 0x3d, 0x82, 0xcf, 0x59, 0x3d, - 0x58, 0xd0, 0x79, 0x3d, 0x60, 0x74, 0x84, 0x3d, 0x32, 0x17, 0x6d, 0x3d, 0x93, 0x7b, 0x36, 0x3d, - 0xf3, 0x17, 0x0d, 0x3d, 0xec, 0xcd, 0x6c, 0x3d, 0xe8, 0x8c, 0x53, 0x3d, 0xdc, 0x56, 0x60, 0x3d, - 0x95, 0x33, 0x5e, 0x3d, 0xb6, 0x10, 0x54, 0x3d, 0x33, 0x0e, 0xf0, 0x3c, 0x66, 0x84, 0x2d, 0x3d, - 0x17, 0x2f, 0x3a, 0x3d, 0x43, 0x25, 0x80, 0x3d, 0xc0, 0xcc, 0x47, 0x3d, 0x4d, 0xaa, 0x36, 0x3d, - 0x34, 0xaa, 0x22, 0x3d, 0x91, 0x9d, 0x4a, 0x3d, 0x18, 0xdb, 0x44, 0x3d, 0x9c, 0xe5, 0x42, 0x3d, - 0xe6, 0xc9, 0x5a, 0x3d, 0x7d, 0xce, 0x5d, 0x3d, 0x43, 0x17, 0x80, 0x3d, 0xeb, 0x55, 0x56, 0x3d, - 0x54, 0x03, 0x86, 0x3c, 0x8e, 0x70, 0x73, 0x3c, 0x12, 0xe1, 0x37, 0x3c, 0x01, 0x5c, 0x01, 0x3c, - 0x58, 0xc5, 0x85, 0x3c, 0xf6, 0x5e, 0x8a, 0x3c, 0x72, 0x4e, 0xfd, 0x3c, 0x36, 0xcf, 0x16, 0x3c, - 0x48, 0x80, 0xac, 0x3c, 0x6e, 0xde, 0x0d, 0x3c, 0x63, 0x46, 0x97, 0x3c, 0xa7, 0x4e, 0x18, 0x3c, - 0x38, 0xe9, 0xfc, 0x3b, 0x10, 0xed, 0xbc, 0x3c, 0x66, 0xa3, 0xb3, 0x3c, 0xa8, 0x99, 0x9b, 0x3c, - 0x65, 0x51, 0x38, 0x3c, 0x41, 0x8a, 0xdf, 0x3c, 0xd7, 0xff, 0x34, 0x3a, 0xfb, 0xd5, 0xa8, 0x3c, - 0x1c, 0x55, 0x7f, 0x3b, 0x41, 0x7f, 0xbd, 0x3c, 0xcd, 0xcc, 0x4b, 0x3c, 0x3e, 0x1f, 0x93, 0x3c, - 0x42, 0xf7, 0x94, 0x3c, 0x4a, 0xd5, 0xd6, 0x3c, 0x4e, 0x01, 0x20, 0x3c, 0xa6, 0x72, 0x95, 0xbb, - 0x06, 0x76, 0x25, 0x3c, 
0xc8, 0x43, 0x90, 0x3c, 0x78, 0xbc, 0x2a, 0x3c, 0x24, 0xb1, 0x8b, 0x3c, - 0xdf, 0x97, 0xbd, 0xbb, 0x48, 0x36, 0x98, 0xbb, 0x76, 0x94, 0xe9, 0xbb, 0xe8, 0xf0, 0xc2, 0xbb, - 0x93, 0x2c, 0xb2, 0xbb, 0xf4, 0x71, 0x93, 0xbb, 0x6b, 0x76, 0xac, 0xbb, 0x92, 0x75, 0xde, 0xbb, - 0x8e, 0xf0, 0x01, 0xbc, 0xe6, 0x0b, 0xe9, 0xbb, 0x76, 0x37, 0xec, 0xbb, 0xbc, 0xa3, 0xa4, 0xbb, - 0xf6, 0x3e, 0x85, 0xbb, 0x84, 0x22, 0xea, 0xbb, 0xdc, 0x86, 0xe1, 0xbb, 0x39, 0x47, 0xbf, 0xbb, - 0x42, 0x67, 0xc4, 0xbb, 0x48, 0xcb, 0xd3, 0xbb, 0x7f, 0xe6, 0x17, 0xbb, 0x24, 0x6e, 0xa4, 0xbb, - 0xcb, 0xd5, 0xa6, 0xbb, 0x78, 0x4e, 0xf5, 0xbb, 0xe0, 0xab, 0xa5, 0xbb, 0x27, 0x73, 0xb4, 0xbb, - 0x30, 0x1d, 0xac, 0xbb, 0x14, 0x6b, 0x9b, 0xbb, 0x73, 0x12, 0xc6, 0xbb, 0x14, 0x08, 0xbb, 0xbb, - 0xde, 0x0e, 0xc1, 0xbb, 0xe1, 0x99, 0xb6, 0xbb, 0x13, 0x56, 0xf3, 0xbb, 0x96, 0x78, 0xc7, 0xbb, - 0xd2, 0x82, 0x06, 0xbb, 0xfb, 0xda, 0x0e, 0xbb, 0x00, 0x94, 0x01, 0xbb, 0x27, 0x3d, 0x01, 0xbb, - 0x97, 0x4e, 0x2e, 0xbb, 0x3e, 0x08, 0xe1, 0xba, 0xef, 0x73, 0x73, 0xbb, 0xd1, 0xbe, 0x1c, 0xbb, - 0x58, 0x07, 0x81, 0xbb, 0xbf, 0xf8, 0xc5, 0xba, 0xb8, 0x1b, 0x56, 0xbb, 0x9e, 0x75, 0xc4, 0xba, - 0x17, 0xa6, 0xb6, 0xba, 0x03, 0x86, 0x70, 0xbb, 0x4c, 0x24, 0x80, 0xbb, 0xfe, 0x17, 0x14, 0xbb, - 0xe6, 0x05, 0xde, 0xba, 0xd1, 0x8e, 0x83, 0xbb, 0xde, 0x79, 0x0e, 0x3a, 0x3a, 0x4d, 0x3a, 0xbb, - 0x98, 0x3c, 0x75, 0xba, 0x1c, 0x25, 0x68, 0xbb, 0xb9, 0x5d, 0xba, 0xba, 0x38, 0x55, 0x3a, 0xbb, - 0x0f, 0x23, 0x4b, 0xbb, 0xae, 0x84, 0x1e, 0xbb, 0x43, 0x2d, 0x11, 0xbb, 0x94, 0x5a, 0x01, 0xba, - 0x31, 0x7a, 0xcc, 0xba, 0x65, 0x72, 0xfb, 0xba, 0x7e, 0x13, 0x11, 0xbb, 0x38, 0xc2, 0x27, 0xbb, - 0xfe, 0xad, 0x5b, 0xbb, 0xc7, 0x45, 0x17, 0xbb, 0x9e, 0x23, 0x6e, 0xbb, 0x57, 0xf6, 0x22, 0xbb, - 0xa6, 0x64, 0x20, 0xbb, 0xc5, 0x18, 0x44, 0xbb, 0x0c, 0x00, 0x43, 0xbb, 0xe7, 0x23, 0x31, 0xbb, - 0x25, 0x09, 0x4f, 0xbb, 0x2b, 0xe2, 0x7a, 0xbb, 0x88, 0xac, 0x4d, 0xbb, 0xd5, 0x71, 0x2b, 0xbb, - 0x29, 0x17, 0xfe, 0xba, 0x72, 0x01, 0x53, 0xbb, 0x50, 0x3a, 
0x2d, 0xbb, 0xb1, 0x17, 0x64, 0xbb, - 0x75, 0xc6, 0x54, 0xbb, 0x05, 0x47, 0x40, 0xbb, 0x55, 0xaa, 0x0c, 0xbb, 0xc6, 0xc8, 0x24, 0xbb, - 0xd6, 0x7c, 0x2b, 0xbb, 0x99, 0xf7, 0x6a, 0xbb, 0x85, 0x28, 0x4c, 0xbb, 0x50, 0x0f, 0x23, 0xbb, - 0x4d, 0x94, 0x07, 0xbb, 0xde, 0xc5, 0x66, 0xbb, 0xcc, 0x44, 0x24, 0xbb, 0xf5, 0x0b, 0x20, 0xbb, - 0xdc, 0x08, 0x51, 0xbb, 0xc9, 0x13, 0x67, 0xbb, 0xe7, 0x53, 0x62, 0xbb, 0x60, 0x4d, 0x48, 0xbb, - 0x11, 0xaf, 0xc2, 0xba, 0xfa, 0xf9, 0x8c, 0xba, 0x50, 0xb2, 0x8a, 0xba, 0x08, 0x90, 0x02, 0xba, - 0xbe, 0xbe, 0x8a, 0xba, 0xc1, 0xd1, 0xd1, 0xba, 0x98, 0x43, 0x10, 0xbb, 0xf8, 0x36, 0x04, 0xba, - 0xc2, 0x5e, 0x98, 0xba, 0xea, 0xa8, 0x86, 0xba, 0xdb, 0x05, 0x99, 0xba, 0x05, 0xae, 0x5f, 0xba, - 0x92, 0xd8, 0x1d, 0xba, 0xdf, 0x43, 0xc2, 0xba, 0x80, 0xaf, 0x97, 0xba, 0xee, 0xb8, 0xdd, 0xba, - 0x58, 0xc1, 0x8f, 0xba, 0x1c, 0x15, 0xe0, 0xba, 0x67, 0x9f, 0x3b, 0xba, 0x7f, 0x02, 0xbb, 0xba, - 0x76, 0x3a, 0x0d, 0xba, 0x5e, 0x77, 0xd3, 0xba, 0x27, 0x28, 0xaa, 0xba, 0x8a, 0x8d, 0x97, 0xba, - 0x4a, 0x11, 0x82, 0xba, 0x22, 0x9d, 0x1a, 0xbb, 0xe7, 0x23, 0x1b, 0xba, 0xba, 0x0c, 0x2d, 0x39, - 0x16, 0x31, 0x86, 0xba, 0x02, 0xad, 0xde, 0xba, 0x4a, 0x1e, 0x5e, 0xba, 0xcc, 0x6b, 0xa9, 0xba, - 0xed, 0x31, 0x85, 0xbd, 0x9d, 0x4c, 0x7c, 0xbd, 0x73, 0x1f, 0xa6, 0xbd, 0xac, 0xfe, 0x9d, 0xbd, - 0x26, 0xea, 0x9b, 0xbd, 0x68, 0xad, 0x40, 0xbd, 0x9b, 0x07, 0x9d, 0xbd, 0x6c, 0x36, 0xba, 0xbd, - 0xbe, 0x93, 0xee, 0xbd, 0x52, 0x3d, 0x98, 0xbd, 0x17, 0x8b, 0xcc, 0xbd, 0xb3, 0x5e, 0x6b, 0xbd, - 0x13, 0xec, 0x4d, 0xbd, 0x81, 0x05, 0xd0, 0xbd, 0xfa, 0x8c, 0xda, 0xbd, 0xa2, 0x70, 0x87, 0xbd, - 0x37, 0x8f, 0x87, 0xbd, 0x47, 0x74, 0xc8, 0xbd, 0x9e, 0xf9, 0x86, 0xbb, 0xaf, 0xc1, 0x90, 0xbd, - 0xc2, 0xaf, 0x5a, 0xbd, 0x27, 0x27, 0xcf, 0xbd, 0x3b, 0xee, 0x51, 0xbd, 0x2e, 0x6e, 0xa0, 0xbd, - 0x9c, 0xa5, 0xa7, 0xbd, 0x00, 0x7f, 0x4f, 0xbd, 0x3a, 0xec, 0xa4, 0xbd, 0xe9, 0xd6, 0x78, 0xbd, - 0xcc, 0x5b, 0x83, 0xbd, 0xda, 0xc9, 0x6f, 0xbd, 0xb7, 0x0e, 0xb8, 0xbd, 0xa9, 0x12, 0x9f, 0xbd, - 
0x7c, 0x8c, 0x9e, 0xbd, 0x2e, 0x03, 0x82, 0xbc, 0x97, 0x56, 0x25, 0xbc, 0x41, 0x8d, 0x22, 0xbd, - 0xda, 0x86, 0x3e, 0xbd, 0x33, 0x74, 0x12, 0xbd, 0xfa, 0xe8, 0x30, 0xbd, 0x0d, 0x8f, 0x89, 0xbc, - 0x1a, 0xfd, 0xbc, 0x3b, 0x53, 0x15, 0x49, 0xbd, 0x1e, 0x79, 0x06, 0xbd, 0xe4, 0xb9, 0x46, 0xbd, - 0xea, 0xaf, 0x84, 0xbb, 0x41, 0x14, 0x2e, 0xbd, 0xba, 0xf3, 0x0d, 0xbd, 0x5c, 0x18, 0x25, 0xbd, - 0xfc, 0xbf, 0x30, 0xbd, 0x66, 0x2c, 0x05, 0xbc, 0x80, 0x4c, 0xd8, 0xbc, 0x5d, 0xe8, 0x4a, 0xbd, - 0xc1, 0x2c, 0x29, 0xbd, 0x18, 0x21, 0xf3, 0xbc, 0x32, 0xfc, 0x13, 0xbd, 0x78, 0x5d, 0x98, 0x3a, - 0xb9, 0x17, 0x6e, 0x3c, 0xaa, 0x19, 0x4b, 0xbc, 0x4c, 0x1a, 0xba, 0xbc, 0x76, 0x65, 0x90, 0xbc, - 0x27, 0x4b, 0xd4, 0xbc, 0x44, 0x72, 0x82, 0xbd, 0x80, 0x74, 0x18, 0xbd, 0x8e, 0xdf, 0x32, 0xbd, - 0xc4, 0x60, 0xd3, 0x3c, 0x41, 0x81, 0x12, 0x3c, 0x23, 0x83, 0x1d, 0x3c, 0x2b, 0x27, 0xb9, 0x3c, - 0xd7, 0xe0, 0x07, 0x3d, 0x34, 0xd5, 0xa1, 0x3c, 0x6c, 0xde, 0x68, 0x3c, 0xbe, 0xf6, 0xed, 0x3c, - 0x8c, 0xb3, 0x73, 0x3c, 0xf6, 0x49, 0x0f, 0x3d, 0x94, 0x0b, 0x0a, 0x3d, 0x52, 0x72, 0xaf, 0x3c, - 0xc2, 0xd6, 0xcf, 0xbb, 0xbe, 0xe5, 0x24, 0x3d, 0xb3, 0x59, 0xa5, 0x3c, 0xd1, 0x78, 0xb7, 0x3c, - 0x79, 0x8b, 0x90, 0x3c, 0xb9, 0xf2, 0x59, 0x3c, 0x7d, 0x67, 0x05, 0x3c, 0xde, 0x03, 0xd8, 0x3c, - 0x28, 0x6a, 0xd8, 0x3c, 0x8f, 0x71, 0x80, 0x3c, 0x1a, 0xf7, 0xce, 0x3c, 0xee, 0x3a, 0x2e, 0x3c, - 0x26, 0xd3, 0x50, 0x3c, 0xd4, 0xc1, 0x1b, 0x3c, 0x85, 0x43, 0xa6, 0x3c, 0xb4, 0x65, 0xa2, 0x3c, - 0x3b, 0x48, 0x8f, 0x3c, 0x3a, 0xbb, 0xb1, 0x3c, 0x02, 0x9a, 0x26, 0x3d, 0x7a, 0xd3, 0x2f, 0x3d, - 0xe6, 0xda, 0xb0, 0x3b, 0xfc, 0x6f, 0x36, 0x3b, 0x1a, 0xbe, 0xfe, 0xbb, 0x7c, 0x6f, 0x09, 0xbc, - 0x80, 0xfd, 0x51, 0x3c, 0x26, 0xc6, 0x80, 0x3c, 0x14, 0xbc, 0x37, 0x3c, 0x58, 0x7f, 0xe8, 0x3b, - 0x38, 0x29, 0x84, 0xba, 0x9c, 0x48, 0x94, 0xba, 0x47, 0x5d, 0xeb, 0x3b, 0xc2, 0xab, 0x2c, 0x3c, - 0xb8, 0x8a, 0x30, 0x3a, 0x90, 0xca, 0xf2, 0x3b, 0xae, 0xce, 0xc5, 0x3c, 0x1b, 0x52, 0x81, 0x3b, - 0x44, 0x00, 0x26, 0x3c, 0x75, 0x5e, 
0xc6, 0x3b, 0x3d, 0x2e, 0x07, 0xbc, 0xfa, 0xd4, 0x3f, 0x3c, - 0x40, 0xe2, 0xf8, 0x3a, 0xb0, 0x64, 0xdf, 0xbb, 0xce, 0xba, 0xc3, 0xba, 0x64, 0x59, 0x2d, 0xbc, - 0x6c, 0x3a, 0x62, 0x3b, 0x52, 0x5d, 0x53, 0x3c, 0x50, 0x6b, 0xb9, 0x3a, 0x88, 0x4e, 0x16, 0xbb, - 0xb4, 0xf7, 0x44, 0x3a, 0x9d, 0x9b, 0x8a, 0x3c, 0xb3, 0xc5, 0x4e, 0x3c, 0x94, 0xdc, 0x33, 0x3b, - 0xda, 0x70, 0xcc, 0xba, 0x5a, 0x6b, 0x79, 0xba, 0xb6, 0xf3, 0x3b, 0xba, 0x25, 0x01, 0xd8, 0xba, - 0xee, 0x3e, 0x85, 0xbb, 0xa9, 0x99, 0x27, 0xbb, 0x7f, 0x3b, 0xa4, 0xba, 0xe0, 0x0f, 0x88, 0xbb, - 0x1a, 0x84, 0x1b, 0xbb, 0x1f, 0xf1, 0x68, 0xbb, 0x20, 0x90, 0x8d, 0xbb, 0x32, 0x15, 0x0b, 0xbb, - 0x3e, 0xfb, 0x89, 0x3a, 0x6a, 0x98, 0xa3, 0xbb, 0xed, 0xd5, 0x49, 0xbb, 0x32, 0x0c, 0x11, 0xbb, - 0x54, 0xc6, 0xe0, 0xba, 0x6d, 0xd9, 0x04, 0xbb, 0x50, 0x8b, 0x95, 0x38, 0xfa, 0x5d, 0x3d, 0xbb, - 0x70, 0xb1, 0x2f, 0xbb, 0xc1, 0xea, 0x85, 0xba, 0x26, 0x8a, 0x24, 0xbb, 0xbd, 0xd0, 0x90, 0xba, - 0x1a, 0x5a, 0x28, 0xbb, 0x58, 0x4b, 0xd9, 0xba, 0x4c, 0x86, 0x1a, 0xbb, 0x10, 0xdb, 0x14, 0xbb, - 0xff, 0x93, 0xeb, 0xba, 0xf8, 0x93, 0x01, 0xbb, 0xc8, 0xf1, 0xb3, 0xbb, 0xcd, 0xc9, 0xa7, 0xbb, - 0x72, 0x16, 0xfc, 0x39, 0x13, 0xf9, 0xaf, 0xb9, 0x4d, 0xee, 0x85, 0x3a, 0x51, 0x30, 0xbe, 0x3a, - 0x1d, 0x15, 0x05, 0xbb, 0x38, 0x27, 0x0d, 0xbb, 0xa4, 0xd9, 0x79, 0xba, 0x9f, 0x40, 0x00, 0xbb, - 0xb4, 0xb5, 0x4d, 0xba, 0x00, 0x03, 0x6f, 0x38, 0x2b, 0xa7, 0xda, 0xba, 0x7c, 0x96, 0x87, 0xba, - 0x36, 0x13, 0xb8, 0x39, 0x84, 0xd3, 0xdf, 0xba, 0x1e, 0xcd, 0x63, 0xbb, 0xec, 0x12, 0xc1, 0xb9, - 0x92, 0x9b, 0x7c, 0xba, 0x0f, 0xb0, 0xaa, 0xba, 0xe4, 0x36, 0xe0, 0x3a, 0x58, 0x93, 0xba, 0xba, - 0xd8, 0x16, 0x70, 0xb9, 0x38, 0xf3, 0xa0, 0x3a, 0xc8, 0x1a, 0x2d, 0x39, 0x85, 0x3f, 0x85, 0x3a, - 0xc6, 0xe3, 0xd6, 0xba, 0xde, 0xe3, 0x02, 0xbb, 0xc2, 0x46, 0xf6, 0xb9, 0xb0, 0x0f, 0x7a, 0xb8, - 0x80, 0x66, 0x01, 0xb9, 0xf4, 0x8e, 0xc4, 0xba, 0x07, 0x09, 0x30, 0xbb, 0x67, 0x7b, 0x90, 0xba, - 0x26, 0x8d, 0x0f, 0xbb, 0xfc, 0x1e, 0x14, 0xba, 0x62, 0x12, 0x24, 0xba, 
0x97, 0x87, 0xd3, 0xba, - 0x50, 0xab, 0xf3, 0xba, 0x21, 0x74, 0x95, 0xba, 0xd7, 0xd6, 0x91, 0xba, 0xf5, 0xd5, 0xa7, 0xba, - 0x16, 0xa7, 0xf8, 0xb9, 0xb7, 0x85, 0x0c, 0xbb, 0xe3, 0x9a, 0xe1, 0xba, 0xf5, 0xfc, 0xc1, 0xba, - 0xa0, 0x77, 0x3b, 0x39, 0x2e, 0x4a, 0x0b, 0xbb, 0x46, 0x5b, 0x87, 0xba, 0xfa, 0xaa, 0xbf, 0xba, - 0xb7, 0x64, 0xa4, 0xba, 0xe5, 0x71, 0x12, 0xba, 0x25, 0x70, 0x5e, 0xba, 0xaf, 0xa4, 0xda, 0xba, - 0xdd, 0xf6, 0xd8, 0xba, 0xff, 0x11, 0x99, 0xba, 0x1a, 0x8d, 0xcd, 0xba, 0x0d, 0x21, 0x06, 0xba, - 0xe4, 0xf7, 0x49, 0xb9, 0xd2, 0xae, 0xd2, 0xb9, 0x63, 0x11, 0x93, 0xba, 0xa4, 0xb3, 0x8b, 0xba, - 0x90, 0x59, 0x8d, 0xba, 0x5d, 0x0f, 0xda, 0xba, 0xec, 0x46, 0x02, 0xbb, 0x69, 0xf2, 0x16, 0xbb, - 0x12, 0x5f, 0x99, 0xba, 0x27, 0xcf, 0x86, 0xb9, 0x13, 0x57, 0x7f, 0x39, 0xb6, 0xed, 0x86, 0xb8, - 0x22, 0x85, 0x5c, 0xba, 0x60, 0x03, 0x76, 0xba, 0x02, 0x5f, 0x7c, 0xba, 0xac, 0xb6, 0x3f, 0xb9, - 0xa0, 0xb3, 0xa7, 0x39, 0x11, 0x79, 0xb1, 0xb9, 0xe7, 0x38, 0xcf, 0xb9, 0x02, 0x66, 0x75, 0xba, - 0x23, 0x62, 0x34, 0xb9, 0x29, 0x35, 0xfd, 0xb9, 0x76, 0x52, 0xa1, 0xba, 0x4a, 0x82, 0x0d, 0xba, - 0xe4, 0x50, 0x66, 0xba, 0x66, 0xbc, 0x4d, 0xb9, 0xa8, 0x3e, 0xf2, 0xb7, 0xc2, 0x20, 0x75, 0xba, - 0x6d, 0x1b, 0xe1, 0xb9, 0x5b, 0x60, 0x34, 0xb8, 0x3a, 0x8a, 0x71, 0xb9, 0x35, 0xef, 0x0f, 0x3a, - 0xec, 0xa4, 0x8e, 0x39, 0xcd, 0xd3, 0x15, 0xba, 0x78, 0x9f, 0x3a, 0xb9, 0x92, 0xe5, 0x27, 0x38, - 0xc6, 0x2f, 0x7d, 0xb9, 0xdf, 0xc2, 0xba, 0xba, 0x9e, 0x66, 0x19, 0xba, 0xac, 0x3b, 0x9a, 0xb9, - 0x57, 0x74, 0xec, 0x3b, 0x74, 0x3b, 0x2c, 0xbc, 0xcd, 0x0a, 0xc3, 0x3b, 0x30, 0x7f, 0xfc, 0x3b, - 0xa1, 0x51, 0x69, 0xbd, 0x92, 0x7a, 0x34, 0xbd, 0xb7, 0xb7, 0x65, 0xbc, 0xf0, 0x28, 0x84, 0xbd, - 0xbc, 0x69, 0x17, 0xbd, 0xf8, 0xc7, 0xdd, 0xbc, 0x2a, 0x53, 0x72, 0xbd, 0xab, 0x8e, 0xbf, 0xbc, - 0x06, 0xab, 0x89, 0x3c, 0xd1, 0x9c, 0x84, 0xbd, 0xda, 0x4c, 0x85, 0xbd, 0x4c, 0x5d, 0x9b, 0xbc, - 0x26, 0x7a, 0x9e, 0xbc, 0xff, 0x17, 0x11, 0xbd, 0x18, 0x4c, 0xda, 0x3c, 0xc9, 0x03, 0x14, 0xbd, - 0x26, 0x95, 
0xb7, 0xbc, 0x74, 0x94, 0x1f, 0x3c, 0x52, 0x2d, 0x8e, 0xbc, 0x20, 0x39, 0xa8, 0xba, - 0xea, 0xcc, 0x50, 0xbd, 0x1a, 0xbb, 0x1b, 0xbd, 0xb7, 0x10, 0xd6, 0xbc, 0x49, 0xb9, 0xb2, 0xbc, - 0x8e, 0xe1, 0x77, 0xbc, 0x55, 0xfa, 0xbd, 0xbc, 0x9d, 0x5a, 0xa9, 0xbd, 0x47, 0x07, 0x72, 0xbd, - 0x25, 0x97, 0x0a, 0xc0, 0x6c, 0xcf, 0x30, 0xc0, 0x6d, 0x42, 0x8e, 0xbe, 0xd7, 0x8f, 0xb6, 0xbf, - 0x4d, 0x71, 0x2b, 0xc0, 0x19, 0x12, 0xa7, 0xbf, 0x7a, 0xd8, 0xf2, 0xbf, 0xa8, 0xe4, 0xda, 0xbf, - 0xc3, 0x92, 0xe8, 0xbf, 0x50, 0x27, 0x12, 0xc0, 0xa4, 0xed, 0xe4, 0xbf, 0x4f, 0x0d, 0x58, 0xc0, - 0x52, 0x2a, 0xe9, 0xbf, 0x69, 0xd4, 0x4e, 0xc0, 0xa5, 0xab, 0xc9, 0xbf, 0xfe, 0x08, 0x3d, 0xc0, - 0xb3, 0x05, 0xc5, 0xbf, 0xaa, 0xde, 0x65, 0xc0, 0xe6, 0x4b, 0x04, 0xc0, 0x18, 0xe7, 0x23, 0xc0, - 0x02, 0x01, 0x24, 0xbf, 0x18, 0x65, 0xff, 0xbf, 0x6e, 0xe4, 0x40, 0xc0, 0x71, 0x59, 0xca, 0xbf, - 0x91, 0x25, 0xd4, 0xbf, 0x84, 0x76, 0x55, 0xc0, 0x29, 0xcf, 0xd6, 0xbf, 0x8f, 0x72, 0x0c, 0xc0, - 0x62, 0xfb, 0x93, 0xbf, 0x5e, 0x1c, 0x05, 0xc0, 0x3f, 0x90, 0x07, 0xc0, 0x63, 0x20, 0x93, 0xbe, - 0x5b, 0xc5, 0x2c, 0x3f, 0xd6, 0x18, 0xcd, 0x3f, 0x9a, 0xcf, 0x02, 0x3f, 0xba, 0xf6, 0xe2, 0x3f, - 0x39, 0xbc, 0xd0, 0x3f, 0x2e, 0x74, 0x57, 0x3f, 0xb2, 0xc4, 0x86, 0x3f, 0x11, 0x49, 0x90, 0x3f, - 0xc0, 0x95, 0xce, 0x3f, 0x41, 0xa3, 0x9d, 0x3f, 0xc6, 0x22, 0xcb, 0x3f, 0x7a, 0x63, 0x9c, 0x3f, - 0x52, 0xfa, 0x7e, 0x3f, 0x76, 0xde, 0xf3, 0x3f, 0x76, 0x58, 0xa5, 0x3f, 0x14, 0x86, 0xdf, 0x3f, - 0x9a, 0x19, 0x9c, 0x3f, 0x47, 0x36, 0xb8, 0x3f, 0x1f, 0xd8, 0x96, 0x3f, 0x49, 0x0a, 0xaa, 0x3f, - 0xea, 0xdc, 0xd5, 0x3f, 0x82, 0xd3, 0x8f, 0x3f, 0x78, 0x86, 0xd1, 0x3f, 0x69, 0x8e, 0xc4, 0x3f, - 0xcc, 0xab, 0xab, 0x3f, 0x26, 0xe8, 0xb6, 0x3f, 0xa6, 0x38, 0xc9, 0x3f, 0x45, 0x05, 0x93, 0x3f, - 0x09, 0x94, 0x80, 0x3f, 0x00, 0x62, 0xcd, 0x3f, 0x86, 0x9b, 0x93, 0x3f, 0x54, 0xb6, 0x73, 0x3f, - 0x69, 0x1c, 0x85, 0x3e, 0x76, 0xcb, 0x3f, 0x3f, 0xe0, 0x45, 0xfc, 0xbd, 0xc0, 0xfd, 0xb1, 0x3d, - 0xce, 0x4a, 0x78, 0x3f, 0xe2, 0x73, 0x38, 0x3f, 
0xfc, 0x38, 0x22, 0x3f, 0xb2, 0x0a, 0xab, 0x3d, - 0xde, 0x77, 0xfb, 0x3e, 0xc4, 0x50, 0x12, 0x3f, 0x46, 0x57, 0x34, 0x3f, 0xce, 0xd7, 0x08, 0x3f, - 0xbd, 0xe8, 0x81, 0xbe, 0x17, 0x87, 0xaa, 0x3f, 0x54, 0x94, 0x54, 0x3e, 0x1c, 0x2a, 0x75, 0x3f, - 0x00, 0xc8, 0x0e, 0x3f, 0x55, 0xdd, 0xb5, 0x3f, 0x01, 0x64, 0x63, 0x3f, 0x77, 0x9f, 0xd0, 0x3e, - 0x1c, 0x13, 0x15, 0xbe, 0xc6, 0xd7, 0x30, 0x3f, 0x16, 0x37, 0x69, 0x3f, 0xb4, 0xf0, 0x55, 0x3f, - 0xc0, 0x85, 0xaa, 0x3c, 0xfd, 0x0d, 0xa8, 0x3f, 0x94, 0x90, 0xbc, 0x3d, 0xdf, 0x3c, 0x14, 0xbe, - 0xed, 0x64, 0x81, 0x3e, 0x15, 0xfc, 0xb0, 0x3f, 0x3d, 0xec, 0xef, 0xbd, 0x68, 0x0e, 0xc5, 0xbd, - 0x2f, 0x3c, 0x2c, 0xbd, 0x33, 0x2d, 0x3e, 0xbe, 0x79, 0xc8, 0x7a, 0xbd, 0x86, 0x1f, 0x64, 0xbe, - 0x66, 0xb4, 0x50, 0xbe, 0xa6, 0x38, 0xf3, 0xbd, 0x5f, 0x68, 0x01, 0xbe, 0x14, 0x18, 0xee, 0xbd, - 0x7b, 0x07, 0x53, 0xbe, 0xd4, 0x7a, 0x0c, 0xbe, 0x9c, 0xc8, 0x5a, 0xbe, 0x42, 0x9d, 0xc7, 0xbd, - 0xff, 0xa2, 0x9b, 0xbd, 0x70, 0x8b, 0x7a, 0xbe, 0xcc, 0x43, 0x1b, 0xbe, 0x53, 0x71, 0x58, 0xbe, - 0x45, 0x56, 0x23, 0xbe, 0xfb, 0x33, 0x2b, 0xbe, 0xde, 0xee, 0x1b, 0xbe, 0xe8, 0x7e, 0x09, 0xbe, - 0x8c, 0x50, 0x63, 0xbe, 0xce, 0xda, 0x0b, 0xbe, 0x8d, 0x32, 0x43, 0xbe, 0x1e, 0xb2, 0x60, 0xbe, - 0xa4, 0x7f, 0x16, 0xbe, 0xf8, 0xdc, 0x2c, 0xbe, 0xb0, 0xe5, 0x3c, 0xbe, 0x08, 0x37, 0xbc, 0xbd, - 0x01, 0x8b, 0xff, 0xbd, 0xc4, 0x42, 0x76, 0xbe, 0x30, 0xa9, 0xc5, 0xbd, 0x14, 0x7e, 0x03, 0xbe, - 0x70, 0x95, 0x12, 0xba, 0x3c, 0xa2, 0xcd, 0xbd, 0x08, 0x81, 0x0f, 0x3b, 0x86, 0xce, 0x5d, 0xbd, - 0x8b, 0x08, 0x0d, 0xbe, 0x9d, 0x6b, 0xda, 0xbd, 0xd1, 0xf6, 0xac, 0xbd, 0x84, 0xbb, 0x42, 0xbc, - 0x1c, 0x1c, 0xc0, 0xbd, 0xa8, 0xca, 0x91, 0xbd, 0x4e, 0x69, 0x00, 0xbe, 0xc6, 0x77, 0xeb, 0xbc, - 0x4f, 0x9e, 0x48, 0x3d, 0x24, 0xdf, 0x3f, 0xbe, 0x3f, 0xfc, 0x30, 0xbd, 0x81, 0xb0, 0x07, 0xbe, - 0x93, 0x23, 0xbf, 0xbd, 0x39, 0x53, 0x29, 0xbe, 0x8b, 0x6d, 0xfc, 0xbd, 0xae, 0xc0, 0x2c, 0xbd, - 0x66, 0x44, 0x25, 0xbd, 0xd1, 0x0f, 0xbf, 0xbd, 0x05, 0xba, 0xf1, 0xbd, 0xdf, 0x06, 
0x19, 0xbe, - 0x30, 0xb1, 0x74, 0xbc, 0x3e, 0xfb, 0x20, 0xbe, 0xc6, 0x64, 0x15, 0xbd, 0xbf, 0x54, 0x19, 0x3d, - 0xa2, 0x26, 0x4b, 0xbd, 0xd5, 0x8a, 0x5f, 0xbe, 0x82, 0x03, 0xfa, 0x3c, 0x02, 0x3d, 0xc1, 0xbc, - 0x66, 0x2e, 0x6e, 0xbd, 0x72, 0x2d, 0xc6, 0xbd, 0xcc, 0x85, 0xcf, 0xbc, 0xf4, 0xa2, 0xb6, 0xbd, - 0x14, 0xd9, 0xbf, 0xbd, 0x74, 0x7c, 0x35, 0xbd, 0xc7, 0x65, 0x81, 0xbd, 0x04, 0xce, 0x8e, 0xbd, - 0xa9, 0x65, 0xad, 0xbd, 0x50, 0xe6, 0x9d, 0xbd, 0x1b, 0xe5, 0xa4, 0xbd, 0x67, 0x60, 0xc3, 0xbd, - 0x8d, 0x2a, 0x91, 0xbd, 0x69, 0x86, 0xde, 0xbd, 0xf0, 0xab, 0x93, 0xbd, 0xa3, 0x6a, 0xd2, 0xbd, - 0xd0, 0xeb, 0x83, 0xbd, 0x64, 0xc0, 0xc4, 0xbd, 0x5a, 0x53, 0x8a, 0xbd, 0xfb, 0x01, 0xb3, 0xbd, - 0x6e, 0xe9, 0x99, 0xbd, 0x7c, 0xb7, 0x88, 0xbd, 0x00, 0x45, 0xcd, 0xbd, 0xe6, 0x7f, 0x96, 0xbd, - 0xd0, 0x0a, 0x9f, 0xbd, 0xe7, 0xfa, 0xbd, 0xbd, 0x7e, 0xa8, 0xaf, 0xbd, 0xda, 0xd2, 0xa6, 0xbd, - 0x3a, 0x47, 0x5c, 0xbd, 0x3f, 0xd3, 0x9e, 0xbd, 0x34, 0xdd, 0xa3, 0xbd, 0x47, 0xc5, 0x2a, 0xbd, - 0x7a, 0x35, 0x19, 0xbd, 0xba, 0xa2, 0x5d, 0xbd, 0xf9, 0xea, 0xd0, 0x3b, 0xf8, 0x70, 0x9f, 0xbb, - 0x73, 0x5a, 0x78, 0xbd, 0x6a, 0xaf, 0x1c, 0xbd, 0xa3, 0xf1, 0x2c, 0xbd, 0xc8, 0xb6, 0x8f, 0xbc, - 0xef, 0xb0, 0xe9, 0xbc, 0xa0, 0xa1, 0x34, 0xbd, 0x9a, 0x0b, 0x15, 0xbd, 0x7e, 0x01, 0x7a, 0xbd, - 0xcd, 0x8b, 0x09, 0xbc, 0x1d, 0xda, 0xa3, 0xbd, 0xd2, 0x9c, 0x92, 0xbc, 0xd1, 0xa9, 0x81, 0xbd, - 0xaa, 0x4e, 0x00, 0xbd, 0xa2, 0xdf, 0xc2, 0xbd, 0x83, 0x36, 0x59, 0xbd, 0x7e, 0x66, 0x29, 0xbd, - 0x10, 0x58, 0x80, 0x3c, 0x2e, 0xc9, 0x38, 0xbd, 0x76, 0xef, 0x82, 0xbd, 0x4b, 0xa9, 0x1d, 0xbd, - 0xf6, 0xb1, 0x3e, 0xbc, 0x2d, 0x19, 0xb2, 0xbd, 0x00, 0xe0, 0x40, 0xbc, 0x96, 0x88, 0x89, 0xbc, - 0x12, 0x9f, 0x84, 0xbc, 0x50, 0x1b, 0x88, 0xbd, 0xad, 0x20, 0x87, 0xbc, 0x83, 0x9e, 0x2f, 0x3c, - 0x05, 0x09, 0x7e, 0x3d, 0xf6, 0xce, 0x21, 0xc0, 0x3c, 0x3e, 0x18, 0xbf, 0x7d, 0x91, 0x30, 0xc0, - 0x87, 0x0c, 0x4b, 0xc0, 0x7c, 0x69, 0x0d, 0xc0, 0x31, 0xe2, 0xf2, 0xbf, 0x3f, 0xcd, 0x81, 0xbf, - 0xcc, 0xd8, 0x3a, 0xc0, 
0x90, 0xfb, 0xe2, 0xbf, 0x63, 0x81, 0x57, 0xc0, 0xf1, 0x6f, 0x13, 0xbf, - 0x02, 0x1d, 0x1b, 0x3e, 0xeb, 0xae, 0x81, 0xc0, 0x1b, 0x25, 0xe6, 0xbf, 0x89, 0x55, 0x48, 0xc0, - 0x74, 0x6a, 0x1d, 0xc0, 0x66, 0xf7, 0x35, 0xc0, 0x7d, 0x4f, 0x25, 0xc0, 0xa7, 0x22, 0xac, 0xbf, - 0x98, 0xa1, 0x32, 0xc0, 0xc5, 0x94, 0x05, 0xc0, 0xd5, 0x01, 0x2e, 0xc0, 0xe3, 0x4a, 0x70, 0xc0, - 0xad, 0x1b, 0xb4, 0xbf, 0x64, 0x5e, 0x35, 0xc0, 0x51, 0x3a, 0x04, 0xc0, 0xed, 0x59, 0x83, 0xbd, - 0x78, 0x1a, 0xd6, 0xbf, 0xbc, 0x86, 0x94, 0xc0, 0xbb, 0x01, 0x5a, 0xbe, 0xfe, 0xae, 0xd1, 0xbf, - 0xd9, 0xb7, 0xac, 0xbf, 0x8e, 0x01, 0x2e, 0xc0, 0x6e, 0xb2, 0xe5, 0xbf, 0xe6, 0x56, 0x2c, 0xc0, - 0x42, 0xd4, 0x41, 0xc0, 0x0f, 0xc5, 0x84, 0xbf, 0x31, 0xa1, 0x81, 0x3e, 0xae, 0xea, 0x13, 0xc0, - 0x41, 0xbf, 0x41, 0xc0, 0xee, 0x61, 0xaf, 0xbf, 0xf0, 0x02, 0x7d, 0xbf, 0xb5, 0xa3, 0xe7, 0xbf, - 0x76, 0x5d, 0x77, 0xbf, 0xc5, 0xef, 0x02, 0xbf, 0xd4, 0x13, 0x13, 0xc0, 0xc0, 0x36, 0x5e, 0xc0, - 0x6e, 0x53, 0x3e, 0xc0, 0xa6, 0x18, 0x58, 0xc0, 0x62, 0x52, 0x23, 0xc0, 0xfc, 0xe9, 0x23, 0xc0, - 0xfe, 0x2a, 0x0b, 0xc0, 0x41, 0xc1, 0x14, 0xbf, 0xac, 0x1f, 0xdf, 0xbf, 0xd3, 0x3d, 0x00, 0xc0, - 0x08, 0x2f, 0xd7, 0xbf, 0x5f, 0x58, 0x7d, 0xc0, 0x38, 0xf5, 0xfa, 0xbf, 0xcb, 0x1f, 0xaf, 0xbf, - 0x10, 0xa2, 0x78, 0xc0, 0x8b, 0x1b, 0x42, 0xc0, 0x79, 0xb9, 0xfb, 0x3c, 0x74, 0x7d, 0x95, 0xbf, - 0x12, 0x67, 0x2e, 0x3f, 0xda, 0xf2, 0x65, 0x3f, 0xf8, 0xe2, 0xc6, 0x3f, 0xaa, 0xe8, 0x94, 0x3f, - 0x3d, 0x9f, 0x96, 0x3f, 0x48, 0x84, 0xb8, 0x3f, 0xfa, 0x5c, 0x8d, 0x3f, 0x02, 0x84, 0xf7, 0x3f, - 0x68, 0xa8, 0xc3, 0x3f, 0x90, 0xda, 0x96, 0x3f, 0xf0, 0xe9, 0x87, 0x3f, 0x37, 0xb3, 0xbf, 0x3f, - 0x27, 0xee, 0x3b, 0x3f, 0xf6, 0x92, 0x19, 0x3f, 0xfc, 0x71, 0xab, 0x3f, 0xd8, 0x08, 0xe0, 0x3f, - 0x6e, 0x24, 0xca, 0x3f, 0x7e, 0x5e, 0xac, 0x3f, 0xcc, 0x58, 0x9c, 0x3f, 0x2c, 0x79, 0x87, 0x3f, - 0x74, 0xd9, 0xf3, 0x3f, 0xd9, 0x9f, 0x90, 0x3f, 0x53, 0x8a, 0x9b, 0x3f, 0x40, 0xb7, 0xbf, 0x3f, - 0x2a, 0xf1, 0xd8, 0x3f, 0xaa, 0xf1, 0x02, 0x40, 0x9b, 0xc0, 
0xc0, 0x3f, 0x80, 0x76, 0x93, 0x3f, - 0xa8, 0xd4, 0x02, 0x40, 0x66, 0xda, 0xa4, 0x3f, 0x9a, 0x10, 0xf9, 0x3e, 0x54, 0xef, 0xa9, 0x3f, - 0x80, 0x59, 0x3d, 0x3e, 0xac, 0x68, 0x80, 0x3f, 0x72, 0x26, 0x61, 0x3f, 0xc8, 0x22, 0x85, 0x3f, - 0x4e, 0x73, 0x50, 0x3f, 0xf4, 0xfc, 0xfc, 0x3d, 0x02, 0x46, 0xbd, 0xbe, 0xc8, 0x41, 0xc0, 0x3e, - 0xb0, 0x5b, 0x92, 0x3f, 0xcb, 0x1e, 0x25, 0x3f, 0xd4, 0xc3, 0xe4, 0xbd, 0x42, 0x98, 0x1a, 0x3e, - 0xc0, 0x86, 0x12, 0x3f, 0x24, 0xa7, 0x59, 0xbe, 0x84, 0x0d, 0x28, 0x3f, 0x1e, 0x51, 0x74, 0x3f, - 0x72, 0xcc, 0xc7, 0xbd, 0x0a, 0xbf, 0x8a, 0x3f, 0x0a, 0x72, 0x6b, 0x3e, 0xd8, 0x54, 0x44, 0x3f, - 0xfd, 0x18, 0x4e, 0x3f, 0x68, 0xc8, 0x41, 0x3d, 0x94, 0xb2, 0x3a, 0x3e, 0xd6, 0x16, 0x7d, 0x3f, - 0x17, 0xfe, 0x04, 0x3f, 0x6d, 0xa7, 0xa6, 0x3e, 0x6f, 0x97, 0xb7, 0x3e, 0xbf, 0xcd, 0xbf, 0x3e, - 0xcc, 0xa1, 0x1a, 0x3f, 0xe2, 0xc1, 0x21, 0x3f, 0xda, 0x26, 0x37, 0x3f, 0xbe, 0xe5, 0x18, 0x3e, - 0x0c, 0x1a, 0x88, 0xbd, 0x19, 0x0f, 0xc1, 0xbd, 0x01, 0x71, 0x5e, 0xbe, 0xd3, 0x07, 0x0c, 0xbe, - 0x38, 0x1f, 0xee, 0xbd, 0xea, 0x56, 0x40, 0xbe, 0xb8, 0x59, 0x1f, 0xbe, 0x07, 0x5b, 0x6d, 0xbe, - 0x3b, 0x00, 0x3e, 0xbe, 0xa7, 0x47, 0x27, 0xbe, 0x02, 0x49, 0xf7, 0xbd, 0xd2, 0xff, 0x2f, 0xbe, - 0xb1, 0x29, 0xda, 0xbd, 0xe0, 0x36, 0x7f, 0xbd, 0x4f, 0xe7, 0x20, 0xbe, 0xd6, 0x30, 0x47, 0xbe, - 0x93, 0x11, 0x08, 0xbe, 0x0f, 0x17, 0x13, 0xbe, 0x2e, 0x91, 0xe0, 0xbd, 0xb2, 0xd3, 0xe4, 0xbd, - 0xd7, 0xce, 0x81, 0xbe, 0x3a, 0x23, 0x1c, 0xbe, 0xce, 0x7b, 0x08, 0xbe, 0x07, 0x48, 0x54, 0xbe, - 0xf0, 0x8b, 0x65, 0xbe, 0x46, 0xd0, 0x43, 0xbe, 0x6a, 0xbd, 0x36, 0xbe, 0xa2, 0x6f, 0x15, 0xbe, - 0xd5, 0x50, 0x54, 0xbe, 0x66, 0x4d, 0xfd, 0xbd, 0x1e, 0xcc, 0xe0, 0xbd, 0xe6, 0xb3, 0x2b, 0xbe, - 0x5e, 0xcc, 0x89, 0xbc, 0xe4, 0x8d, 0xd6, 0xbd, 0x69, 0x64, 0x19, 0xbe, 0x0a, 0xd3, 0xfe, 0xbd, - 0x14, 0xe3, 0xa3, 0xbd, 0x8a, 0x80, 0x65, 0xbd, 0xc0, 0x7b, 0x06, 0xbc, 0xc0, 0x96, 0x9b, 0xbd, - 0xa8, 0x8f, 0x16, 0xbe, 0x3d, 0x4a, 0xe1, 0xbd, 0x70, 0x4a, 0x5e, 0xbb, 0x15, 0x65, 0x18, 0xbd, - 
0x2b, 0x0a, 0xb9, 0xbd, 0x74, 0x21, 0x9b, 0x3c, 0xdb, 0xef, 0xb5, 0xbd, 0x38, 0x61, 0xeb, 0xbd, - 0x5c, 0x8a, 0x0e, 0x3d, 0x31, 0x62, 0xf0, 0xbd, 0x74, 0x02, 0x2c, 0xbc, 0x80, 0x22, 0xa9, 0xbd, - 0xb6, 0xf9, 0x12, 0xbe, 0x6a, 0x21, 0x35, 0xbd, 0xd2, 0xc4, 0xf5, 0xbc, 0xf7, 0x24, 0x20, 0xbe, - 0x48, 0x61, 0xda, 0xbd, 0x58, 0x12, 0x9d, 0xbc, 0x3a, 0x83, 0x82, 0xbd, 0x28, 0xa7, 0x8b, 0xbd, - 0xce, 0xad, 0x81, 0xbd, 0xef, 0xe2, 0x6d, 0xbd, 0xae, 0xd8, 0x07, 0xbe, 0x9f, 0x2d, 0x4c, 0xbd, - 0x5a, 0x4a, 0x3b, 0xbd, 0x27, 0x1b, 0x88, 0xbd, 0xc9, 0x11, 0x9e, 0xbd, 0xec, 0xa1, 0x9a, 0xbd, - 0x5a, 0xe7, 0xac, 0xbd, 0xf4, 0xf3, 0x8e, 0xbd, 0x7c, 0x47, 0x2b, 0xbd, 0x14, 0xc9, 0xd9, 0xbd, - 0x71, 0x74, 0xbf, 0xbd, 0xaf, 0x89, 0x71, 0xbd, 0x12, 0x49, 0x6c, 0xbd, 0xfa, 0xec, 0xac, 0xbd, - 0xbb, 0x6a, 0x15, 0xbd, 0x8a, 0xf1, 0x09, 0xbd, 0xd0, 0x80, 0xa4, 0xbd, 0xcc, 0x80, 0xe4, 0xbd, - 0x3e, 0x4a, 0xe1, 0xbd, 0x6f, 0xec, 0xbe, 0xbd, 0xe7, 0x06, 0xaf, 0xbd, 0x58, 0x12, 0x95, 0xbd, - 0x3c, 0xec, 0xc7, 0xbd, 0x85, 0xcb, 0x52, 0xbd, 0x0b, 0x28, 0x94, 0xbd, 0xb5, 0x1d, 0x9e, 0xbd, - 0x90, 0x3f, 0xae, 0xbd, 0x72, 0xf5, 0x0d, 0xbe, 0xda, 0x64, 0xad, 0xbd, 0xb7, 0x2c, 0x7b, 0xbd, - 0xa8, 0x11, 0x09, 0xbe, 0x9c, 0x42, 0xba, 0xbd, 0x32, 0xf7, 0x0f, 0xbc, 0xa5, 0x00, 0x8a, 0xbd, - 0xcc, 0x46, 0xab, 0xbc, 0x8c, 0xda, 0x92, 0xbd, 0x61, 0x0a, 0x32, 0xbd, 0x9e, 0x03, 0x8e, 0xbd, - 0x48, 0xc9, 0x87, 0xbd, 0xb8, 0xa5, 0x7f, 0xbb, 0x7c, 0x9c, 0x02, 0x3d, 0xb7, 0xe5, 0xe8, 0xbc, - 0xde, 0x03, 0x98, 0xbd, 0x9d, 0x44, 0x04, 0xbd, 0x84, 0xb3, 0x81, 0x3a, 0xba, 0x19, 0x8c, 0xbc, - 0x6c, 0x96, 0xe8, 0xbc, 0x13, 0x92, 0xe3, 0x3b, 0xdf, 0x22, 0x3d, 0xbd, 0x92, 0x00, 0x93, 0xbd, - 0x51, 0x58, 0xd6, 0xbc, 0x79, 0x06, 0xa4, 0xbd, 0xed, 0xfc, 0x12, 0xbd, 0x7e, 0x73, 0x6e, 0xbd, - 0x88, 0x90, 0x2e, 0xbd, 0x98, 0xa3, 0x2e, 0x3b, 0x5d, 0x93, 0xa9, 0xbc, 0x48, 0xf5, 0x53, 0xbd, - 0x13, 0x91, 0xd7, 0xbc, 0xd2, 0x79, 0x57, 0xbd, 0x34, 0xa4, 0xdf, 0xbc, 0x3a, 0xe9, 0xb7, 0xbc, - 0xc2, 0x52, 0x7d, 0xbd, 0x49, 0x29, 
0x6e, 0xbd, 0x14, 0xc6, 0xa5, 0xbc, 0x36, 0xc1, 0x0a, 0xbc, - 0x8c, 0x7a, 0x19, 0xbf, 0x9b, 0x99, 0xc7, 0xbf, 0xc0, 0x8b, 0x6b, 0xc0, 0xf8, 0xf5, 0x10, 0xc0, - 0x7d, 0xbf, 0xb9, 0xbf, 0x12, 0xfc, 0x1e, 0xc0, 0x1e, 0x02, 0xf4, 0xbf, 0x55, 0x9c, 0x38, 0xc0, - 0x5b, 0x05, 0x3f, 0xc0, 0x3d, 0xe6, 0x2e, 0xc0, 0x59, 0x21, 0x93, 0xbf, 0x5a, 0x2a, 0xf3, 0xbf, - 0xc8, 0x94, 0xfc, 0xbf, 0xc3, 0x59, 0xb8, 0xbe, 0x01, 0x54, 0x0c, 0xc0, 0xd4, 0x40, 0x27, 0xc0, - 0x6e, 0x49, 0xb8, 0xbe, 0x92, 0xe8, 0x05, 0xc0, 0x36, 0xb6, 0x36, 0xbf, 0x1d, 0xdc, 0xc5, 0xbf, - 0xb3, 0x17, 0x7b, 0xc0, 0x90, 0x53, 0x04, 0xc0, 0xc7, 0x59, 0xb3, 0xbf, 0x2d, 0xcd, 0x65, 0xc0, - 0xb7, 0x46, 0x53, 0xc0, 0xec, 0x4c, 0xb0, 0xbf, 0xa9, 0x00, 0x0f, 0xc0, 0x48, 0x47, 0x04, 0xc0, - 0x6c, 0xb7, 0x04, 0xc0, 0x6e, 0xae, 0xa4, 0xbf, 0xf9, 0x09, 0x2f, 0xc0, 0xd4, 0xee, 0x09, 0xc0, - 0x88, 0x88, 0x87, 0x3f, 0xb8, 0x34, 0x88, 0x3f, 0x66, 0x50, 0xac, 0x3f, 0x4e, 0x13, 0xbb, 0x3f, - 0x88, 0x31, 0xff, 0x3f, 0xbd, 0xd1, 0x8f, 0x3f, 0x96, 0x85, 0x3b, 0x3f, 0x00, 0x62, 0xc7, 0x3f, - 0x64, 0xa5, 0xe6, 0x3f, 0x04, 0xd3, 0xd0, 0x3f, 0x64, 0xb3, 0xaa, 0x3f, 0x7e, 0xaf, 0x69, 0x3f, - 0x80, 0x32, 0xd4, 0x3f, 0x9a, 0x9e, 0x2c, 0x3f, 0x21, 0xf1, 0x7e, 0x3f, 0xce, 0xe2, 0xd6, 0x3f, - 0xa2, 0x44, 0x6b, 0x3f, 0x42, 0xa9, 0xad, 0x3f, 0x8f, 0x7f, 0xd5, 0x3f, 0xd4, 0x2d, 0xcd, 0x3f, - 0x48, 0x46, 0x47, 0x3f, 0xba, 0xd6, 0x88, 0x3f, 0x53, 0xa4, 0x00, 0x40, 0x48, 0x36, 0xb2, 0x3f, - 0x8d, 0xbb, 0xc9, 0x3f, 0x8e, 0x8d, 0xfb, 0x3f, 0x3e, 0x36, 0xd6, 0x3f, 0xe0, 0x1c, 0x9d, 0x3f, - 0xcc, 0x5b, 0xe5, 0x3f, 0xdb, 0xe6, 0xc6, 0x3f, 0xb4, 0x72, 0x8f, 0x3e, 0xcc, 0xc1, 0x6c, 0x3f, - 0x10, 0x2f, 0x3a, 0xbf, 0xbd, 0x77, 0x4f, 0xbf, 0x94, 0x51, 0x65, 0xbf, 0xf8, 0xfd, 0x51, 0xbf, - 0xb3, 0x85, 0x89, 0xbf, 0x74, 0x82, 0x77, 0xbf, 0x8a, 0x13, 0x66, 0xbf, 0x0c, 0xbe, 0x92, 0xbf, - 0x8c, 0x02, 0x7f, 0xbf, 0x6e, 0xd2, 0x67, 0xbf, 0x1a, 0xa3, 0x83, 0xbf, 0x7c, 0x0f, 0x5b, 0xbf, - 0xaa, 0xf8, 0x71, 0xbf, 0x28, 0xd6, 0x2b, 0xbf, 0x1c, 0x7a, 0x4c, 0xbf, 
0x30, 0xdf, 0x64, 0xbf, - 0x40, 0x84, 0x51, 0xbf, 0x2f, 0xe2, 0x83, 0xbf, 0xd4, 0x34, 0x4f, 0xbf, 0xf0, 0x73, 0x76, 0xbf, - 0x70, 0xa5, 0x43, 0xbf, 0x82, 0x29, 0x62, 0xbf, 0x1d, 0xdb, 0x82, 0xbf, 0x32, 0xab, 0x8e, 0xbf, - 0x8c, 0x27, 0x88, 0xbf, 0x2a, 0x93, 0x89, 0xbf, 0x5d, 0x75, 0x6f, 0xbf, 0x43, 0xed, 0x3f, 0xbf, - 0x29, 0xb9, 0x8a, 0xbf, 0xd1, 0xff, 0x4e, 0xbf, 0xb0, 0x4f, 0x05, 0xbf, 0x28, 0x4a, 0x6c, 0xbf, - 0x28, 0xa3, 0x46, 0xbd, 0x95, 0x74, 0xb2, 0xbe, 0x68, 0x20, 0x23, 0xbf, 0x6b, 0x54, 0x04, 0xbf, - 0x30, 0x43, 0xed, 0xbe, 0xa1, 0xe5, 0x8f, 0xbe, 0x28, 0x1e, 0x83, 0xbd, 0x3d, 0xff, 0xef, 0xbe, - 0xa8, 0xed, 0xb6, 0xbe, 0x3b, 0x64, 0xc4, 0xbe, 0x56, 0xf0, 0x4f, 0xbe, 0x28, 0x4b, 0x33, 0xbe, - 0xa9, 0x8c, 0xaf, 0xbe, 0xe0, 0xa2, 0x5b, 0x3d, 0x82, 0xcb, 0xf3, 0xbe, 0x5c, 0x3e, 0xed, 0xbe, - 0xdc, 0xdc, 0x09, 0xbe, 0xce, 0x3f, 0x97, 0xbe, 0x0b, 0x17, 0xcc, 0xbe, 0x30, 0x5e, 0xb7, 0xbe, - 0x80, 0x4d, 0x85, 0xbe, 0xa5, 0x56, 0x3a, 0xbe, 0x67, 0xa9, 0xdb, 0xbe, 0xab, 0x5f, 0xc2, 0xbe, - 0x92, 0xd9, 0x0a, 0xbf, 0x59, 0x0d, 0x7f, 0xbe, 0x3c, 0xe5, 0x83, 0xbe, 0x9b, 0xac, 0x62, 0xbe, - 0xd9, 0x63, 0xb2, 0xbe, 0x8e, 0x46, 0x8c, 0xbe, 0xbc, 0xec, 0x05, 0xbe, 0xa3, 0xea, 0x8b, 0xbe, - 0x23, 0x1a, 0x9c, 0x3d, 0x5a, 0x02, 0xd4, 0x3d, 0x3f, 0x88, 0xf8, 0x3d, 0x18, 0x26, 0xcd, 0x3d, - 0x18, 0x67, 0xf1, 0x3d, 0x58, 0x16, 0xf9, 0x3d, 0x44, 0xa4, 0xe8, 0x3d, 0xfb, 0xeb, 0x12, 0x3e, - 0x1d, 0x96, 0xdb, 0x3d, 0x7c, 0x17, 0xce, 0x3d, 0xbe, 0xae, 0xf5, 0x3d, 0x5e, 0x7c, 0xda, 0x3d, - 0xba, 0x4d, 0xd4, 0x3d, 0x5a, 0xae, 0x9a, 0x3d, 0x99, 0x25, 0xe2, 0x3d, 0x33, 0x7d, 0xcf, 0x3d, - 0xd2, 0x77, 0xca, 0x3d, 0xab, 0x26, 0xfe, 0x3d, 0x92, 0x99, 0xaf, 0x3d, 0xfc, 0xfc, 0xde, 0x3d, - 0xd2, 0x4b, 0xd0, 0x3d, 0x3a, 0x0c, 0xd9, 0x3d, 0xbe, 0xc4, 0xdc, 0x3d, 0xdc, 0x1d, 0x0f, 0x3e, - 0x28, 0x5c, 0x09, 0x3e, 0xf8, 0x85, 0xdd, 0x3d, 0x9f, 0x7b, 0xc7, 0x3d, 0xa4, 0x2f, 0xa9, 0x3d, - 0x10, 0x4a, 0xf6, 0x3d, 0x90, 0x5f, 0xaa, 0x3d, 0x57, 0x3d, 0x9a, 0x3d, 0x4e, 0x65, 0xf8, 0x3d, - 0xf4, 0xb1, 
0x08, 0x3c, 0x97, 0xd5, 0x6a, 0x3d, 0x1c, 0x94, 0xc3, 0x3d, 0xdb, 0x04, 0x8f, 0x3d, - 0x4e, 0x8b, 0x68, 0x3d, 0xde, 0x46, 0x59, 0x3d, 0x00, 0xc5, 0xf6, 0x3c, 0xfb, 0x0d, 0x9b, 0x3d, - 0xe0, 0x7f, 0x31, 0x3d, 0xab, 0x61, 0x46, 0x3d, 0x49, 0x4d, 0x19, 0x3d, 0x38, 0x31, 0x1e, 0x3d, - 0x84, 0xe4, 0x31, 0x3d, 0x58, 0x5c, 0x47, 0x3b, 0x50, 0x15, 0xa0, 0x3d, 0xfe, 0x93, 0x6e, 0x3d, - 0x8e, 0xb5, 0xf9, 0x3c, 0x30, 0x46, 0x4f, 0x3d, 0x0e, 0xf1, 0x36, 0x3d, 0x9f, 0x58, 0x46, 0x3d, - 0x56, 0x33, 0x52, 0x3d, 0x7d, 0xc5, 0x12, 0x3d, 0x0a, 0x87, 0x46, 0x3d, 0x79, 0xb6, 0x87, 0x3d, - 0xe5, 0xf3, 0xa7, 0x3d, 0xe5, 0xa7, 0xd1, 0x3c, 0x7c, 0xb2, 0xf2, 0x3c, 0x04, 0xeb, 0xf3, 0x3c, - 0x48, 0x60, 0x3f, 0x3d, 0x57, 0x89, 0xf3, 0x3c, 0x46, 0xc0, 0x14, 0x3d, 0x43, 0xa9, 0x66, 0x3d, - 0x04, 0x83, 0x35, 0x3d, 0x3c, 0x47, 0x33, 0x3d, 0xe1, 0xdd, 0x44, 0x3d, 0x87, 0x4d, 0x47, 0x3d, - 0xac, 0xb3, 0x8a, 0x3d, 0xd7, 0xd1, 0x52, 0x3d, 0x0f, 0xc6, 0x38, 0x3d, 0xde, 0x36, 0x81, 0x3d, - 0xb8, 0xdc, 0x80, 0x3d, 0xb5, 0xd3, 0x66, 0x3d, 0xa6, 0x9b, 0x6e, 0x3d, 0xaa, 0xc9, 0x38, 0x3d, - 0xbc, 0xcc, 0x70, 0x3d, 0x17, 0xe1, 0x17, 0x3d, 0x8c, 0x4a, 0x26, 0x3d, 0x60, 0x34, 0x64, 0x3d, - 0x14, 0x8e, 0x35, 0x3d, 0x2e, 0xc1, 0x6b, 0x3d, 0xd2, 0xc7, 0x59, 0x3d, 0xb7, 0xf6, 0x6e, 0x3d, - 0x86, 0x32, 0x1d, 0x3d, 0x6e, 0x5c, 0x47, 0x3d, 0xb4, 0x23, 0x88, 0x3d, 0xc0, 0xcf, 0x76, 0x3d, - 0xc6, 0x34, 0x73, 0x3d, 0xd9, 0x3f, 0x8f, 0x3d, 0x5c, 0xbb, 0x74, 0x3d, 0xaa, 0x9d, 0x3b, 0x3d, - 0x0d, 0x7a, 0x87, 0x3d, 0xc1, 0x40, 0x58, 0x3d, 0x79, 0xd5, 0xb8, 0x3c, 0xd8, 0xea, 0x3e, 0x3d, - 0x7f, 0xfc, 0x29, 0x3c, 0x5a, 0x5f, 0xa9, 0x3c, 0xbe, 0xe9, 0x0f, 0x3d, 0x49, 0x2c, 0x09, 0x3d, - 0xe5, 0x45, 0x18, 0x3d, 0xda, 0xbd, 0x8c, 0x3c, 0xb0, 0x30, 0x5d, 0x3b, 0x82, 0x47, 0xef, 0x3c, - 0x88, 0xc6, 0xfd, 0x3c, 0x1f, 0xda, 0xf7, 0x3c, 0x1a, 0x27, 0x8b, 0x3c, 0xa2, 0x97, 0x35, 0x3c, - 0x88, 0xe2, 0xea, 0x3c, 0xc0, 0x67, 0x5b, 0x3a, 0x87, 0x74, 0xc8, 0x3c, 0x2c, 0x01, 0x0c, 0x3d, - 0x96, 0x0e, 0x26, 0x3c, 0x36, 0x7b, 0xaa, 0x3c, 
0xc8, 0xa6, 0x05, 0x3d, 0x16, 0x4c, 0xe6, 0x3c, - 0x7d, 0x01, 0x58, 0x3c, 0xcc, 0xcb, 0x5e, 0x3c, 0x30, 0xc6, 0x16, 0x3d, 0x03, 0x95, 0xc0, 0x3c, - 0x5a, 0x70, 0x08, 0x3d, 0x65, 0xc7, 0xea, 0x3c, 0xb9, 0xd8, 0xd3, 0x3c, 0x8c, 0xd1, 0x9e, 0x3c, - 0x5c, 0xb4, 0xf0, 0x3c, 0x0e, 0x87, 0xd5, 0x3c, 0x8a, 0xd8, 0x48, 0x3b, 0xb0, 0x68, 0x67, 0x3c, - 0xc4, 0xc7, 0x2f, 0x3f, 0x90, 0x93, 0xc5, 0x3f, 0xeb, 0x97, 0x06, 0x40, 0x5f, 0x09, 0xc6, 0x3f, - 0x14, 0xb1, 0xbb, 0x3f, 0x30, 0x92, 0xd7, 0x3f, 0x6e, 0x83, 0xb7, 0x3f, 0x96, 0xe4, 0x04, 0x40, - 0x28, 0x23, 0x9f, 0x3f, 0x18, 0x72, 0xa2, 0x3f, 0x2f, 0xf7, 0xb7, 0x3f, 0xd0, 0x06, 0xb4, 0x3f, - 0x5d, 0xc4, 0x9e, 0x3f, 0x58, 0x20, 0x37, 0x3f, 0x47, 0xb3, 0xef, 0x3f, 0x5b, 0x0b, 0xb0, 0x3f, - 0x95, 0xd8, 0x9c, 0x3f, 0x8c, 0x27, 0xd0, 0x3f, 0xd5, 0x68, 0x86, 0x3f, 0x94, 0x39, 0xaf, 0x3f, - 0x93, 0x72, 0xc3, 0x3f, 0x57, 0x00, 0xaa, 0x3f, 0xa5, 0xb3, 0xa0, 0x3f, 0xb8, 0x20, 0xfc, 0x3f, - 0x19, 0x69, 0x02, 0x40, 0x52, 0xb9, 0x81, 0x3f, 0xbc, 0x5e, 0x81, 0x3f, 0x2e, 0x1d, 0x75, 0x3f, - 0xfa, 0x1c, 0xb7, 0x3f, 0x3e, 0xc1, 0x61, 0x3f, 0x8e, 0xc7, 0x97, 0x3f, 0x82, 0x26, 0xe2, 0x3f, - 0xe8, 0x37, 0xe1, 0x3d, 0xa2, 0x3f, 0x44, 0x3f, 0xbf, 0xc7, 0x87, 0x3e, 0x24, 0x5e, 0xa3, 0x3e, - 0x41, 0x81, 0x87, 0x3f, 0x28, 0x70, 0x1a, 0x3e, 0x7a, 0x98, 0x9a, 0xbe, 0x22, 0x74, 0x13, 0x3f, - 0x2d, 0x7d, 0x2c, 0x3f, 0x42, 0x35, 0x05, 0x3e, 0xe0, 0xaa, 0x21, 0x3e, 0x02, 0x4a, 0x11, 0x3f, - 0x9f, 0xd5, 0x55, 0x3f, 0xbc, 0x82, 0x97, 0x3d, 0x9b, 0xf3, 0x07, 0x3f, 0x8e, 0x8f, 0x3c, 0x3f, - 0x58, 0x7d, 0x96, 0x3e, 0x36, 0x10, 0xff, 0x3e, 0xa4, 0xc1, 0xcd, 0x3d, 0x04, 0x86, 0x4c, 0x3e, - 0x10, 0x24, 0xcf, 0x3d, 0x22, 0xba, 0x9a, 0x3e, 0xf8, 0xff, 0xd5, 0xbd, 0x94, 0xd0, 0xe7, 0x3d, - 0x84, 0xea, 0xb0, 0x3d, 0x68, 0xe6, 0x82, 0x3f, 0x05, 0x8d, 0x56, 0x3f, 0x34, 0x64, 0xc7, 0x3d, - 0x4c, 0x7d, 0xef, 0x3d, 0x29, 0x23, 0xa9, 0x3e, 0xc2, 0x1d, 0x42, 0x3e, 0xea, 0x51, 0x18, 0x3f, - 0xb1, 0x7e, 0x5a, 0xbd, 0x05, 0xf1, 0x96, 0xbe, 0xe2, 0x1b, 0xb1, 0xbe, 0xb0, 0xa0, 
0xb3, 0xbe, - 0x56, 0xf5, 0x8e, 0xbe, 0x65, 0xef, 0xcd, 0xbe, 0xf6, 0xfb, 0x3d, 0xbe, 0x82, 0x2b, 0x00, 0xbf, - 0x44, 0x86, 0x15, 0xbf, 0xe8, 0xb5, 0x93, 0xbe, 0x79, 0x3f, 0x42, 0xbe, 0xc6, 0x04, 0x98, 0xbe, - 0xda, 0xf4, 0x56, 0xbe, 0x86, 0x0c, 0x30, 0xbe, 0x4c, 0xe5, 0xc4, 0xbe, 0x02, 0x3d, 0xa9, 0xbe, - 0x4c, 0x05, 0x27, 0xbe, 0x85, 0x2c, 0x2f, 0xbe, 0xa1, 0x0b, 0x84, 0xbe, 0x7f, 0x45, 0x9d, 0xbd, - 0xfa, 0xb2, 0x72, 0xbe, 0x0a, 0x50, 0x7d, 0xbe, 0xdf, 0xdc, 0xe1, 0xbd, 0xc2, 0xb1, 0xa7, 0xbe, - 0xf6, 0x15, 0xe1, 0xbe, 0x6f, 0xe4, 0xaf, 0xbe, 0x42, 0x82, 0xec, 0xbe, 0xb6, 0xde, 0x1f, 0xbe, - 0xcd, 0x80, 0x91, 0xbe, 0xd4, 0x1a, 0xbc, 0xbe, 0xa2, 0x70, 0xb7, 0x3c, 0x32, 0x75, 0x79, 0xbe, - 0xa9, 0x0b, 0x91, 0x3b, 0x8f, 0x50, 0xbb, 0xbe, 0x06, 0x7b, 0x34, 0xbe, 0x73, 0xed, 0x34, 0xbe, - 0xff, 0x7c, 0xa9, 0xbe, 0xb4, 0x55, 0xbc, 0xbd, 0xb7, 0x39, 0x10, 0x3e, 0x56, 0x99, 0x07, 0x3e, - 0x44, 0x10, 0xc3, 0xbd, 0x68, 0xa7, 0x66, 0xbe, 0x56, 0x49, 0xf8, 0x3c, 0x3d, 0xaa, 0x40, 0xbd, - 0xfb, 0x3d, 0x85, 0xbe, 0x80, 0x6d, 0x79, 0x3a, 0x5b, 0xd1, 0x86, 0xbe, 0xf6, 0x1a, 0x65, 0xbe, - 0x68, 0x25, 0x23, 0x3e, 0x7c, 0x1e, 0x74, 0xbe, 0x30, 0x5f, 0x0c, 0x3e, 0x16, 0x24, 0x3d, 0xbe, - 0x3e, 0x42, 0xa7, 0xbd, 0xe4, 0x71, 0x9b, 0xbc, 0xbf, 0xd4, 0x97, 0x3d, 0xde, 0x44, 0x77, 0xbe, - 0x07, 0x26, 0x62, 0x3e, 0xfe, 0xf2, 0x06, 0xbe, 0x70, 0xc6, 0xca, 0xbe, 0x3a, 0xbb, 0x3b, 0xbd, - 0x28, 0x3a, 0x27, 0x3d, 0xae, 0x8d, 0x31, 0xbe, 0x20, 0xf2, 0x3e, 0xbe, 0x8e, 0xe3, 0x96, 0xbd, - 0x3b, 0xd3, 0x86, 0x3b, 0xc7, 0x63, 0x17, 0x3d, 0x32, 0x6b, 0x54, 0x3d, 0x8a, 0x2c, 0x50, 0x3d, - 0x10, 0x9d, 0xbd, 0x3c, 0x28, 0x5d, 0x75, 0x3d, 0xec, 0xbd, 0xfc, 0x3c, 0xa0, 0xd8, 0x49, 0x3d, - 0x7c, 0x2f, 0x8f, 0x3d, 0x55, 0x9f, 0x4d, 0x3d, 0x68, 0x92, 0xb1, 0x3c, 0xe0, 0x22, 0xe5, 0x3c, - 0x8c, 0x73, 0x88, 0x3c, 0xe6, 0xaa, 0xbe, 0x3c, 0xc6, 0xb1, 0x59, 0x3d, 0x60, 0x57, 0x14, 0x3d, - 0xf4, 0x9e, 0xdb, 0x3b, 0x93, 0xd2, 0xab, 0x3c, 0x7a, 0x53, 0xe9, 0x3c, 0x97, 0xf5, 0x66, 0x3c, - 0x35, 0x89, 0x14, 0x3d, 
0xf0, 0x39, 0xe5, 0x3c, 0x4e, 0xaf, 0x85, 0x3c, 0x94, 0x3b, 0x6b, 0x3d, - 0x2e, 0x14, 0x55, 0x3d, 0x2d, 0xde, 0xc4, 0x3c, 0x58, 0x7e, 0x7a, 0x3d, 0x16, 0x08, 0xb8, 0x3c, - 0xc4, 0x46, 0x16, 0x3d, 0x17, 0x22, 0x58, 0x3d, 0x80, 0xba, 0xb9, 0xb9, 0x6f, 0x0d, 0xa7, 0x3c, - 0xc1, 0xe6, 0xd9, 0xba, 0x02, 0x7d, 0x34, 0x3d, 0x80, 0xec, 0x0e, 0x3d, 0xc6, 0xd7, 0x08, 0x3d, - 0xb8, 0x0f, 0xe8, 0x3c, 0xc3, 0x90, 0xec, 0x3c, 0x80, 0x03, 0xea, 0xba, 0x84, 0xdc, 0x65, 0xbc, - 0xff, 0x49, 0xac, 0x3c, 0x2c, 0xb7, 0x33, 0x3d, 0x60, 0xed, 0xa9, 0xb9, 0xde, 0x83, 0x43, 0x3b, - 0x70, 0xac, 0xb1, 0x3c, 0x22, 0x32, 0xc4, 0x3b, 0x39, 0x08, 0x28, 0x3d, 0x80, 0x2f, 0xd1, 0x3c, - 0x6d, 0x3a, 0xd1, 0xbc, 0x8c, 0xfa, 0xe2, 0x3c, 0xf7, 0x7c, 0x34, 0xbc, 0x44, 0xe2, 0xcb, 0x3c, - 0xee, 0xd0, 0xa9, 0x3c, 0x92, 0x7b, 0xa8, 0x3b, 0x5c, 0x24, 0x0b, 0xbb, 0x1e, 0xfc, 0x47, 0x3d, - 0xfe, 0xcb, 0x65, 0xbc, 0x2d, 0x32, 0x3b, 0x3b, 0xb0, 0x81, 0x5f, 0x3d, 0x1f, 0x5d, 0x3b, 0x3c, - 0xb2, 0x35, 0x89, 0x3b, 0xb2, 0xab, 0x08, 0x3d, 0xdb, 0x32, 0xa8, 0x3c, 0x00, 0xbc, 0x5e, 0x3b, - 0xed, 0xd6, 0x7e, 0x3b, 0x92, 0xc4, 0x9d, 0x3c, 0xc9, 0xd9, 0x78, 0x3c, 0xa4, 0x3f, 0x85, 0x3c, - 0x79, 0x6a, 0xc4, 0x3c, 0x71, 0xbc, 0x86, 0x3c, 0xdc, 0x89, 0x71, 0x3b, 0x7c, 0x01, 0xf6, 0x3c, - 0xcd, 0xd5, 0x02, 0x3d, 0xe0, 0xa4, 0x26, 0x3c, 0x05, 0xde, 0x25, 0x3c, 0xd1, 0xe1, 0xa3, 0x3c, - 0x31, 0xe2, 0x97, 0x3c, 0x22, 0x0d, 0xfc, 0x3b, 0x6e, 0xae, 0xa3, 0x3c, 0xbe, 0x32, 0xb3, 0x3c, - 0x4f, 0x1a, 0x55, 0x3c, 0x76, 0x50, 0x40, 0x3c, 0x8c, 0xf6, 0x55, 0x3c, 0x47, 0x92, 0x7f, 0x3b, - 0x2c, 0x83, 0x1c, 0x3c, 0x62, 0x9f, 0x66, 0x3c, 0x6e, 0xa2, 0x59, 0x3b, 0x26, 0xb0, 0x36, 0x3c, - 0xf3, 0x75, 0xa9, 0x3c, 0xcc, 0xaa, 0xe7, 0x3c, 0x03, 0xc7, 0xd7, 0x3c, 0xec, 0x3d, 0xe2, 0x3b, - 0xa4, 0xe5, 0x56, 0x3c, 0xf9, 0x35, 0x8c, 0x3c, 0x80, 0x38, 0x3b, 0xb7, 0xbc, 0x87, 0x94, 0x3c, - 0x03, 0x14, 0x83, 0x3a, 0xce, 0xdd, 0xba, 0x3c, 0xca, 0xb8, 0xdb, 0x3b, 0x79, 0xab, 0xf7, 0x3b, - 0xcd, 0xa3, 0xd9, 0x3c, 0x40, 0xf5, 0x74, 0x3a, 0xad, 0xfb, 
0x4e, 0xbc, 0x42, 0xe4, 0xd4, 0xba, - 0x82, 0xc6, 0xf5, 0x3b, 0x19, 0xa9, 0xe5, 0x3b, 0x82, 0xc1, 0x65, 0xba, 0x78, 0x22, 0xf6, 0x3b, - 0xa3, 0x7e, 0xac, 0x3c, 0xc0, 0x7c, 0x79, 0xba, 0xc0, 0x09, 0x64, 0x3c, 0x1e, 0x73, 0x87, 0x3c, - 0x6b, 0x1f, 0x4c, 0xbb, 0x70, 0x78, 0x77, 0x3c, 0xd5, 0x36, 0xdb, 0xbb, 0x0a, 0x4c, 0x18, 0x3c, - 0x7c, 0x0d, 0xea, 0x3a, 0x5a, 0x11, 0x2e, 0x3b, 0x7c, 0x56, 0xc1, 0xbb, 0x74, 0x62, 0xe0, 0x3b, - 0x77, 0x22, 0x4a, 0xbc, 0xd8, 0xee, 0x90, 0x3c, 0x5b, 0xca, 0xbc, 0x3c, 0x7c, 0x2b, 0xb6, 0x3a, - 0xe1, 0x69, 0x4d, 0xbb, 0x52, 0xfe, 0xf2, 0x3b, 0xf7, 0xea, 0x2a, 0x3c, 0x8a, 0xd9, 0x1d, 0x3c, - 0x00, 0xf0, 0xc3, 0x38, 0x24, 0x4a, 0x38, 0x3f, 0xb2, 0xe0, 0x68, 0x3f, 0x44, 0x2b, 0x5e, 0x3f, - 0x3b, 0xfc, 0xa0, 0x3e, 0x62, 0x09, 0x76, 0x3f, 0x01, 0x13, 0xce, 0x3e, 0x08, 0x4f, 0x81, 0x3e, - 0x7f, 0xcd, 0x5a, 0x3f, 0xbf, 0x4e, 0x83, 0x3f, 0x52, 0x8f, 0x4d, 0x3e, 0x36, 0x4d, 0x53, 0x3e, - 0x5a, 0xe7, 0x63, 0x3e, 0xcd, 0x14, 0x9f, 0x3e, 0xe0, 0x39, 0x6d, 0x3f, 0x3e, 0xb4, 0x02, 0x3f, - 0x79, 0x21, 0x92, 0xbe, 0x2b, 0xa4, 0xd8, 0x3e, 0x52, 0x7d, 0x2f, 0x3e, 0xae, 0xfc, 0xc6, 0x3e, - 0xcc, 0xaa, 0x1d, 0x3f, 0xed, 0x1b, 0x98, 0x3e, 0x20, 0x49, 0x3e, 0x3e, 0x52, 0x62, 0x95, 0x3f, - 0x01, 0xcc, 0xcf, 0x3e, 0xc0, 0xcd, 0x3f, 0x3d, 0x62, 0xd2, 0x8b, 0x3f, 0x28, 0xce, 0xb6, 0x3e, - 0x1f, 0x7a, 0xdc, 0x3e, 0x4c, 0xec, 0x62, 0x3f, 0x5a, 0xeb, 0x37, 0x3e, 0x0c, 0x80, 0xf3, 0x3d, - 0xb0, 0x6c, 0xf4, 0xbd, 0x85, 0xc9, 0xdb, 0xbd, 0x34, 0x1a, 0x1f, 0xbe, 0xb3, 0xe9, 0x14, 0xbe, - 0x80, 0x87, 0x80, 0xbe, 0x58, 0xe6, 0x10, 0xbe, 0x1a, 0x64, 0xd3, 0xbd, 0x77, 0xdc, 0x3d, 0xbe, - 0xf8, 0xf0, 0x4f, 0xbe, 0x1a, 0x7b, 0x52, 0xbe, 0x7c, 0x30, 0x33, 0xbe, 0x4a, 0xff, 0xd6, 0xbd, - 0xf7, 0x35, 0x84, 0xbe, 0x74, 0xcd, 0xb3, 0xbd, 0xa6, 0x6d, 0xcb, 0xbd, 0xae, 0x0f, 0x31, 0xbe, - 0x8d, 0xcf, 0x5a, 0xbd, 0xfa, 0x4b, 0xed, 0xbd, 0xa5, 0x3c, 0x2e, 0xbe, 0x2a, 0x05, 0x29, 0xbe, - 0xd5, 0x14, 0x4a, 0xbd, 0x63, 0xd8, 0x1f, 0xbe, 0x10, 0xe4, 0x73, 0xbe, 0x1d, 0x41, 0x17, 0xbe, - 
0xa6, 0x1e, 0x3d, 0xbe, 0x29, 0x1b, 0x5b, 0xbe, 0xd6, 0x0a, 0x69, 0xbe, 0x03, 0xd2, 0x12, 0xbe, - 0xff, 0x9c, 0x14, 0xbe, 0x64, 0x12, 0x18, 0xbe, 0x06, 0x28, 0x57, 0xbd, 0x79, 0xa7, 0x04, 0xbe, - 0x28, 0x60, 0xb6, 0x3d, 0x77, 0xd4, 0xd9, 0x3d, 0x04, 0x88, 0xcc, 0x3d, 0x7a, 0x7f, 0xd0, 0x3d, - 0x1c, 0x28, 0x0b, 0x3e, 0x24, 0xca, 0xef, 0x3d, 0x41, 0xb2, 0xdb, 0x3d, 0x7d, 0xb8, 0x07, 0x3e, - 0x46, 0xbc, 0x02, 0x3e, 0xee, 0x45, 0xe3, 0x3d, 0xea, 0x6e, 0x03, 0x3e, 0x4a, 0x09, 0xbf, 0x3d, - 0x8e, 0xdf, 0x02, 0x3e, 0x0d, 0xdb, 0xb6, 0x3d, 0x66, 0x50, 0xc1, 0x3d, 0x3e, 0x02, 0xbd, 0x3d, - 0x11, 0x50, 0xa0, 0x3d, 0x92, 0x1b, 0xf2, 0x3d, 0xf6, 0xab, 0xbf, 0x3d, 0xc4, 0x5a, 0xe7, 0x3d, - 0x0d, 0xfe, 0x80, 0x3d, 0xc4, 0x97, 0xdb, 0x3d, 0x7a, 0x64, 0xf2, 0x3d, 0x03, 0xd9, 0x08, 0x3e, - 0xf0, 0x0a, 0x00, 0x3e, 0x3d, 0xaf, 0xe1, 0x3d, 0xc4, 0xd9, 0xe7, 0x3d, 0xfc, 0xa6, 0xa7, 0x3d, - 0xda, 0x7b, 0xde, 0x3d, 0x32, 0x19, 0xc6, 0x3d, 0x9c, 0xf1, 0x78, 0x3d, 0x87, 0xdc, 0xd9, 0x3d, - 0xe0, 0x1d, 0x69, 0x3a, 0x54, 0x1a, 0x18, 0x3d, 0x8a, 0xc2, 0x99, 0x3d, 0xec, 0xc3, 0x53, 0x3d, - 0x32, 0x56, 0x71, 0x3d, 0xdc, 0xa4, 0x2e, 0x3d, 0x54, 0xda, 0x73, 0x3c, 0xab, 0x9b, 0x51, 0x3d, - 0x5a, 0xcf, 0xa5, 0x3c, 0x89, 0xbc, 0x41, 0x3d, 0xa0, 0xce, 0x05, 0x3d, 0x8e, 0x31, 0xc1, 0x3c, - 0x0d, 0xcf, 0x39, 0x3d, 0x80, 0xd6, 0x83, 0xba, 0x2b, 0x6d, 0x80, 0x3d, 0x06, 0x36, 0x48, 0x3d, - 0x1e, 0xa2, 0x7d, 0x3c, 0x62, 0xa7, 0xa4, 0x3c, 0xe2, 0x70, 0x39, 0x3d, 0xdb, 0x28, 0x19, 0x3d, - 0x78, 0x8a, 0x85, 0x3c, 0xa2, 0x75, 0xda, 0x3c, 0x97, 0x84, 0x5f, 0x3d, 0xa8, 0xb8, 0x13, 0x3d, - 0xa5, 0x14, 0x59, 0x3d, 0x7f, 0x94, 0x07, 0x3d, 0x46, 0x6d, 0x3e, 0x3d, 0xa8, 0x7c, 0xbc, 0x3c, - 0xf8, 0x78, 0xf6, 0x3c, 0x44, 0x24, 0xeb, 0x3c, 0x9c, 0xe7, 0xa6, 0x3b, 0x38, 0x9a, 0x22, 0x3d, - 0x3c, 0x66, 0x1a, 0xbc, 0xe2, 0x34, 0x69, 0xbc, 0x43, 0x16, 0x5e, 0xbc, 0x62, 0x21, 0x55, 0xbc, - 0x3b, 0x25, 0x75, 0xbc, 0x7e, 0x65, 0x75, 0xbc, 0x10, 0x75, 0x5b, 0xbc, 0x2a, 0x32, 0x85, 0xbc, - 0xac, 0x71, 0x5b, 0xbc, 0xf8, 0x37, 
0x47, 0xbc, 0x32, 0xa2, 0x76, 0xbc, 0xf6, 0x3a, 0x3f, 0xbc, - 0x94, 0x3b, 0x52, 0xbc, 0xcd, 0xa7, 0x2a, 0xbc, 0x93, 0x19, 0x65, 0xbc, 0x88, 0x6b, 0x2c, 0xbc, - 0xd8, 0xe1, 0x2f, 0xbc, 0x2a, 0xb7, 0x73, 0xbc, 0xfc, 0x2a, 0x2e, 0xbc, 0x02, 0x1d, 0x5a, 0xbc, - 0x66, 0x33, 0x0d, 0xbc, 0x46, 0x1b, 0x48, 0xbc, 0xdc, 0x7f, 0x4d, 0xbc, 0xa6, 0xa7, 0x8a, 0xbc, - 0xd8, 0xda, 0x79, 0xbc, 0xb4, 0x14, 0x35, 0xbc, 0x54, 0x04, 0x40, 0xbc, 0x2d, 0xba, 0x0d, 0xbc, - 0x0e, 0x5e, 0x54, 0xbc, 0x97, 0x20, 0x34, 0xbc, 0xe8, 0xb3, 0xf9, 0xbb, 0x8e, 0x34, 0x5f, 0xbc, - 0x1c, 0x4b, 0x1f, 0xba, 0x49, 0x91, 0xef, 0xbb, 0x47, 0x78, 0x35, 0xbc, 0xc4, 0x09, 0x03, 0xbc, - 0x12, 0x0f, 0xee, 0xbb, 0x3f, 0xd8, 0xf6, 0xbb, 0xc8, 0x2b, 0x88, 0xbb, 0x03, 0x15, 0x05, 0xbc, - 0xa0, 0xb7, 0x2d, 0xbb, 0x14, 0xca, 0xbd, 0xbb, 0xf8, 0xb9, 0xb3, 0xbb, 0x7a, 0x14, 0x9a, 0xbb, - 0x6d, 0x51, 0x96, 0xbb, 0x9e, 0xc8, 0xb5, 0xba, 0x97, 0x30, 0x31, 0xbc, 0x4c, 0x98, 0xca, 0xbb, - 0x9d, 0xfd, 0x91, 0xbb, 0x91, 0xeb, 0xa5, 0xbb, 0x8a, 0x03, 0xbe, 0xbb, 0x44, 0x96, 0xbc, 0xbb, - 0x1a, 0x6e, 0x81, 0xbb, 0x96, 0x3f, 0x88, 0xbb, 0x39, 0xfc, 0xc9, 0xbb, 0x4a, 0xe2, 0xea, 0xbb, - 0x99, 0xd1, 0x03, 0xbc, 0xa1, 0x0f, 0x5b, 0xbb, 0x0e, 0x6a, 0xa5, 0xbb, 0x12, 0x36, 0x35, 0xbb, - 0x89, 0x59, 0xa7, 0xbb, 0x91, 0x5a, 0x89, 0xbb, 0xf5, 0x79, 0x06, 0xbb, 0x0c, 0xea, 0xe3, 0xbb, - 0xb3, 0x3d, 0xae, 0xbb, 0xfc, 0x18, 0xae, 0xbb, 0x48, 0xd2, 0xb0, 0xbb, 0xaa, 0x57, 0xb7, 0xbb, - 0xe4, 0xf9, 0x0b, 0xbc, 0x74, 0xb0, 0xcb, 0xbb, 0xa8, 0x17, 0xb6, 0xbb, 0xcd, 0x29, 0xf3, 0xbb, - 0x3d, 0xbf, 0x01, 0xbc, 0x29, 0x3f, 0xe5, 0xbb, 0x29, 0xe8, 0xef, 0xbb, 0x71, 0x4b, 0xa2, 0xbb, - 0xf6, 0xbe, 0x0b, 0xbc, 0xa0, 0x12, 0x9e, 0xbb, 0xfb, 0x8c, 0x90, 0xbb, 0x1c, 0xd1, 0xbb, 0xbb, - 0x0e, 0x0b, 0x6e, 0xbb, 0xa2, 0x4f, 0xc8, 0xbb, 0xa2, 0x62, 0xbd, 0xbb, 0x10, 0xa4, 0xd5, 0xbb, - 0x3c, 0x53, 0x43, 0xbb, 0x62, 0x15, 0xce, 0xbb, 0xaa, 0x28, 0xfd, 0xbb, 0x36, 0x1d, 0xe6, 0xbb, - 0x9d, 0xce, 0xe8, 0xbb, 0x95, 0xb5, 0xee, 0xbb, 0x61, 0x5b, 0xf4, 0xbb, 
0xa2, 0x78, 0xa9, 0xbb, - 0x54, 0x6d, 0xc8, 0xbb, 0xfc, 0x34, 0xbc, 0xbb, 0x96, 0x91, 0x4a, 0xbb, 0x9d, 0x34, 0xb9, 0xbb, - 0x45, 0x61, 0x6e, 0xba, 0xf9, 0xb2, 0xf2, 0xba, 0x4c, 0x34, 0x88, 0xbb, 0x1b, 0x80, 0x4a, 0xbb, - 0x15, 0xd6, 0x99, 0xbb, 0x29, 0x6e, 0x23, 0xbb, 0xe0, 0xc3, 0x75, 0xba, 0xfa, 0xb4, 0x5b, 0xbb, - 0x6b, 0xf5, 0x26, 0xbb, 0x76, 0xd8, 0x79, 0xbb, 0xe6, 0x45, 0x26, 0xbb, 0xe0, 0x6a, 0xc0, 0xba, - 0x0f, 0x6b, 0x90, 0xbb, 0x66, 0x3e, 0xa2, 0xb9, 0xfe, 0x47, 0x39, 0xbb, 0x72, 0x63, 0x69, 0xbb, - 0xe6, 0x13, 0x01, 0xba, 0x05, 0x39, 0x9e, 0xba, 0xd6, 0x72, 0x5c, 0xbb, 0x62, 0xa4, 0x33, 0xbb, - 0x2b, 0x6a, 0x29, 0xba, 0xcc, 0x33, 0x14, 0xbb, 0xea, 0xa6, 0x94, 0xbb, 0x1b, 0x7e, 0x0c, 0xbb, - 0xba, 0x8a, 0x65, 0xbb, 0x92, 0xc9, 0x60, 0xbb, 0x13, 0x0c, 0x87, 0xbb, 0xf3, 0x17, 0x12, 0xbb, - 0x16, 0x78, 0x0f, 0xbb, 0x5a, 0x83, 0x19, 0xbb, 0x3e, 0xd5, 0xa0, 0xb9, 0x29, 0xea, 0x17, 0xbb, - 0xb2, 0x04, 0xaa, 0xbd, 0x12, 0xb9, 0x5c, 0xbe, 0xac, 0xf9, 0x73, 0xbe, 0xea, 0xc3, 0x4c, 0xbe, - 0xb9, 0xcb, 0x3f, 0xbe, 0xcc, 0x19, 0x5f, 0xbe, 0x30, 0xc1, 0x2f, 0xbe, 0x32, 0x6a, 0x69, 0xbe, - 0xf6, 0xd8, 0x04, 0xbe, 0x9e, 0x2d, 0x1a, 0xbe, 0x4c, 0xcd, 0x3f, 0xbe, 0xbe, 0xc0, 0x21, 0xbe, - 0x88, 0x8f, 0x06, 0xbe, 0xde, 0xfc, 0xe2, 0xbd, 0x38, 0x1f, 0x82, 0xbe, 0x3b, 0x31, 0x14, 0xbe, - 0xfc, 0x50, 0x21, 0xbe, 0x1a, 0x08, 0x48, 0xbe, 0x4e, 0xb4, 0x11, 0xbe, 0xee, 0x67, 0x31, 0xbe, - 0x8b, 0x27, 0x04, 0xbe, 0x6f, 0xa3, 0x14, 0xbe, 0x8e, 0x87, 0x1a, 0xbe, 0x08, 0xbf, 0x6f, 0xbe, - 0x95, 0x15, 0x5d, 0xbe, 0x07, 0x1b, 0xde, 0xbd, 0x9e, 0xe7, 0x06, 0xbe, 0xd0, 0x99, 0xbd, 0xbd, - 0x5c, 0x00, 0x2b, 0xbe, 0x28, 0xec, 0x08, 0xbe, 0x5f, 0xf4, 0xc3, 0xbd, 0x5e, 0xe7, 0x4b, 0xbe, - 0x4e, 0x5f, 0xc8, 0xbc, 0x46, 0xdb, 0x9e, 0xbd, 0xef, 0xa0, 0x41, 0xbd, 0xab, 0x68, 0xff, 0xbc, - 0xc8, 0x4e, 0x21, 0xbe, 0x13, 0x27, 0x35, 0xbd, 0x00, 0xeb, 0x6d, 0x39, 0x3c, 0xac, 0xb2, 0xbd, - 0x0e, 0xd7, 0xb8, 0xbd, 0x0e, 0xf8, 0x62, 0xbd, 0xa4, 0x2e, 0x6b, 0xbd, 0xb3, 0x47, 0x90, 0xbd, - 0x94, 0x32, 
0x34, 0xbe, 0x94, 0x6e, 0xe3, 0xbc, 0xdc, 0x78, 0x58, 0xbd, 0x50, 0x27, 0xa1, 0xbd, - 0xfb, 0xe7, 0x14, 0x3c, 0x5e, 0xdb, 0xf5, 0xbc, 0x00, 0x9e, 0x5b, 0xbc, 0x74, 0xa2, 0xca, 0xbc, - 0xd2, 0xa8, 0x96, 0x3c, 0xaa, 0xa3, 0xa2, 0xbd, 0x30, 0xe9, 0xda, 0xbc, 0x22, 0x34, 0xa2, 0xbc, - 0x56, 0x51, 0x12, 0xbd, 0xe4, 0x2a, 0xf5, 0xbd, 0x40, 0x7e, 0x13, 0xbe, 0x28, 0xcc, 0xf8, 0xbc, - 0x74, 0x31, 0xb7, 0x3c, 0x14, 0x39, 0xd6, 0xbc, 0x5d, 0x0b, 0x32, 0xbd, 0x3d, 0xc2, 0xc5, 0xbd, - 0xc4, 0x69, 0xb1, 0x3c, 0xf4, 0x76, 0x60, 0x3d, 0x06, 0xd7, 0x37, 0x3d, 0xf6, 0xb5, 0x60, 0x3d, - 0x10, 0xec, 0x66, 0x3d, 0xe0, 0x4f, 0x78, 0x3d, 0xa1, 0x43, 0x13, 0x3d, 0x83, 0x30, 0x8b, 0x3d, - 0x1e, 0x15, 0xb1, 0x3d, 0xb2, 0xcc, 0x49, 0x3d, 0x83, 0x31, 0x36, 0x3d, 0xd2, 0x30, 0x19, 0x3d, - 0x4a, 0x52, 0x5e, 0x3d, 0x42, 0x31, 0x21, 0x3d, 0x42, 0xfb, 0x58, 0x3d, 0x98, 0x36, 0x13, 0x3d, - 0x31, 0xd4, 0x50, 0x3c, 0xf4, 0x41, 0x04, 0x3d, 0x18, 0x73, 0x1d, 0x3d, 0x7e, 0xad, 0xc7, 0x3c, - 0x22, 0x1f, 0x28, 0x3c, 0xba, 0xa6, 0x32, 0x3d, 0xef, 0x6f, 0xde, 0x3c, 0xa0, 0x01, 0x61, 0x3d, - 0xcc, 0x45, 0x80, 0x3d, 0xe7, 0xd7, 0x17, 0x3d, 0x2e, 0xe3, 0x86, 0x3d, 0xdf, 0x56, 0xb8, 0x3c, - 0x3f, 0xe9, 0xea, 0x3c, 0x5a, 0x7b, 0x57, 0x3d, 0x34, 0xca, 0xd3, 0x3b, 0x4a, 0x71, 0x1d, 0x3d, - 0x5a, 0x46, 0x90, 0xbb, 0xb6, 0x30, 0x1f, 0x3d, 0x46, 0x6b, 0xed, 0x3c, 0x45, 0xd1, 0x90, 0x3c, - 0x0a, 0x1e, 0x3b, 0x3d, 0xff, 0x62, 0xc2, 0x3c, 0xc0, 0x9a, 0xb7, 0xbb, 0x30, 0x84, 0x12, 0xbc, - 0x54, 0x66, 0xc3, 0xbb, 0x82, 0xe2, 0x00, 0x3d, 0x7c, 0xde, 0x18, 0x3c, 0x58, 0x95, 0x30, 0x3c, - 0x83, 0xf6, 0x17, 0x3d, 0xbc, 0xa1, 0x8b, 0x3b, 0xd1, 0xab, 0x29, 0x3d, 0xd8, 0x1d, 0xcc, 0x3c, - 0x1d, 0xfb, 0x63, 0xbc, 0x8c, 0xe6, 0x6b, 0x3c, 0x57, 0xeb, 0x07, 0xbc, 0x48, 0x3e, 0xa4, 0x3c, - 0xc0, 0x70, 0xcd, 0xba, 0xd6, 0xdd, 0x29, 0x3c, 0xfc, 0x60, 0x8b, 0x3b, 0x8c, 0x42, 0xb6, 0x3c, - 0xc2, 0x13, 0xbf, 0xbc, 0xe9, 0x13, 0xaf, 0x3c, 0x70, 0x21, 0x77, 0x3d, 0x6d, 0xc6, 0xb2, 0x3b, - 0x3a, 0xcb, 0x10, 0xbc, 0x51, 0xbe, 0x98, 0x3c, 
0xa0, 0xaf, 0x2e, 0x3c, 0xa4, 0xb1, 0xa1, 0x3c, - 0x23, 0x09, 0x11, 0xbb, 0x6c, 0x4f, 0xf8, 0xbb, 0x1a, 0x75, 0xd8, 0xbb, 0xec, 0x47, 0x04, 0xbc, - 0x54, 0xe4, 0xb3, 0xbb, 0xae, 0x45, 0x10, 0xbc, 0xde, 0xc8, 0xa7, 0xbb, 0x6a, 0x1c, 0xe0, 0xbb, - 0xc6, 0x52, 0x1e, 0xbc, 0x3a, 0x30, 0xe9, 0xbb, 0x28, 0x34, 0xab, 0xbb, 0x5c, 0x13, 0x79, 0xbb, - 0x34, 0x5c, 0x87, 0xbb, 0x97, 0x16, 0xaa, 0xbb, 0x9b, 0x42, 0x07, 0xbc, 0xd8, 0x87, 0x83, 0xbb, - 0x51, 0x1c, 0xc7, 0xba, 0xae, 0x84, 0x95, 0xbb, 0xd8, 0xb4, 0xa0, 0xbb, 0x51, 0x46, 0x7b, 0xbb, - 0xae, 0xe9, 0x07, 0xbb, 0x92, 0xa0, 0x90, 0xbb, 0x80, 0x7c, 0x5e, 0xbb, 0x8e, 0x6a, 0x0e, 0xbc, - 0xf0, 0x9e, 0xe6, 0xbb, 0x72, 0xf1, 0x2c, 0xbb, 0xc5, 0xa0, 0x04, 0xbc, 0x1e, 0xbd, 0x2c, 0xbb, - 0x9f, 0xe4, 0x96, 0xbb, 0x2c, 0x48, 0x02, 0xbc, 0x34, 0x19, 0x61, 0xb8, 0xd2, 0x98, 0x67, 0xbb, - 0x96, 0xff, 0xcd, 0x39, 0x3a, 0x17, 0xc4, 0xbb, 0x64, 0x5a, 0xa4, 0xbb, 0x64, 0xb8, 0x8e, 0xbb, - 0x1c, 0xd9, 0x90, 0xbb, 0x42, 0xa6, 0xa7, 0xbb, 0x14, 0xe7, 0x7c, 0xba, 0xec, 0x29, 0x5f, 0x3a, - 0x10, 0x65, 0xdc, 0xb9, 0x46, 0xdb, 0xae, 0xbb, 0x62, 0x17, 0xe0, 0xba, 0x2b, 0x35, 0x95, 0xba, - 0x91, 0x25, 0x1e, 0xbb, 0xd1, 0x63, 0xdc, 0xba, 0x0f, 0xac, 0xe8, 0xbb, 0x38, 0xd0, 0x3e, 0xbb, - 0x55, 0x56, 0x96, 0x3a, 0xdc, 0xc8, 0x35, 0xbb, 0x30, 0x03, 0x72, 0xb9, 0x59, 0xed, 0x5e, 0xbb, - 0x8f, 0x91, 0x69, 0xba, 0xd0, 0xb7, 0x8e, 0xba, 0x1a, 0xd3, 0x90, 0xba, 0x3a, 0xbb, 0xb1, 0xbb, - 0x5c, 0x2f, 0xcc, 0x3a, 0xa8, 0x1c, 0x8c, 0xba, 0x00, 0x24, 0xf7, 0xbb, 0xd0, 0x16, 0x74, 0xba, - 0x34, 0xc2, 0x5f, 0xba, 0x97, 0x46, 0x95, 0xbb, 0xd6, 0x44, 0xf8, 0xb9, 0x8c, 0x16, 0xda, 0xba, - 0x21, 0x18, 0xa7, 0xba, 0x72, 0x90, 0x3b, 0xbb, 0x44, 0x32, 0x0a, 0xbb, 0x34, 0x66, 0x1a, 0xbb, - 0x80, 0x98, 0x88, 0xbb, 0x1c, 0xdb, 0x31, 0xbb, 0xce, 0x66, 0xb9, 0xba, 0x9c, 0xd2, 0x86, 0xbb, - 0x91, 0xfb, 0x9d, 0xbb, 0xee, 0xd3, 0x1b, 0xbb, 0x49, 0x63, 0x24, 0xbb, 0xde, 0x08, 0x20, 0xbb, - 0x03, 0x9b, 0x93, 0xbb, 0x0c, 0x19, 0xf7, 0xba, 0x06, 0xfb, 0x19, 0xbb, 0x68, 0xdd, 
0x19, 0xbb, - 0x69, 0x2f, 0x03, 0xba, 0x52, 0xae, 0xc8, 0xba, 0x51, 0xd0, 0xe7, 0xba, 0x6d, 0xfe, 0x8c, 0xba, - 0x18, 0x42, 0xcd, 0xb8, 0x3b, 0x70, 0x39, 0xbb, 0xa0, 0xde, 0xb8, 0xba, 0xac, 0x67, 0x0a, 0xbb, - 0x8d, 0x0d, 0x54, 0xbb, 0xb2, 0x5a, 0x4c, 0xbb, 0xe6, 0xe1, 0x86, 0xbb, 0xc7, 0x02, 0xa8, 0xba, - 0x2c, 0x26, 0x55, 0xba, 0xec, 0x93, 0x0e, 0xbb, 0x86, 0x83, 0x5a, 0xba, 0x78, 0xc4, 0x37, 0xbb, - 0x6c, 0x3d, 0xec, 0x37, 0x40, 0x58, 0x07, 0xbb, 0x1e, 0x2f, 0xac, 0xba, 0xfb, 0x3a, 0x02, 0xba, - 0xc7, 0x25, 0x6e, 0xbb, 0x0a, 0xd8, 0x63, 0xba, 0x50, 0xe2, 0x2d, 0x3a, 0x7b, 0x1c, 0x8d, 0xb9, - 0x7c, 0x7e, 0x51, 0xb9, 0xf7, 0xfd, 0xc2, 0xba, 0xe7, 0xd3, 0x44, 0xba, 0xf4, 0x98, 0x91, 0xba, - 0x9b, 0xec, 0x6e, 0xbb, 0x38, 0xee, 0x08, 0xb9, 0xbe, 0x43, 0xe8, 0xba, 0x56, 0x7b, 0xeb, 0xba, - 0x3c, 0x43, 0x5a, 0x3a, 0xa0, 0xdb, 0x26, 0xba, 0x5a, 0xf2, 0x14, 0x3a, 0xeb, 0x4a, 0x61, 0xba, - 0x52, 0x62, 0x01, 0x3a, 0x3c, 0xfb, 0x98, 0xba, 0xb6, 0x2e, 0x8b, 0xb9, 0x54, 0x6e, 0xfa, 0xb9, - 0x71, 0x3a, 0x8b, 0x3a, 0x22, 0xe9, 0x18, 0xbb, 0x43, 0xa6, 0x7b, 0xbb, 0x76, 0x2d, 0xe1, 0xb9, - 0x9b, 0x45, 0x8b, 0x3a, 0x5a, 0x51, 0xfa, 0xb9, 0xf9, 0xc5, 0x88, 0xba, 0x65, 0xfb, 0xf4, 0xba, - 0x32, 0x4e, 0x6b, 0xbc, 0xad, 0x60, 0x08, 0xbe, 0xc6, 0x8d, 0xf3, 0xbd, 0x20, 0x34, 0x07, 0xbe, - 0xce, 0xad, 0x93, 0xbd, 0x00, 0x0f, 0x15, 0xbe, 0x48, 0x51, 0x8c, 0xbd, 0x79, 0x97, 0x3a, 0xbd, - 0x13, 0x1e, 0xba, 0xbd, 0x8a, 0x44, 0x01, 0xbe, 0xf4, 0x5e, 0x82, 0xbd, 0x74, 0x72, 0x1b, 0xbd, - 0xfe, 0xb2, 0xf2, 0xbc, 0x84, 0x1e, 0x90, 0xbd, 0xc4, 0x5f, 0x23, 0xbe, 0xc7, 0x8d, 0x6e, 0xbd, - 0xd0, 0xa0, 0xd3, 0xbb, 0x32, 0xac, 0x99, 0xbd, 0x14, 0xe7, 0x5e, 0xbd, 0x02, 0x0c, 0x9a, 0xbd, - 0xff, 0x73, 0x1b, 0xbd, 0x97, 0x3c, 0x2c, 0xbd, 0x9b, 0xfc, 0x33, 0xbd, 0x1d, 0x8d, 0x1d, 0xbe, - 0x84, 0xf2, 0x57, 0xbd, 0x94, 0x19, 0x41, 0xbc, 0x22, 0x44, 0x0f, 0xbe, 0x76, 0x43, 0x05, 0xbd, - 0x68, 0x36, 0x89, 0xbd, 0xc7, 0x63, 0x09, 0xbe, 0x9c, 0xbd, 0xa1, 0x3b, 0x42, 0xa2, 0x0c, 0xbd, - 0x3a, 0x20, 0x79, 0xbd, 
0xe6, 0xe6, 0x9a, 0xbd, 0x1a, 0xda, 0x9d, 0xbd, 0xc1, 0xfd, 0xc3, 0xbd, - 0x01, 0x64, 0xe5, 0xbd, 0xfa, 0xab, 0x6b, 0xbd, 0x9e, 0x60, 0xdd, 0xbc, 0xf4, 0xc5, 0xb7, 0xbd, - 0x74, 0x78, 0xe0, 0xbd, 0xa8, 0xbb, 0xa8, 0xbd, 0x16, 0x1d, 0x84, 0xbd, 0xce, 0xb1, 0x6a, 0xbd, - 0xbc, 0xea, 0x8a, 0xbd, 0x4c, 0xb3, 0x06, 0xbd, 0x27, 0x54, 0x8d, 0xbd, 0xee, 0x96, 0xe4, 0xbd, - 0x28, 0x90, 0xa4, 0xbd, 0xe5, 0xb1, 0xce, 0xbd, 0xc9, 0xa1, 0xd4, 0xbd, 0x45, 0x6a, 0xcd, 0xbd, - 0x7f, 0x6a, 0x83, 0xbd, 0xdd, 0x66, 0x3c, 0xbd, 0x80, 0x8b, 0xd8, 0xbd, 0xce, 0x6c, 0xac, 0xbd, - 0xe0, 0x35, 0xb0, 0xbd, 0x42, 0xef, 0x01, 0xbe, 0x81, 0xfe, 0xad, 0xbd, 0x9e, 0x79, 0x8a, 0xbd, - 0xe9, 0x9c, 0x05, 0xbe, 0x83, 0x68, 0xd6, 0xbd, 0xe5, 0xd8, 0xf9, 0xbb, 0x78, 0x51, 0x3f, 0xbd, - 0x8a, 0x7c, 0x1b, 0x3d, 0xc4, 0xb4, 0x2a, 0x3d, 0x02, 0xf4, 0x61, 0x3d, 0x54, 0x73, 0x3b, 0x3d, - 0x08, 0x10, 0x67, 0x3d, 0x37, 0x52, 0x62, 0x3d, 0xb8, 0xf0, 0x4b, 0x3d, 0x8c, 0x3b, 0x8d, 0x3d, - 0xb2, 0x73, 0x63, 0x3d, 0x08, 0xc7, 0x4c, 0x3d, 0x50, 0xbd, 0x5c, 0x3d, 0x76, 0x79, 0x5a, 0x3d, - 0xbb, 0x8c, 0x38, 0x3d, 0xa9, 0x40, 0x07, 0x3d, 0x20, 0x6f, 0x42, 0x3d, 0x66, 0xe2, 0x6f, 0x3d, - 0x0d, 0x2b, 0x61, 0x3d, 0x86, 0x6e, 0x71, 0x3d, 0x18, 0x81, 0x42, 0x3d, 0x3f, 0x31, 0x58, 0x3d, - 0x4c, 0x4e, 0x6b, 0x3d, 0x29, 0x2a, 0x48, 0x3d, 0xdb, 0x55, 0x6a, 0x3d, 0x3a, 0x57, 0x80, 0x3d, - 0x24, 0x4a, 0x80, 0x3d, 0x4d, 0x96, 0x8f, 0x3d, 0x51, 0xd3, 0x5e, 0x3d, 0x3a, 0xcf, 0x39, 0x3d, - 0x36, 0xbc, 0x91, 0x3d, 0x8d, 0x4f, 0x41, 0x3d, 0x10, 0xaf, 0xe4, 0x3c, 0xd0, 0xee, 0x5b, 0x3d, - 0x08, 0x64, 0xa2, 0x3b, 0x6a, 0x94, 0xce, 0x3c, 0x59, 0xe4, 0x14, 0x3d, 0x76, 0x5b, 0x0d, 0x3d, - 0x24, 0xa2, 0xdb, 0x3c, 0x1d, 0x93, 0x3b, 0x3c, 0xc0, 0x0f, 0x03, 0xbb, 0x9e, 0xd3, 0xd0, 0x3c, - 0x42, 0x0f, 0x00, 0x3d, 0x6c, 0xd5, 0xb6, 0x3c, 0xdc, 0xef, 0xc8, 0x3b, 0x11, 0x2b, 0x0b, 0x3c, - 0x22, 0xd4, 0x9c, 0x3c, 0x31, 0x95, 0xb5, 0xbb, 0x0a, 0xf1, 0xd0, 0x3c, 0xeb, 0xbc, 0xf9, 0x3c, - 0x06, 0x6a, 0xb1, 0x3b, 0x6a, 0x55, 0xd6, 0x3c, 0x49, 0xe2, 
0xa7, 0x3c, 0x16, 0xd1, 0xc3, 0x3c, - 0x95, 0x89, 0xb5, 0x3c, 0xee, 0xab, 0xf1, 0x3b, 0x16, 0x48, 0xa4, 0x3c, 0xd3, 0xe0, 0xe2, 0x3c, - 0x00, 0x15, 0x02, 0x3d, 0x6d, 0xec, 0x56, 0x3c, 0x79, 0x14, 0x2f, 0x3c, 0xcd, 0xe9, 0x64, 0x3c, - 0x65, 0xaf, 0xc1, 0x3c, 0x3c, 0xae, 0x98, 0x3c, 0x10, 0x87, 0x6d, 0x3c, 0x65, 0x95, 0x41, 0x3c, - 0x0e, 0x45, 0x80, 0xbb, 0x86, 0x8c, 0xa3, 0xbb, 0x4b, 0x83, 0xf6, 0xbb, 0x71, 0xf4, 0xb0, 0xbb, - 0x32, 0xdc, 0xc6, 0xbb, 0x3e, 0x7f, 0xe3, 0xbb, 0x51, 0xef, 0xd3, 0xbb, 0x08, 0x5b, 0x0d, 0xbc, - 0xd3, 0x72, 0xcc, 0xbb, 0x0a, 0x4f, 0xc0, 0xbb, 0x42, 0x79, 0xcc, 0xbb, 0x3f, 0x80, 0xd5, 0xbb, - 0x82, 0xbc, 0xb2, 0xbb, 0x93, 0x9b, 0x6a, 0xbb, 0xe1, 0xbb, 0xc7, 0xbb, 0xf7, 0xe4, 0xd7, 0xbb, - 0x36, 0x4e, 0xbf, 0xbb, 0xf9, 0x58, 0xde, 0xbb, 0x05, 0x94, 0x99, 0xbb, 0x1a, 0x0a, 0xbd, 0xbb, - 0x02, 0x96, 0xf8, 0xbb, 0x6c, 0x0a, 0xca, 0xbb, 0x72, 0x91, 0xc6, 0xbb, 0x7a, 0xb7, 0x02, 0xbc, - 0x78, 0xf9, 0x04, 0xbc, 0x28, 0x81, 0xe3, 0xbb, 0x07, 0x1a, 0xc0, 0xbb, 0x63, 0x02, 0xac, 0xbb, - 0x24, 0xfb, 0xf5, 0xbb, 0x6f, 0x28, 0x94, 0xbb, 0x84, 0xa2, 0x98, 0xbb, 0x74, 0x42, 0xe8, 0xbb, - 0x02, 0xf5, 0x20, 0xba, 0x5b, 0x3b, 0x5b, 0xbb, 0x2a, 0xdd, 0xb8, 0xbb, 0x8c, 0x14, 0x8c, 0xbb, - 0x0e, 0xba, 0x4b, 0xbb, 0xfd, 0xe8, 0x27, 0xbb, 0x08, 0xeb, 0xa8, 0xba, 0x09, 0x45, 0x8c, 0xbb, - 0x8e, 0xde, 0x79, 0xbb, 0x91, 0x3d, 0x4b, 0xbb, 0x29, 0xe7, 0xbf, 0xba, 0x2e, 0x1c, 0x05, 0xbb, - 0xf5, 0xa7, 0x3b, 0xbb, 0x60, 0xe7, 0x90, 0x39, 0xca, 0x86, 0x7f, 0xbb, 0xa0, 0xc3, 0x77, 0xbb, - 0x96, 0x4c, 0x67, 0xba, 0x08, 0xdf, 0x65, 0xbb, 0xb6, 0x41, 0x02, 0xbb, 0x14, 0xd3, 0x3c, 0xbb, - 0xce, 0xf4, 0x84, 0xbb, 0xfb, 0x26, 0x04, 0xbb, 0xfc, 0x9f, 0x19, 0xbb, 0xea, 0xd6, 0x92, 0xbb, - 0x50, 0xc0, 0xa4, 0xbb, 0xd7, 0xff, 0xa2, 0xba, 0xe0, 0x6d, 0xc3, 0xba, 0xf0, 0x49, 0x0a, 0xbb, - 0xba, 0x8e, 0x34, 0xbb, 0x8e, 0x53, 0xda, 0xba, 0x68, 0x55, 0x59, 0xbb, 0x5e, 0x0f, 0x3e, 0xbb, - 0x6b, 0xaa, 0x1b, 0xbb, 0x4d, 0x61, 0x24, 0xbb, 0x30, 0xfb, 0x3d, 0xbb, 0x3e, 0xba, 0x3c, 0xbb, - 
0x14, 0x04, 0x6f, 0xbb, 0x80, 0x42, 0x3d, 0xbb, 0xf9, 0x84, 0x19, 0xbb, 0xdf, 0x9b, 0x76, 0xbb, - 0xac, 0x93, 0x66, 0xbb, 0xf6, 0x92, 0x42, 0xbb, 0x82, 0xe1, 0x45, 0xbb, 0x24, 0xb6, 0x3a, 0xbb, - 0xd1, 0x22, 0x29, 0xbb, 0xf2, 0x38, 0xf3, 0xba, 0x41, 0x24, 0x2b, 0xbb, 0x0a, 0xdf, 0x70, 0xbb, - 0xba, 0x6e, 0x5a, 0xbb, 0xb9, 0xa9, 0x69, 0xbb, 0x62, 0x51, 0x55, 0xbb, 0x40, 0xb0, 0x5b, 0xbb, - 0x49, 0x1e, 0x41, 0xbb, 0x29, 0x4c, 0x24, 0xbb, 0xe8, 0xa9, 0x6f, 0xbb, 0xf4, 0x47, 0x5f, 0xbb, - 0xf8, 0xe8, 0x5d, 0xbb, 0xe0, 0x13, 0x96, 0xbb, 0xf8, 0xbc, 0x59, 0xbb, 0xb6, 0xe4, 0x2d, 0xbb, - 0x60, 0x04, 0x95, 0xbb, 0xc8, 0x11, 0x57, 0xbb, 0x31, 0x1d, 0x82, 0xba, 0x79, 0x4c, 0x2d, 0xbb, - 0xf4, 0x0f, 0x3e, 0xba, 0xe7, 0xe4, 0xdc, 0xba, 0x10, 0x55, 0x00, 0xbb, 0x58, 0xda, 0x17, 0xbb, - 0xe8, 0x04, 0x0e, 0xbb, 0x01, 0xdf, 0x36, 0xba, 0xac, 0x7c, 0x81, 0x39, 0xb0, 0x55, 0xd0, 0xba, - 0x20, 0x10, 0x17, 0xbb, 0x68, 0xc5, 0xcf, 0xba, 0x27, 0x88, 0x25, 0xba, 0xee, 0x0f, 0x1f, 0xba, - 0x92, 0x74, 0xa8, 0xba, 0xd0, 0xf8, 0x97, 0x38, 0x8a, 0x57, 0xc6, 0xba, 0xe4, 0xdb, 0x14, 0xbb, - 0x5f, 0x28, 0x60, 0xba, 0x87, 0x80, 0xfc, 0xba, 0x6f, 0xbc, 0xf9, 0xba, 0x5f, 0x1f, 0xfa, 0xba, - 0x29, 0xfa, 0x9a, 0xba, 0xd2, 0xa9, 0xe2, 0xb9, 0xe8, 0xd0, 0xeb, 0xba, 0xd7, 0x5f, 0xd9, 0xba, - 0xb2, 0x6c, 0xf1, 0xba, 0x1c, 0xcd, 0xe2, 0xba, 0xd2, 0xfc, 0x94, 0xba, 0x94, 0x0e, 0x8e, 0xba, - 0xf9, 0x54, 0x0e, 0xbb, 0x78, 0xfc, 0xf2, 0xba, 0x44, 0xb6, 0xa8, 0xb9, 0xd2, 0x46, 0x10, 0xba, - 0x1e, 0x64, 0x12, 0xbd, 0xf3, 0x57, 0x99, 0xbd, 0xa6, 0xec, 0x03, 0xbe, 0xe1, 0xea, 0xad, 0xbd, - 0x3c, 0x19, 0x9a, 0xbd, 0xe1, 0x87, 0xbd, 0xbd, 0x8b, 0xdf, 0xa4, 0xbd, 0x10, 0x77, 0xfb, 0xbd, - 0xdd, 0x7a, 0xaf, 0xbd, 0x4e, 0x7c, 0xa4, 0xbd, 0x9f, 0x79, 0x90, 0xbd, 0xda, 0x93, 0xa7, 0xbd, - 0xe9, 0xea, 0x9c, 0xbd, 0x74, 0x18, 0xed, 0xbc, 0x73, 0xf0, 0xc2, 0xbd, 0x08, 0xc2, 0xb5, 0xbd, - 0x5e, 0x66, 0x63, 0xbd, 0x7e, 0x7a, 0xb9, 0xbd, 0x76, 0x3d, 0x49, 0xbd, 0xbe, 0xf2, 0x93, 0xbd, - 0xf9, 0x6a, 0xeb, 0xbd, 0x2d, 0xf8, 
0xa4, 0xbd, 0xa2, 0x7d, 0x8b, 0xbd, 0xed, 0xf5, 0xf3, 0xbd, - 0x48, 0x64, 0x01, 0xbe, 0x4f, 0x4c, 0x77, 0xbd, 0xf9, 0xd9, 0x7b, 0xbd, 0x44, 0xf1, 0x86, 0xbd, - 0x4e, 0xa0, 0xa9, 0xbd, 0xfb, 0xe5, 0x31, 0xbd, 0xe7, 0xd2, 0xb3, 0xbd, 0xf2, 0x35, 0xce, 0xbd, - 0x76, 0x2d, 0x5b, 0xbc, 0x8c, 0xdf, 0x78, 0xbd, 0x85, 0xa0, 0xbc, 0xbc, 0x48, 0x02, 0x1e, 0xbd, - 0xc8, 0xa3, 0x85, 0xbd, 0x4d, 0xcb, 0x11, 0xbc, 0x82, 0x55, 0xca, 0x3c, 0x35, 0x7f, 0x26, 0xbd, - 0x06, 0xe6, 0x59, 0xbd, 0x2c, 0xe9, 0xef, 0xbb, 0x32, 0x84, 0xbb, 0xbb, 0x38, 0xfa, 0x23, 0xbd, - 0x87, 0xc5, 0xda, 0xbc, 0xd6, 0x17, 0x3c, 0xbb, 0x17, 0x77, 0x3b, 0xbd, 0xe8, 0x1a, 0x84, 0xbd, - 0x89, 0x1b, 0x49, 0xbd, 0xa7, 0x8e, 0x6d, 0xbd, 0x14, 0x7c, 0xd0, 0xbc, 0xa6, 0xe5, 0xf6, 0xbc, - 0xbe, 0x40, 0xf8, 0xbc, 0x00, 0xce, 0xfa, 0xbb, 0x00, 0xac, 0x12, 0xb7, 0xd3, 0xc8, 0xa5, 0xbc, - 0x78, 0x5c, 0x3d, 0xbc, 0xcc, 0x58, 0xa3, 0xbd, 0xd4, 0xe3, 0x30, 0xbd, 0x03, 0x09, 0x36, 0xbc, - 0x48, 0x6d, 0x3f, 0xbd, 0x86, 0x3a, 0x32, 0xbd, 0x80, 0xcc, 0x4a, 0xbb, 0x87, 0x89, 0xf7, 0xbc, - 0x95, 0xb6, 0x8b, 0x3b, 0x92, 0x37, 0x82, 0x3c, 0x00, 0xaf, 0xe2, 0x3c, 0xaa, 0x9d, 0xb6, 0x3c, - 0xb2, 0x6b, 0x88, 0x3c, 0x06, 0x60, 0xdd, 0x3c, 0x19, 0x81, 0x67, 0x3c, 0xa1, 0x36, 0x16, 0x3d, - 0x44, 0x10, 0x0f, 0x3d, 0xe8, 0xce, 0x9c, 0x3c, 0xfb, 0xe0, 0x43, 0x3c, 0x2c, 0x11, 0xd0, 0x3c, - 0x13, 0xdb, 0x07, 0x3c, 0xf4, 0xf8, 0x03, 0x3c, 0xb8, 0x38, 0xdb, 0x3c, 0xe6, 0xf6, 0xf8, 0x3c, - 0x02, 0xeb, 0xb5, 0x3c, 0xde, 0xe7, 0x82, 0x3c, 0x5e, 0x1d, 0xa3, 0x3c, 0x67, 0x07, 0x0b, 0x3c, - 0x44, 0xa8, 0xfe, 0x3c, 0x84, 0xcb, 0x8b, 0x3c, 0x70, 0x35, 0x3e, 0x3c, 0x80, 0xfa, 0xbe, 0x3c, - 0x30, 0xc4, 0x00, 0x3d, 0x4f, 0x69, 0x09, 0x3d, 0x74, 0xb9, 0xfb, 0x3c, 0xe9, 0x7b, 0x80, 0x3c, - 0xc0, 0x6d, 0xff, 0x3c, 0x12, 0xe8, 0xcd, 0x3c, 0x50, 0xd7, 0x46, 0x3a, 0xd9, 0x95, 0xa1, 0x3c, - 0xf6, 0x5d, 0x17, 0x3b, 0xcc, 0xaa, 0xd5, 0x3c, 0x82, 0xf2, 0x59, 0x3c, 0xa2, 0x5b, 0x8f, 0x3c, - 0xfc, 0x69, 0xa5, 0x3c, 0x84, 0xb1, 0x02, 0x3b, 0xc8, 0x97, 0x48, 0xbc, 
0x69, 0xa4, 0x95, 0xbb, - 0x0c, 0xca, 0x94, 0x3c, 0xcc, 0xf6, 0x6b, 0x3c, 0x28, 0x6c, 0xb5, 0xbb, 0xb4, 0x10, 0x09, 0x3b, - 0x30, 0xf7, 0x75, 0x3c, 0x02, 0x93, 0x3e, 0xbb, 0x0a, 0x85, 0x73, 0x3c, 0xd0, 0x95, 0x97, 0x3c, - 0x9b, 0x19, 0x18, 0xbc, 0x5c, 0xfb, 0xbe, 0x3c, 0xbc, 0x25, 0xce, 0xbb, 0xbe, 0x8e, 0x79, 0x3c, - 0xa6, 0xb1, 0x58, 0x3c, 0x38, 0x0c, 0x4b, 0xba, 0x89, 0x41, 0xa7, 0xbb, 0x71, 0x49, 0xaa, 0x3c, - 0x1c, 0xd2, 0xd1, 0xbb, 0xb4, 0xae, 0xed, 0x3b, 0x65, 0x3e, 0x90, 0x3c, 0x84, 0x3d, 0xaa, 0x3b, - 0x64, 0x00, 0x89, 0x3b, 0x88, 0xf6, 0x5e, 0x3c, 0x7e, 0x92, 0x8d, 0x3c, 0xd4, 0x94, 0xf0, 0x3a, - 0xb0, 0x11, 0xa2, 0xb9, 0x6b, 0xfd, 0xd9, 0xba, 0x95, 0xa7, 0x84, 0xbb, 0xf8, 0x35, 0x40, 0xbb, - 0x86, 0xa4, 0xae, 0xba, 0xdc, 0xbe, 0x7c, 0xbb, 0x5f, 0xbd, 0x18, 0xbb, 0xf2, 0x5f, 0x80, 0xbb, - 0xed, 0x53, 0x8f, 0xbb, 0x5e, 0x5e, 0x5b, 0xbb, 0x96, 0xc4, 0xad, 0xba, 0x42, 0x14, 0x29, 0xbb, - 0x26, 0xbc, 0x90, 0xba, 0x3e, 0x8a, 0x7d, 0xba, 0xd3, 0xc9, 0x53, 0xbb, 0xb4, 0x7a, 0x5a, 0xbb, - 0x9c, 0x3f, 0xa1, 0xba, 0x9a, 0x90, 0xd5, 0xba, 0xd4, 0x0d, 0xef, 0xba, 0xb3, 0xfa, 0x86, 0xba, - 0x06, 0x5d, 0x8d, 0xbb, 0xde, 0x7e, 0x14, 0xbb, 0xd6, 0xc2, 0xc2, 0xba, 0x6d, 0x14, 0x7d, 0xbb, - 0x8f, 0x67, 0x83, 0xbb, 0xa0, 0x51, 0x33, 0xbb, 0x77, 0xb6, 0x84, 0xbb, 0xfa, 0xee, 0x12, 0xbb, - 0xca, 0x1e, 0x5a, 0xbb, 0x0b, 0xa0, 0x44, 0xbb, 0x49, 0x4d, 0x65, 0xba, 0x33, 0xe8, 0x05, 0xbb, - 0x7e, 0x7c, 0xee, 0xb8, 0x4b, 0xc1, 0x2f, 0xbb, 0x2a, 0x24, 0x2b, 0xbb, 0xf2, 0xcd, 0x20, 0xbb, - 0x31, 0x08, 0xdd, 0xba, 0xff, 0x6d, 0xb1, 0xba, 0x10, 0xcb, 0xa0, 0x39, 0x38, 0x53, 0x58, 0x39, - 0x10, 0xc9, 0x30, 0xbb, 0xc3, 0x4e, 0x3c, 0xbb, 0x3e, 0x04, 0xde, 0x39, 0xe8, 0xa7, 0x82, 0xb9, - 0xa4, 0xd2, 0xe8, 0xba, 0xf2, 0x61, 0x07, 0x39, 0xab, 0xd1, 0x05, 0xbb, 0x6e, 0x93, 0x0c, 0xbb, - 0x18, 0x80, 0xfb, 0x3a, 0xe4, 0xd7, 0x1a, 0xbb, 0xe8, 0xc6, 0x50, 0x3a, 0xfa, 0x66, 0xdf, 0xba, - 0xb2, 0xad, 0x25, 0xbb, 0xc6, 0xaa, 0xfc, 0xb9, 0x64, 0x77, 0x42, 0x39, 0xc8, 0x86, 0x6c, 0xbb, - 0x78, 0x19, 
0x6d, 0xb9, 0x98, 0xe8, 0xdf, 0xb8, 0xe1, 0x70, 0x33, 0xbb, 0x80, 0x63, 0x9c, 0xba, - 0xdd, 0xb5, 0x24, 0xba, 0x9a, 0x2b, 0xf2, 0xba, 0xbb, 0x9c, 0x25, 0xbb, 0xb2, 0x57, 0x85, 0xb9, - 0x00, 0x96, 0xb9, 0xb9, 0x12, 0x21, 0xa8, 0xba, 0x16, 0x84, 0xa4, 0xba, 0x5b, 0x2b, 0xa5, 0xba, - 0x96, 0x51, 0xbf, 0xba, 0x37, 0xff, 0x94, 0xba, 0x28, 0x38, 0x8e, 0xb9, 0xad, 0x44, 0x0a, 0xbb, - 0xc4, 0x10, 0x01, 0xbb, 0x16, 0xf7, 0x2c, 0xba, 0xf6, 0xab, 0x20, 0xba, 0x45, 0xde, 0xd0, 0xba, - 0x9f, 0xff, 0x14, 0xba, 0xe3, 0x80, 0xca, 0xb9, 0x94, 0x29, 0xcf, 0xba, 0xa3, 0xfc, 0x01, 0xbb, - 0x41, 0x9e, 0xe7, 0xba, 0x55, 0x0b, 0xa7, 0xba, 0xaa, 0xc5, 0xa3, 0xba, 0x2c, 0x2f, 0x25, 0xba, - 0x64, 0x53, 0xc5, 0xba, 0x93, 0xbb, 0x4b, 0xba, 0x98, 0xe3, 0x02, 0xba, 0x55, 0x7a, 0x79, 0xba, - 0x43, 0x42, 0xbe, 0xba, 0x82, 0x9f, 0x23, 0xbb, 0xf8, 0x86, 0xd8, 0xba, 0x14, 0xb3, 0x39, 0xba, - 0x61, 0x80, 0xf8, 0xba, 0x88, 0x4d, 0xc4, 0xba, 0x14, 0x9c, 0x0e, 0x39, 0x31, 0xf4, 0x9f, 0xba, - 0xb8, 0x5c, 0x86, 0xb9, 0xa8, 0xe3, 0xea, 0xba, 0x2e, 0xb2, 0x0c, 0xba, 0x54, 0xc3, 0x85, 0xba, - 0x6c, 0x83, 0xd6, 0xba, 0x60, 0x8a, 0xf0, 0x38, 0xb7, 0x48, 0x89, 0x3a, 0x58, 0x88, 0x48, 0xb8, - 0xbc, 0x42, 0x94, 0xba, 0xbd, 0xaf, 0xdd, 0xb9, 0x5d, 0x33, 0x89, 0x39, 0x00, 0x7c, 0xe0, 0xb9, - 0x1d, 0x16, 0x6d, 0xba, 0xad, 0xc3, 0x29, 0x39, 0x6c, 0x31, 0x81, 0xba, 0x02, 0x12, 0xb6, 0xba, - 0x38, 0xf3, 0x49, 0xb9, 0xed, 0x1a, 0xd7, 0xba, 0x4c, 0x5f, 0x7e, 0x38, 0x72, 0x9b, 0x7d, 0xba, - 0x14, 0x8e, 0x20, 0xba, 0xba, 0xbb, 0xf0, 0x38, 0x57, 0x5c, 0xb0, 0x39, 0x0a, 0x5f, 0x58, 0xba, - 0x0c, 0xed, 0xe6, 0x39, 0xaa, 0xec, 0x9a, 0xba, 0xec, 0x8a, 0x82, 0xba, 0x82, 0xe2, 0x40, 0xb9, - 0x1d, 0x1e, 0x0f, 0xba, 0x96, 0x86, 0x71, 0xba, 0xcc, 0xeb, 0x34, 0xba, 0xd1, 0xdb, 0xaa, 0xb9, - 0x5c, 0x26, 0xb9, 0xba, 0x1d, 0xff, 0x09, 0xbd, 0x83, 0xa6, 0x8d, 0xbd, 0x2e, 0xb6, 0x50, 0xbd, - 0xa8, 0x91, 0x90, 0xbc, 0x2a, 0xfa, 0x66, 0xbd, 0xfc, 0x1b, 0xed, 0xbc, 0x77, 0x65, 0x01, 0xbd, - 0xf1, 0xb0, 0x87, 0xbd, 0x39, 0x93, 0x8b, 0xbd, 
0x6e, 0xde, 0x0e, 0xbc, 0xe4, 0xbe, 0xb2, 0xbc, - 0x04, 0xc7, 0xd2, 0xbc, 0x7e, 0xb7, 0x0e, 0xbc, 0x8e, 0x23, 0x44, 0xbd, 0x7c, 0x68, 0x3a, 0xbd, - 0x1c, 0x03, 0xa4, 0x3c, 0xf2, 0x21, 0xf9, 0xbc, 0x18, 0x43, 0xaa, 0xbb, 0x44, 0x43, 0xbe, 0xbc, - 0x6d, 0x20, 0x8e, 0xbd, 0x8d, 0x8f, 0xed, 0xbc, 0xd6, 0xa9, 0x6f, 0xbc, 0x03, 0xc4, 0xa1, 0xbd, - 0x38, 0x5f, 0x37, 0xbd, 0xeb, 0x51, 0x45, 0xbc, 0x80, 0x49, 0x86, 0xbd, 0x72, 0xea, 0x12, 0xbd, - 0x4c, 0x76, 0x06, 0xbd, 0x3a, 0xd5, 0x32, 0xbd, 0x97, 0x95, 0x19, 0xbd, 0x90, 0xdc, 0x95, 0xbc, - 0x36, 0xe7, 0xa3, 0xbf, 0x17, 0x3b, 0xb3, 0xbf, 0xd2, 0x62, 0xea, 0xbf, 0x96, 0x22, 0xa8, 0xbf, - 0xe0, 0xb4, 0x75, 0xc0, 0x65, 0x42, 0xea, 0xbf, 0xd1, 0x68, 0x9c, 0xbf, 0xe5, 0x54, 0x1f, 0xc0, - 0x38, 0xc1, 0x22, 0xc0, 0xea, 0xf7, 0x25, 0xc0, 0x1b, 0x85, 0x17, 0xc0, 0xfc, 0xb4, 0xc2, 0xbf, - 0xce, 0x94, 0x94, 0xc0, 0xed, 0x08, 0x96, 0xbf, 0xab, 0x99, 0x91, 0xbf, 0x4f, 0x32, 0xfb, 0xbf, - 0x19, 0x99, 0x7a, 0x3e, 0x4f, 0x09, 0x4c, 0xbf, 0x79, 0xa9, 0xb6, 0xbf, 0xd4, 0x58, 0xbf, 0xbf, - 0xc7, 0x2e, 0x48, 0x3e, 0x9b, 0x45, 0x20, 0xc0, 0xe5, 0xd5, 0x22, 0xc0, 0xd1, 0x17, 0xb1, 0xbf, - 0xf8, 0xcb, 0x03, 0xc0, 0x15, 0xea, 0x30, 0xc0, 0xbc, 0x31, 0x6c, 0xc0, 0x44, 0x6b, 0xce, 0xbf, - 0x30, 0xe0, 0xd1, 0xbe, 0x1e, 0x74, 0x9a, 0xbf, 0xca, 0x37, 0x8b, 0xbf, 0x5c, 0x1c, 0x10, 0xc0, - 0x52, 0x9e, 0x89, 0x3f, 0xda, 0x54, 0xcb, 0x3f, 0x7d, 0xb8, 0x9e, 0x3f, 0x6f, 0x5f, 0xbc, 0x3f, - 0x96, 0x37, 0xef, 0x3f, 0x24, 0xc5, 0xd1, 0x3f, 0xde, 0xe7, 0xaa, 0x3f, 0x8b, 0xd8, 0xe2, 0x3f, - 0xc0, 0xb8, 0x01, 0x40, 0x7e, 0x08, 0xc1, 0x3f, 0xe5, 0xe6, 0xd6, 0x3f, 0xe4, 0x40, 0x8c, 0x3f, - 0x88, 0xb5, 0xee, 0x3f, 0xf7, 0x0a, 0xa6, 0x3f, 0x5e, 0x58, 0xa9, 0x3f, 0x8e, 0xa4, 0x80, 0x3f, - 0xae, 0x13, 0x21, 0x3f, 0xf2, 0xa2, 0xad, 0x3f, 0x2b, 0x09, 0x97, 0x3f, 0x06, 0xe7, 0xa3, 0x3f, - 0x02, 0x0c, 0xb2, 0x3e, 0x5d, 0x0d, 0xb5, 0x3f, 0xbf, 0xa5, 0xaa, 0x3f, 0xee, 0x9d, 0xdf, 0x3f, - 0x2b, 0x04, 0xd8, 0x3f, 0x4a, 0x98, 0x92, 0x3f, 0xb3, 0xac, 0xd1, 0x3f, 0x1d, 0xd6, 
0x62, 0x3f, - 0xc1, 0x4f, 0x84, 0x3f, 0x7b, 0xd4, 0xad, 0x3f, 0x14, 0x85, 0x22, 0x3f, 0x0d, 0x0d, 0xa6, 0x3f, - 0x14, 0xa1, 0xaf, 0xbd, 0x6e, 0xac, 0x1a, 0x3f, 0xab, 0x67, 0x73, 0x3f, 0xf0, 0x3d, 0x05, 0x3f, - 0xe7, 0x24, 0x79, 0x3f, 0xd1, 0x3a, 0x37, 0x3f, 0x1e, 0x8c, 0x49, 0x3e, 0xfe, 0x2f, 0xc3, 0x3e, - 0xc8, 0x3d, 0x00, 0xbe, 0x21, 0x90, 0x39, 0x3f, 0x83, 0x1a, 0x00, 0x3f, 0x50, 0x98, 0xb2, 0x3e, - 0x38, 0xdd, 0x48, 0x3f, 0x98, 0xd4, 0xb7, 0x3d, 0x79, 0xf1, 0x80, 0x3f, 0x32, 0x74, 0x17, 0x3f, - 0x00, 0x0c, 0x2c, 0x3d, 0xd0, 0xd6, 0xef, 0x3d, 0xbe, 0x8f, 0xb7, 0x3e, 0xd3, 0xb2, 0xe8, 0x3e, - 0xac, 0x72, 0x11, 0xbd, 0x72, 0x44, 0xcf, 0x3e, 0x18, 0xbe, 0x1d, 0x3f, 0x0b, 0x57, 0xd1, 0x3e, - 0x34, 0xd8, 0x50, 0x3e, 0x7c, 0x0c, 0x08, 0x3f, 0xb5, 0x5a, 0x8e, 0x3f, 0x31, 0x02, 0x68, 0x3e, - 0xd8, 0x52, 0xb4, 0x3d, 0x5e, 0xda, 0xbf, 0x3e, 0xc0, 0xf8, 0xd2, 0xba, 0x86, 0x9e, 0x21, 0x3f, - 0x76, 0x16, 0xea, 0xbd, 0x37, 0x23, 0x63, 0xbe, 0x92, 0xa4, 0x30, 0xbe, 0x16, 0xdc, 0x4f, 0xbe, - 0x8c, 0xd8, 0x4d, 0xbe, 0xa0, 0xe6, 0x61, 0xbe, 0x7e, 0x74, 0x2d, 0xbe, 0xc6, 0xc1, 0x4e, 0xbe, - 0x11, 0x1e, 0x5a, 0xbe, 0x64, 0x3c, 0x34, 0xbe, 0x40, 0xb1, 0x4a, 0xbe, 0xe7, 0x3d, 0x06, 0xbe, - 0x18, 0x23, 0x28, 0xbe, 0x7e, 0xd6, 0x23, 0xbe, 0xfc, 0xf6, 0x57, 0xbe, 0x50, 0x5a, 0xeb, 0xbd, - 0xca, 0x4c, 0xd6, 0xbd, 0xaa, 0x27, 0x3b, 0xbe, 0xa7, 0xe3, 0x16, 0xbe, 0x4a, 0xed, 0x28, 0xbe, - 0x2d, 0x13, 0x69, 0xbd, 0xd2, 0x82, 0x18, 0xbe, 0xd6, 0x7a, 0x15, 0xbe, 0xab, 0x0c, 0x72, 0xbe, - 0xc4, 0x5e, 0x47, 0xbe, 0xdc, 0x89, 0xd9, 0xbd, 0xbb, 0x94, 0x36, 0xbe, 0x7a, 0x50, 0xb9, 0xbd, - 0x4c, 0x4a, 0x15, 0xbe, 0xfc, 0x97, 0x3a, 0xbe, 0x06, 0x38, 0x71, 0xbd, 0xd8, 0xb9, 0x1a, 0xbe, - 0xc0, 0x89, 0xea, 0x3a, 0x03, 0xaf, 0xfc, 0xbd, 0x3f, 0x07, 0x13, 0xbe, 0xa0, 0xea, 0xdc, 0xbd, - 0x7a, 0x42, 0xe4, 0xbd, 0x71, 0x60, 0x03, 0xbe, 0x73, 0x8a, 0x63, 0xbd, 0xad, 0xa2, 0x80, 0xbd, - 0x80, 0xba, 0x10, 0x3b, 0xb4, 0x11, 0xc8, 0xbd, 0x48, 0x06, 0xa4, 0xbd, 0x36, 0x08, 0x67, 0xbd, - 0xd3, 0x0f, 0x66, 0xbd, 
0xcd, 0xc7, 0x20, 0xbd, 0xac, 0xa4, 0x37, 0xbe, 0xfb, 0xaf, 0x96, 0xbd, - 0xe2, 0x14, 0x3c, 0xbd, 0x63, 0xf3, 0x70, 0xbd, 0xbb, 0x8b, 0x85, 0xbd, 0x79, 0xb0, 0xa8, 0xbd, - 0x68, 0x2f, 0x98, 0xbc, 0xaf, 0x7d, 0x44, 0xbd, 0x26, 0x1e, 0x98, 0xbd, 0x44, 0x0c, 0xd2, 0xbd, - 0x32, 0xc4, 0x3d, 0xbd, 0xc5, 0xa6, 0x37, 0xbd, 0x89, 0xb9, 0x00, 0xbe, 0xe7, 0xfc, 0xc8, 0xbc, - 0x25, 0xd4, 0x51, 0xbd, 0xf2, 0xcc, 0xab, 0xbd, 0x04, 0xc7, 0x9b, 0x3b, 0xd3, 0x10, 0xad, 0xbd, - 0x54, 0x79, 0x80, 0xbd, 0x8f, 0x53, 0x9a, 0xbd, 0x14, 0x65, 0x85, 0xbd, 0x17, 0x89, 0x90, 0xbd, - 0xcb, 0xa8, 0xf9, 0xbd, 0x1b, 0x02, 0xaa, 0xbd, 0xc5, 0x36, 0x8b, 0xbd, 0x34, 0x53, 0xd3, 0xbd, - 0xb3, 0xac, 0xf5, 0xbd, 0xb4, 0xc7, 0xb9, 0xbd, 0x03, 0xf1, 0xc4, 0xbd, 0xd4, 0x49, 0x7e, 0xbd, - 0x18, 0xd0, 0x0c, 0xbe, 0x4e, 0x56, 0x89, 0xbd, 0x58, 0x4a, 0x65, 0xbd, 0xaa, 0x06, 0x81, 0xbd, - 0xf8, 0x93, 0x9b, 0xbc, 0x87, 0x63, 0x80, 0xbd, 0x3e, 0x27, 0x82, 0xbd, 0x7f, 0xbb, 0x89, 0xbd, - 0xef, 0x8b, 0x27, 0xbc, 0x0d, 0xbf, 0xb7, 0xbd, 0xb7, 0x75, 0xad, 0xbd, 0x8e, 0xed, 0xaa, 0xbd, - 0x26, 0xae, 0xc2, 0xbd, 0xd5, 0xb4, 0xa8, 0xbd, 0xdf, 0x4d, 0xdf, 0xbd, 0x70, 0x97, 0x6a, 0xbd, - 0xca, 0x3f, 0x36, 0xbd, 0x9a, 0xe4, 0x87, 0xbd, 0x09, 0xd8, 0x2b, 0xbd, 0xe7, 0x53, 0xa0, 0xbd, - 0x4f, 0x90, 0x99, 0xbb, 0x66, 0x1f, 0xd6, 0xbc, 0x82, 0x8f, 0x4f, 0xbd, 0x45, 0x89, 0xbc, 0xbc, - 0x7d, 0xf1, 0x9d, 0xbd, 0xd8, 0xf7, 0x16, 0xbd, 0x19, 0x27, 0x35, 0xbc, 0x8a, 0xc5, 0x09, 0xbd, - 0xbe, 0x5d, 0x4a, 0xbc, 0x54, 0x28, 0x53, 0xbd, 0xcc, 0x85, 0x18, 0xbd, 0x43, 0xa4, 0xca, 0xbc, - 0x0d, 0x01, 0xab, 0xbd, 0x22, 0x1c, 0xdb, 0xbb, 0xb6, 0xa5, 0x24, 0xbd, 0xfe, 0x62, 0x2d, 0xbd, - 0x71, 0xe1, 0x29, 0x3c, 0x70, 0x5a, 0x3c, 0xba, 0xd2, 0x25, 0xb8, 0xbc, 0xc2, 0x99, 0xd4, 0xbc, - 0x02, 0x6b, 0x12, 0x3c, 0x4e, 0xb7, 0x20, 0xbd, 0xe8, 0x13, 0x48, 0xbd, 0x16, 0x9c, 0x87, 0xbc, - 0xba, 0x01, 0xab, 0xbc, 0x33, 0xb3, 0x53, 0xbd, 0xad, 0x72, 0xa9, 0xbd, 0x5c, 0x0c, 0xc7, 0xbc, - 0xb0, 0xab, 0x36, 0x3b, 0x5d, 0xcc, 0x8c, 0xbc, 0x55, 0xfe, 
0x25, 0xbc, 0xf2, 0x44, 0x38, 0xbd, - 0xb5, 0x14, 0x72, 0xbf, 0xe7, 0x22, 0x62, 0xc0, 0x74, 0x48, 0x46, 0xc0, 0xe9, 0xcc, 0x4a, 0xc0, - 0x69, 0x8e, 0x23, 0xc0, 0xab, 0xed, 0x5d, 0xc0, 0x25, 0x68, 0x0e, 0xc0, 0x36, 0xc4, 0x16, 0xc0, - 0x92, 0x2b, 0xde, 0xbf, 0xb5, 0x92, 0x1b, 0xc0, 0x90, 0x4d, 0x22, 0xc0, 0xb1, 0x52, 0xdd, 0xbf, - 0x1b, 0xbe, 0xa8, 0xbf, 0xa5, 0x69, 0xf9, 0xbf, 0x1e, 0x63, 0x83, 0xc0, 0xa9, 0xa2, 0xd0, 0xbf, - 0x1e, 0xa2, 0xe0, 0xbf, 0x17, 0x32, 0x21, 0xc0, 0x02, 0x36, 0x02, 0xc0, 0xc8, 0x29, 0x1a, 0xc0, - 0x13, 0xe5, 0x65, 0xbf, 0x59, 0x69, 0xcb, 0xbf, 0xfd, 0xbe, 0xeb, 0xbf, 0x3e, 0xe6, 0x5d, 0xc0, - 0x20, 0x97, 0x0c, 0xc0, 0x85, 0x10, 0x81, 0xbf, 0x99, 0xd6, 0x1f, 0xc0, 0x74, 0x0c, 0x64, 0xbf, - 0x86, 0x4b, 0x07, 0xc0, 0x66, 0x4a, 0x2c, 0xc0, 0x01, 0x9d, 0xac, 0xbe, 0x7f, 0x9b, 0x05, 0xc0}; -unsigned char conv2d_winograd_fp32_bias[] = { - 0x94, 0xcb, 0xde, 0x3f, 0x6f, 0x1d, 0xf0, 0x3f, 0x61, 0xfb, 0x8f, 0x40, 0x24, 0xce, 0xdb, 0x3f, - 0x55, 0x18, 0xf2, 0x40, 0x38, 0xa5, 0x64, 0x41, 0x87, 0x80, 0x94, 0xc0, 0xee, 0x19, 0x40, 0x40, - 0x28, 0x08, 0x8a, 0x40, 0x99, 0x24, 0x8c, 0xc0, 0x05, 0x80, 0x41, 0x40, 0xd4, 0x8a, 0xb3, 0x41, - 0x24, 0xe3, 0x2e, 0x41, 0x3c, 0xe6, 0xf7, 0x40, 0xa3, 0x0f, 0xdf, 0xc0, 0x6c, 0xd6, 0xdf, 0x40}; -unsigned char conv2d_winograd_fp32_out[] = { - 0xd3, 0xab, 0x56, 0x42, 0xf0, 0xb2, 0xa1, 0x42, 0xc4, 0x6b, 0xac, 0x42, 0x9c, 0x19, 0xbd, 0x42, - 0x3b, 0xac, 0xcf, 0x42, 0xc7, 0x8f, 0xc6, 0x42, 0x62, 0x76, 0xe7, 0x42, 0xed, 0x1f, 0xc5, 0x42, - 0xf6, 0x91, 0xcf, 0x42, 0xfa, 0x2c, 0x9b, 0x42, 0x5e, 0x2a, 0xcd, 0x42, 0xad, 0x6c, 0xb6, 0x42, - 0xf2, 0xd6, 0xd9, 0x42, 0xc9, 0x6c, 0x41, 0x42, 0x77, 0xc0, 0xa9, 0x42, 0x5c, 0xd0, 0xf6, 0x42, - 0x86, 0x25, 0xb6, 0x42, 0x18, 0x6e, 0xcf, 0x42, 0xf2, 0x6b, 0x19, 0x43, 0xe8, 0x8d, 0xf1, 0x42, - 0x95, 0xa8, 0x3e, 0x43, 0x1d, 0xd9, 0x16, 0x43, 0xce, 0x47, 0x3f, 0x43, 0x8c, 0x4f, 0xf0, 0x42, - 0x1e, 0x75, 0x27, 0x43, 0xa5, 0xbf, 0x0f, 0x43, 0x64, 0xbe, 0x21, 0x43, 0x72, 0xd6, 0xb4, 0x42, - 
0x26, 0xf0, 0xb9, 0x42, 0x5e, 0x17, 0x02, 0x43, 0x7b, 0x2b, 0xeb, 0x42, 0xdd, 0x00, 0x0c, 0x43, - 0x0d, 0x07, 0x2c, 0x43, 0xef, 0xf1, 0x1f, 0x43, 0xc8, 0xe6, 0x3e, 0x43, 0x27, 0x94, 0x41, 0x43, - 0x1d, 0x29, 0x42, 0x43, 0xd7, 0xa9, 0x1d, 0x43, 0x9b, 0x9b, 0x32, 0x43, 0x5b, 0x4f, 0x26, 0x43, - 0xf1, 0xb6, 0x21, 0x43, 0x4e, 0xc5, 0xc5, 0x42, 0xb5, 0x89, 0xcd, 0x42, 0xca, 0xb4, 0xf2, 0x42, - 0x27, 0xbb, 0xe3, 0x42, 0xcb, 0xa9, 0x02, 0x43, 0xe8, 0xb7, 0x00, 0x43, 0x69, 0xbd, 0x18, 0x43, - 0x97, 0x31, 0x3c, 0x43, 0x8e, 0xb8, 0x41, 0x43, 0x9a, 0x24, 0x42, 0x43, 0x80, 0x71, 0x1a, 0x43, - 0xe9, 0x22, 0x2d, 0x43, 0xcf, 0x2f, 0x1c, 0x43, 0x64, 0x93, 0x1b, 0x43, 0xe6, 0x73, 0xad, 0x42, - 0x22, 0x21, 0xb0, 0x42, 0x3e, 0xfd, 0xf8, 0x42, 0x78, 0xa9, 0xf0, 0x42, 0xfd, 0x66, 0x14, 0x43, - 0x4a, 0xcd, 0x18, 0x43, 0x6f, 0x6b, 0x21, 0x43, 0x46, 0x57, 0x3c, 0x43, 0x61, 0x26, 0x42, 0x43, - 0xf7, 0x97, 0x37, 0x43, 0xe7, 0xf9, 0x1f, 0x43, 0x59, 0x44, 0x27, 0x43, 0xe3, 0xe2, 0x12, 0x43, - 0x1e, 0x8f, 0xee, 0x42, 0x04, 0xca, 0xa9, 0x42, 0xbe, 0x76, 0xd4, 0x42, 0x61, 0x6f, 0x22, 0x43, - 0x95, 0x55, 0x0b, 0x43, 0xdd, 0xef, 0x12, 0x43, 0xf5, 0x95, 0x1d, 0x43, 0x21, 0xab, 0x24, 0x43, - 0xbe, 0x0f, 0x47, 0x43, 0x07, 0xf5, 0x51, 0x43, 0xe2, 0x6c, 0x3c, 0x43, 0x45, 0xa5, 0x1b, 0x43, - 0x14, 0x27, 0x1f, 0x43, 0x9b, 0x6a, 0x10, 0x43, 0x63, 0x9f, 0x0e, 0x43, 0x6a, 0x11, 0x96, 0x42, - 0xd4, 0x1b, 0xe6, 0x42, 0x4f, 0xa2, 0x1c, 0x43, 0x9e, 0x1e, 0x04, 0x43, 0x83, 0x21, 0x12, 0x43, - 0x3a, 0x68, 0x14, 0x43, 0xc8, 0x9a, 0x2d, 0x43, 0x78, 0x8a, 0x41, 0x43, 0xd4, 0xaf, 0x33, 0x43, - 0xfd, 0xfc, 0x1c, 0x43, 0x12, 0x47, 0x04, 0x43, 0x79, 0x1b, 0x04, 0x43, 0x60, 0x5d, 0x0d, 0x43, - 0xf9, 0xd9, 0x26, 0x43, 0x0c, 0xad, 0xb2, 0x42, 0x99, 0x79, 0xcd, 0x42, 0x89, 0x7c, 0x16, 0x43, - 0x12, 0x19, 0x02, 0x43, 0x87, 0x31, 0x09, 0x43, 0xd2, 0x5e, 0x18, 0x43, 0xb1, 0x9d, 0x22, 0x43, - 0xa3, 0x85, 0x29, 0x43, 0x16, 0xef, 0x23, 0x43, 0xbb, 0xe4, 0x02, 0x43, 0x6f, 0x04, 0xe1, 0x42, - 0x7e, 0xe6, 0xeb, 0x42, 0x8e, 0x77, 
0x0d, 0x43, 0xd9, 0x88, 0x19, 0x43, 0xc1, 0xb4, 0xcc, 0x42, - 0xa1, 0xe3, 0xc3, 0x42, 0x4f, 0x4c, 0x1b, 0x43, 0x83, 0x64, 0x12, 0x43, 0x39, 0x24, 0x23, 0x43, - 0x86, 0xb3, 0x17, 0x43, 0xcd, 0x1f, 0x28, 0x43, 0x6b, 0xe6, 0x29, 0x43, 0xe9, 0xc4, 0x26, 0x43, - 0xf2, 0x3a, 0x0a, 0x43, 0xd5, 0xe0, 0x01, 0x43, 0xde, 0x28, 0x0d, 0x43, 0x59, 0xeb, 0x01, 0x43, - 0xa3, 0x0c, 0x22, 0x43, 0x6c, 0x75, 0xb1, 0x42, 0x52, 0x6a, 0xba, 0x42, 0x1a, 0xbb, 0x25, 0x43, - 0xed, 0x1c, 0x1c, 0x43, 0x89, 0xa2, 0x2e, 0x43, 0x71, 0xc3, 0x14, 0x43, 0x5b, 0x24, 0x2c, 0x43, - 0x4d, 0x07, 0x29, 0x43, 0xe6, 0x9b, 0x35, 0x43, 0x79, 0x11, 0x24, 0x43, 0xe7, 0xdd, 0x13, 0x43, - 0x77, 0x57, 0x15, 0x43, 0xd5, 0xe5, 0x19, 0x43, 0xc3, 0x05, 0x3e, 0x43, 0xa9, 0xb0, 0xea, 0x42, - 0xcd, 0x58, 0xae, 0x42, 0xae, 0xa7, 0x26, 0x43, 0xf3, 0xf5, 0x29, 0x43, 0x40, 0x73, 0x1c, 0x43, - 0xe3, 0xf0, 0xfe, 0x42, 0x60, 0xb4, 0x25, 0x43, 0xc7, 0xf9, 0x15, 0x43, 0xb8, 0x11, 0x30, 0x43, - 0xa7, 0x2f, 0x2d, 0x43, 0x05, 0x68, 0x1c, 0x43, 0xe9, 0xfc, 0x2a, 0x43, 0x2f, 0x5f, 0x34, 0x43, - 0xcf, 0xcb, 0x45, 0x43, 0xf2, 0x4d, 0xec, 0x42, 0x43, 0x6f, 0xb8, 0x42, 0x66, 0x50, 0x0c, 0x43, - 0xb5, 0x48, 0x0a, 0x43, 0x58, 0x80, 0x0a, 0x43, 0x6f, 0xb9, 0x03, 0x43, 0xee, 0x18, 0x12, 0x43, - 0x69, 0x67, 0x14, 0x43, 0xc9, 0x6e, 0x2a, 0x43, 0x93, 0xa2, 0x1d, 0x43, 0x37, 0xcf, 0x40, 0x43, - 0x2a, 0x44, 0x38, 0x43, 0x3b, 0x79, 0x3e, 0x43, 0x9f, 0xbb, 0x1d, 0x43, 0x2a, 0xd4, 0xb3, 0x42, - 0xe2, 0x4d, 0xa8, 0x42, 0xd6, 0x40, 0xe4, 0x42, 0x33, 0xf8, 0xf5, 0x42, 0xfc, 0xe7, 0xef, 0x42, - 0x71, 0xab, 0x04, 0x43, 0x9f, 0x94, 0x00, 0x43, 0xfb, 0x6e, 0x02, 0x43, 0x10, 0x52, 0x31, 0x43, - 0x2c, 0x32, 0x2e, 0x43, 0xad, 0xb6, 0x49, 0x43, 0x77, 0xc1, 0x26, 0x43, 0xc3, 0xa6, 0x27, 0x43, - 0xe9, 0x8b, 0x08, 0x43, 0x60, 0xcc, 0xa6, 0x42, 0x3d, 0x16, 0x50, 0x42, 0x82, 0x11, 0x9b, 0x42, - 0xaf, 0xef, 0x9c, 0x42, 0x2a, 0x4e, 0xb4, 0x42, 0xd9, 0xce, 0xad, 0x42, 0x78, 0x21, 0xa5, 0x42, - 0x8c, 0x99, 0xc2, 0x42, 0xe0, 0xf9, 0xf1, 0x42, 0x46, 0x8c, 0xeb, 0x42, 
0xdd, 0x72, 0x0f, 0x43, - 0x90, 0x5d, 0xba, 0x42, 0x19, 0x3a, 0xb8, 0x42, 0x1e, 0x50, 0x81, 0x42, 0xfd, 0xef, 0x6c, 0x42, - 0xeb, 0xa1, 0x40, 0x42, 0x1b, 0x04, 0x97, 0x42, 0x48, 0x55, 0x78, 0x42, 0x48, 0x02, 0xa2, 0x42, - 0x50, 0xe0, 0xc7, 0x42, 0xd2, 0xd3, 0xb7, 0x42, 0x7c, 0x93, 0xc5, 0x42, 0xd1, 0x6c, 0xcf, 0x42, - 0x2a, 0x2e, 0xba, 0x42, 0x32, 0x9f, 0x9c, 0x42, 0xe9, 0xe6, 0xb8, 0x42, 0xf3, 0x43, 0xaa, 0x42, - 0x82, 0xb9, 0xb4, 0x42, 0x09, 0x54, 0x42, 0x42, 0x0a, 0x0e, 0xb8, 0x42, 0xbb, 0x96, 0xd5, 0x42, - 0xdc, 0xda, 0xca, 0x42, 0x71, 0x6f, 0xdf, 0x42, 0x0c, 0x81, 0xfd, 0x42, 0xd3, 0x7f, 0xf6, 0x42, - 0xa8, 0x50, 0x20, 0x43, 0xff, 0x1f, 0x26, 0x43, 0xd1, 0x51, 0x1c, 0x43, 0xef, 0xae, 0xef, 0x42, - 0x85, 0x76, 0x07, 0x43, 0x91, 0x3e, 0x16, 0x43, 0x25, 0x58, 0x0c, 0x43, 0x57, 0x0a, 0x9b, 0x42, - 0x50, 0xe7, 0xc5, 0x42, 0x6a, 0x76, 0xea, 0x42, 0x5a, 0x31, 0xcd, 0x42, 0x1e, 0xdb, 0xed, 0x42, - 0xe5, 0x92, 0x07, 0x43, 0x45, 0x45, 0x19, 0x43, 0x07, 0x27, 0x24, 0x43, 0xfd, 0xb5, 0x26, 0x43, - 0x15, 0x32, 0x21, 0x43, 0xdb, 0x0b, 0x11, 0x43, 0x74, 0x6e, 0x1a, 0x43, 0xc3, 0x08, 0x1b, 0x43, - 0xab, 0x72, 0x1c, 0x43, 0x11, 0x1b, 0xbe, 0x42, 0x08, 0x69, 0xd9, 0x42, 0xf6, 0x0e, 0xf6, 0x42, - 0x8a, 0x0c, 0xc2, 0x42, 0x89, 0x99, 0x01, 0x43, 0xd2, 0xb7, 0xf0, 0x42, 0x5c, 0xba, 0x07, 0x43, - 0xfb, 0xac, 0x28, 0x43, 0x3d, 0xfc, 0x31, 0x43, 0xc2, 0x51, 0x2e, 0x43, 0xb7, 0x06, 0x23, 0x43, - 0x01, 0xdd, 0x14, 0x43, 0x22, 0x6a, 0x18, 0x43, 0xa1, 0x21, 0x07, 0x43, 0x06, 0x45, 0x9f, 0x42, - 0xf1, 0x8d, 0xbc, 0x42, 0x4a, 0x57, 0xe2, 0x42, 0x8d, 0x38, 0xea, 0x42, 0xbb, 0x86, 0x11, 0x43, - 0x16, 0xdf, 0x0a, 0x43, 0xaf, 0x1c, 0x1c, 0x43, 0x79, 0x0b, 0x2d, 0x43, 0x92, 0x90, 0x37, 0x43, - 0x0f, 0x4a, 0x27, 0x43, 0x90, 0x82, 0x15, 0x43, 0x90, 0x8c, 0x07, 0x43, 0xb4, 0x2e, 0x0c, 0x43, - 0xbe, 0xde, 0xfb, 0x42, 0xf8, 0x42, 0x98, 0x42, 0x3a, 0x9e, 0xd5, 0x42, 0x63, 0x07, 0x06, 0x43, - 0x67, 0x8e, 0x02, 0x43, 0x7a, 0x3c, 0xff, 0x42, 0x77, 0x1b, 0xf4, 0x42, 0xdd, 0x00, 0x20, 0x43, - 0x3c, 0x94, 
0x4b, 0x43, 0xd7, 0x51, 0x3f, 0x43, 0x27, 0xe9, 0x38, 0x43, 0x71, 0xfb, 0x06, 0x43, - 0xd3, 0x7e, 0xfe, 0x42, 0x26, 0xcb, 0xf5, 0x42, 0x21, 0x06, 0x0a, 0x43, 0x92, 0xe1, 0x9f, 0x42, - 0xe4, 0x92, 0xda, 0x42, 0x3b, 0x6b, 0x11, 0x43, 0x56, 0x8f, 0xff, 0x42, 0xff, 0x32, 0xf9, 0x42, - 0x08, 0x31, 0x10, 0x43, 0xdf, 0xe4, 0x1a, 0x43, 0x16, 0x29, 0x31, 0x43, 0x91, 0x73, 0x0e, 0x43, - 0x7f, 0x5d, 0x11, 0x43, 0x88, 0xf6, 0xee, 0x42, 0x2a, 0x71, 0x02, 0x43, 0x74, 0x04, 0xfe, 0x42, - 0x15, 0xe0, 0x0c, 0x43, 0x04, 0xb5, 0xc5, 0x42, 0x98, 0x8b, 0xd3, 0x42, 0xfd, 0xa6, 0x04, 0x43, - 0xbe, 0xdf, 0xdf, 0x42, 0xc1, 0xaf, 0x0b, 0x43, 0x98, 0xf1, 0x0a, 0x43, 0xbb, 0x4e, 0x13, 0x43, - 0x3f, 0x60, 0x2f, 0x43, 0x43, 0x2c, 0x19, 0x43, 0xb5, 0xa3, 0x05, 0x43, 0xaf, 0xc0, 0xe4, 0x42, - 0x78, 0x4b, 0xdc, 0x42, 0x02, 0x9b, 0xfb, 0x42, 0xf0, 0xe5, 0x0c, 0x43, 0x04, 0x1b, 0xc4, 0x42, - 0x8f, 0x2d, 0xd0, 0x42, 0xe2, 0x72, 0x0f, 0x43, 0xd7, 0x3c, 0x03, 0x43, 0x16, 0x85, 0x07, 0x43, - 0x24, 0x00, 0x19, 0x43, 0xa6, 0x01, 0x15, 0x43, 0xa7, 0x10, 0x1b, 0x43, 0x6b, 0x13, 0x0e, 0x43, - 0xcf, 0x1d, 0x03, 0x43, 0x85, 0x41, 0xe5, 0x42, 0x94, 0x53, 0xf0, 0x42, 0x3f, 0x5e, 0x05, 0x43, - 0xb7, 0xff, 0x0f, 0x43, 0xb2, 0x43, 0xbd, 0x42, 0xaa, 0x50, 0xd3, 0x42, 0x54, 0x9b, 0x14, 0x43, - 0x58, 0xc1, 0x1c, 0x43, 0x9d, 0xe0, 0x19, 0x43, 0xa4, 0x79, 0x12, 0x43, 0x3f, 0x71, 0x17, 0x43, - 0xf5, 0x90, 0x0b, 0x43, 0xb5, 0x3c, 0x24, 0x43, 0xa5, 0xbe, 0x18, 0x43, 0x34, 0xb1, 0xfa, 0x42, - 0x95, 0xd5, 0x06, 0x43, 0xc1, 0x17, 0x1a, 0x43, 0xbf, 0xf2, 0x20, 0x43, 0x09, 0xb8, 0xd1, 0x42, - 0x7c, 0xb9, 0xd1, 0x42, 0x15, 0x7c, 0x0d, 0x43, 0x38, 0x95, 0x1c, 0x43, 0x0e, 0xa1, 0x11, 0x43, - 0x31, 0x34, 0x09, 0x43, 0xd5, 0x82, 0x0b, 0x43, 0xca, 0xf4, 0x0e, 0x43, 0x5c, 0xa3, 0x1a, 0x43, - 0xbc, 0x2d, 0x11, 0x43, 0x49, 0x76, 0x10, 0x43, 0x70, 0xdf, 0x1f, 0x43, 0xce, 0x47, 0x1b, 0x43, - 0xf7, 0x49, 0x29, 0x43, 0xbc, 0x7f, 0xd8, 0x42, 0x8e, 0xc5, 0xbc, 0x42, 0xe8, 0x4e, 0xf7, 0x42, - 0x92, 0xa7, 0xf0, 0x42, 0x24, 0xc6, 0x05, 0x43, 
0x85, 0x5c, 0xfa, 0x42, 0x75, 0x7d, 0xf8, 0x42, - 0x95, 0x28, 0x0d, 0x43, 0x74, 0x25, 0x1f, 0x43, 0x3d, 0x31, 0x1a, 0x43, 0xbe, 0xe4, 0x24, 0x43, - 0xa6, 0x3a, 0x2b, 0x43, 0x3d, 0x67, 0x2a, 0x43, 0xbf, 0x5c, 0x10, 0x43, 0x56, 0x2b, 0xad, 0x42, - 0xdf, 0x90, 0xb1, 0x42, 0x35, 0x38, 0xdf, 0x42, 0x94, 0xa3, 0xd9, 0x42, 0x43, 0xf1, 0xee, 0x42, - 0x32, 0xbe, 0xe6, 0x42, 0xb5, 0xe3, 0xe2, 0x42, 0x8a, 0x26, 0xf9, 0x42, 0xae, 0xf9, 0x10, 0x43, - 0x04, 0x96, 0x1c, 0x43, 0xb4, 0xf5, 0x34, 0x43, 0x4d, 0x9f, 0x1c, 0x43, 0xe8, 0xcb, 0x0b, 0x43, - 0x7a, 0xe9, 0x05, 0x43, 0x73, 0xf3, 0xa3, 0x42, 0x55, 0x3f, 0x61, 0x42, 0x89, 0xee, 0x83, 0x42, - 0x91, 0x9f, 0x82, 0x42, 0xf6, 0xbf, 0x92, 0x42, 0x3f, 0x8f, 0xa0, 0x42, 0x9c, 0x06, 0xab, 0x42, - 0x02, 0x90, 0xae, 0x42, 0xec, 0x3c, 0xc3, 0x42, 0xb6, 0xaa, 0xd7, 0x42, 0xe7, 0xfc, 0xf4, 0x42, - 0x1f, 0xb0, 0xcd, 0x42, 0x3e, 0xfa, 0xb4, 0x42, 0x2f, 0x68, 0x62, 0x42, 0x45, 0x9f, 0x33, 0x42, - 0xdd, 0xd2, 0x4a, 0x42, 0x06, 0xbd, 0x77, 0x42, 0x8a, 0xdd, 0x72, 0x42, 0x75, 0x3a, 0x93, 0x42, - 0x4c, 0x5e, 0xb1, 0x42, 0x46, 0x09, 0xa2, 0x42, 0x22, 0x31, 0xcc, 0x42, 0x6e, 0xae, 0x9b, 0x42, - 0xde, 0x88, 0xc0, 0x42, 0x66, 0xf0, 0x8b, 0x42, 0xeb, 0xc9, 0xb4, 0x42, 0xf5, 0x8d, 0xb5, 0x42, - 0x8c, 0x1f, 0x9f, 0x42, 0x2e, 0x8b, 0xe3, 0x41, 0xc9, 0x9b, 0xa3, 0x42, 0xee, 0x59, 0xc5, 0x42, - 0x87, 0x9e, 0xc9, 0x42, 0x38, 0x93, 0xdc, 0x42, 0x60, 0x2b, 0xf5, 0x42, 0x88, 0x9e, 0xfa, 0x42, - 0x21, 0xb0, 0x15, 0x43, 0x5e, 0xb2, 0x11, 0x43, 0x9a, 0x24, 0x15, 0x43, 0x1f, 0x5d, 0x01, 0x43, - 0x5b, 0x45, 0x17, 0x43, 0x51, 0x3f, 0x09, 0x43, 0xff, 0xd5, 0x0d, 0x43, 0x93, 0x95, 0x9e, 0x42, - 0x0a, 0x99, 0xaf, 0x42, 0xaf, 0x0a, 0xc8, 0x42, 0x2a, 0x68, 0xd2, 0x42, 0x84, 0x88, 0x0b, 0x43, - 0x6a, 0xde, 0xf8, 0x42, 0x5b, 0xeb, 0x01, 0x43, 0x10, 0xbb, 0x27, 0x43, 0x82, 0x2b, 0x22, 0x43, - 0x62, 0x67, 0x0f, 0x43, 0x13, 0xc4, 0xeb, 0x42, 0x78, 0xd3, 0x08, 0x43, 0x20, 0x2a, 0x11, 0x43, - 0xcc, 0x61, 0x02, 0x43, 0x43, 0x30, 0xa2, 0x42, 0xf2, 0xd5, 0xa7, 0x42, 0xd7, 0x1d, 
0xe5, 0x42, - 0x59, 0xc6, 0xe8, 0x42, 0x68, 0x99, 0xe8, 0x42, 0x18, 0x1a, 0xfe, 0x42, 0xdd, 0x52, 0x0a, 0x43, - 0x91, 0xcd, 0x2b, 0x43, 0xa0, 0xa7, 0x21, 0x43, 0xd1, 0x2a, 0x28, 0x43, 0x7f, 0xb7, 0x01, 0x43, - 0x21, 0x1c, 0x13, 0x43, 0x2f, 0x43, 0x0a, 0x43, 0xb7, 0xda, 0x01, 0x43, 0x36, 0x7b, 0xa2, 0x42, - 0xf1, 0xe7, 0xa6, 0x42, 0x20, 0xec, 0xff, 0x42, 0xc2, 0x7c, 0xff, 0x42, 0x29, 0x9a, 0xf8, 0x42, - 0x17, 0xa9, 0x09, 0x43, 0xb0, 0xdc, 0x14, 0x43, 0x95, 0xfc, 0x34, 0x43, 0x0b, 0x40, 0x25, 0x43, - 0xc5, 0x6d, 0x23, 0x43, 0xb8, 0x09, 0x14, 0x43, 0x10, 0xea, 0xfe, 0x42, 0xf9, 0x97, 0x03, 0x43, - 0x2c, 0xc5, 0xe0, 0x42, 0x32, 0x5a, 0x8c, 0x42, 0x3a, 0xd3, 0xc3, 0x42, 0x92, 0xdf, 0x01, 0x43, - 0x8d, 0x11, 0xe9, 0x42, 0x36, 0x42, 0x19, 0x43, 0xb5, 0x01, 0xee, 0x42, 0xbd, 0x8f, 0x09, 0x43, - 0x60, 0x29, 0x3b, 0x43, 0x17, 0x93, 0x46, 0x43, 0xf2, 0x9b, 0x2f, 0x43, 0xfe, 0x9e, 0x09, 0x43, - 0xab, 0x43, 0xf8, 0x42, 0xaf, 0x19, 0xe1, 0x42, 0x16, 0x06, 0xe6, 0x42, 0x48, 0x21, 0x8c, 0x42, - 0x93, 0x0f, 0xd7, 0x42, 0x96, 0xaa, 0xfb, 0x42, 0x14, 0xed, 0xeb, 0x42, 0xde, 0x34, 0xef, 0x42, - 0xbc, 0xe5, 0x08, 0x43, 0x82, 0x47, 0x0d, 0x43, 0x6b, 0x34, 0x24, 0x43, 0x84, 0x0f, 0x28, 0x43, - 0xf3, 0xa2, 0x1a, 0x43, 0x0a, 0x20, 0xce, 0x42, 0x6c, 0x11, 0xdd, 0x42, 0xa0, 0xd5, 0xf5, 0x42, - 0xd9, 0xe1, 0x05, 0x43, 0x9c, 0x1c, 0xa8, 0x42, 0xfc, 0xd6, 0xc6, 0x42, 0x25, 0xaa, 0x13, 0x43, - 0xb7, 0x4d, 0xe6, 0x42, 0x30, 0x76, 0xe7, 0x42, 0xbf, 0x08, 0x11, 0x43, 0x87, 0x69, 0x15, 0x43, - 0x44, 0xd2, 0x14, 0x43, 0xf5, 0x04, 0x07, 0x43, 0x90, 0xf3, 0x02, 0x43, 0x04, 0xf7, 0xc0, 0x42, - 0x42, 0x9a, 0xd5, 0x42, 0x6a, 0x3e, 0x08, 0x43, 0x14, 0xde, 0x0f, 0x43, 0x2c, 0xd8, 0xc4, 0x42, - 0x29, 0xee, 0xb0, 0x42, 0x54, 0x07, 0x1d, 0x43, 0x47, 0x34, 0x03, 0x43, 0xe4, 0xc0, 0x04, 0x43, - 0xb0, 0x5c, 0x0f, 0x43, 0xb2, 0x46, 0x0a, 0x43, 0xe4, 0x39, 0x19, 0x43, 0x09, 0x52, 0x05, 0x43, - 0xde, 0x55, 0xdf, 0x42, 0x52, 0x08, 0xf6, 0x42, 0x1a, 0x45, 0xfb, 0x42, 0xbe, 0xc2, 0xe6, 0x42, - 0x0b, 0x48, 0x07, 0x43, 
0x79, 0x3f, 0xb9, 0x42, 0x54, 0xfe, 0xd1, 0x42, 0x31, 0xfc, 0x0d, 0x43, - 0x6a, 0x5d, 0x09, 0x43, 0x72, 0x8a, 0x16, 0x43, 0x0c, 0x88, 0x19, 0x43, 0xf1, 0xe6, 0x0f, 0x43, - 0x8a, 0x30, 0x08, 0x43, 0x7f, 0x11, 0x0e, 0x43, 0x47, 0x85, 0xfb, 0x42, 0x9e, 0xf1, 0x10, 0x43, - 0x2a, 0x3b, 0xf1, 0x42, 0x86, 0x5a, 0x0a, 0x43, 0x4b, 0xa1, 0x2c, 0x43, 0x6c, 0x79, 0xcc, 0x42, - 0xe0, 0x36, 0xcb, 0x42, 0xa5, 0xff, 0x20, 0x43, 0xa6, 0xd7, 0x0e, 0x43, 0x63, 0xf4, 0x06, 0x43, - 0x4e, 0xed, 0xed, 0x42, 0xd5, 0xb1, 0x0b, 0x43, 0x70, 0xb7, 0x19, 0x43, 0x85, 0xe2, 0x15, 0x43, - 0x70, 0x6c, 0x0c, 0x43, 0xb7, 0xe7, 0xef, 0x42, 0xb8, 0xe7, 0x1c, 0x43, 0xe7, 0x8d, 0x20, 0x43, - 0x19, 0x1b, 0x36, 0x43, 0x3c, 0x8e, 0xa7, 0x42, 0x58, 0x2f, 0xb4, 0x42, 0x99, 0x9d, 0xfe, 0x42, - 0x92, 0x54, 0xcd, 0x42, 0x78, 0xae, 0x07, 0x43, 0x7c, 0xb1, 0xe2, 0x42, 0x50, 0xfd, 0xf4, 0x42, - 0xdc, 0x2d, 0xea, 0x42, 0x09, 0xe8, 0x19, 0x43, 0xc8, 0xba, 0x08, 0x43, 0x9f, 0x3f, 0x24, 0x43, - 0xc5, 0x00, 0x22, 0x43, 0xcd, 0xc2, 0x1d, 0x43, 0xc6, 0xcc, 0xf9, 0x42, 0xd6, 0xf1, 0xb3, 0x42, - 0xd4, 0xe3, 0xa2, 0x42, 0x14, 0x3e, 0xd2, 0x42, 0x4c, 0x3b, 0xc7, 0x42, 0x8d, 0x73, 0xe3, 0x42, - 0x31, 0x64, 0xd4, 0x42, 0x41, 0x46, 0xfa, 0x42, 0xe9, 0x09, 0xf1, 0x42, 0xb8, 0x4a, 0x0a, 0x43, - 0x85, 0x85, 0x25, 0x43, 0x72, 0xc8, 0x25, 0x43, 0x30, 0xad, 0x19, 0x43, 0xa5, 0x26, 0x0b, 0x43, - 0x69, 0x7e, 0x07, 0x43, 0x6a, 0x5b, 0x87, 0x42, 0xfa, 0x4d, 0x42, 0x42, 0x69, 0x27, 0x8e, 0x42, - 0xa2, 0x41, 0x8e, 0x42, 0x93, 0xe2, 0x99, 0x42, 0x76, 0x0d, 0x9c, 0x42, 0xaa, 0x22, 0x71, 0x42, - 0x70, 0x35, 0xac, 0x42, 0x32, 0x72, 0xdb, 0x42, 0x51, 0x46, 0xc5, 0x42, 0x1c, 0xa6, 0xe3, 0x42, - 0x62, 0x7e, 0xb4, 0x42, 0x20, 0x49, 0x97, 0x42, 0x26, 0xc8, 0x85, 0x42, 0x70, 0xf0, 0x51, 0x42, - 0xf9, 0x0c, 0x28, 0x42, 0x71, 0xb7, 0x84, 0x42, 0x9b, 0xed, 0x7f, 0x42, 0x82, 0x61, 0x83, 0x42, - 0x2d, 0x0b, 0x9c, 0x42, 0xd2, 0xb0, 0x95, 0x42, 0xee, 0x4a, 0xb5, 0x42, 0x82, 0x8f, 0xa8, 0x42, - 0x8d, 0x76, 0xd1, 0x42, 0x33, 0x2f, 0x7b, 0x42, 0x1f, 0x4d, 
0x92, 0x42, 0x29, 0x30, 0xbc, 0x42, - 0x1c, 0xa4, 0x8d, 0x42, 0x91, 0x0c, 0x2c, 0x42, 0x87, 0x35, 0xc9, 0x42, 0x0a, 0x01, 0xdf, 0x42, - 0x0e, 0x98, 0xa0, 0x42, 0x53, 0xdb, 0xcb, 0x42, 0x91, 0x12, 0x0a, 0x43, 0xc0, 0x39, 0x06, 0x43, - 0x8b, 0xe9, 0x07, 0x43, 0x3d, 0x64, 0x00, 0x43, 0x06, 0xba, 0x11, 0x43, 0x40, 0xd4, 0x0e, 0x43, - 0xa1, 0xc9, 0x00, 0x43, 0xb2, 0xf3, 0x03, 0x43, 0x54, 0xaa, 0x0e, 0x43, 0x3b, 0x6f, 0xd1, 0x42, - 0xa1, 0x9a, 0x9f, 0x42, 0x00, 0xd3, 0xff, 0x42, 0x92, 0x6e, 0xd1, 0x42, 0x85, 0x6b, 0xfa, 0x42, - 0xe9, 0xaa, 0xfb, 0x42, 0x74, 0xd0, 0x09, 0x43, 0xc6, 0x3b, 0x1f, 0x43, 0xa2, 0xd1, 0x20, 0x43, - 0x92, 0xd2, 0x1b, 0x43, 0x29, 0x0a, 0x04, 0x43, 0xbb, 0x7f, 0x0e, 0x43, 0xdb, 0x50, 0x16, 0x43, - 0xb3, 0x0d, 0x15, 0x43, 0x79, 0xcc, 0xb2, 0x42, 0xb4, 0xdb, 0xbd, 0x42, 0xe2, 0xad, 0xfb, 0x42, - 0xab, 0xed, 0xdd, 0x42, 0x91, 0x1c, 0x00, 0x43, 0x6f, 0x47, 0x06, 0x43, 0xe5, 0x5f, 0xf2, 0x42, - 0x5e, 0xb6, 0x2d, 0x43, 0xd0, 0xd3, 0x2e, 0x43, 0x03, 0x5a, 0x39, 0x43, 0xe3, 0x42, 0xe7, 0x42, - 0xcc, 0xa5, 0x1e, 0x43, 0x1e, 0xd5, 0x15, 0x43, 0xbe, 0x72, 0x16, 0x43, 0x84, 0x09, 0xa7, 0x42, - 0x36, 0xcf, 0xb2, 0x42, 0x98, 0x87, 0xe7, 0x42, 0x63, 0xd3, 0xd8, 0x42, 0xca, 0x1a, 0xf8, 0x42, - 0xba, 0xf3, 0x04, 0x43, 0x4b, 0x0c, 0x08, 0x43, 0xb2, 0x6d, 0x3d, 0x43, 0xa3, 0x8c, 0x34, 0x43, - 0x7c, 0x80, 0x26, 0x43, 0x05, 0x15, 0xf7, 0x42, 0x63, 0xa1, 0x13, 0x43, 0xfe, 0x4d, 0x1a, 0x43, - 0xa8, 0x79, 0x02, 0x43, 0x2c, 0x88, 0x94, 0x42, 0x25, 0x7a, 0xc0, 0x42, 0xe8, 0x0d, 0x03, 0x43, - 0x6b, 0x0c, 0xcb, 0x42, 0x7f, 0x29, 0xfa, 0x42, 0xf6, 0x99, 0xf9, 0x42, 0x4c, 0xec, 0x08, 0x43, - 0x33, 0x44, 0x2f, 0x43, 0xe6, 0x9f, 0x2d, 0x43, 0xb8, 0xa9, 0x2b, 0x43, 0x16, 0x06, 0x05, 0x43, - 0x8f, 0x45, 0x0e, 0x43, 0x94, 0x41, 0x07, 0x43, 0x63, 0x85, 0xf9, 0x42, 0xe3, 0x46, 0xaf, 0x42, - 0x15, 0x1b, 0xcf, 0x42, 0x0e, 0x81, 0x0b, 0x43, 0xb1, 0x0c, 0xf2, 0x42, 0xbf, 0x90, 0xf7, 0x42, - 0x74, 0x1b, 0xf7, 0x42, 0x45, 0xf6, 0x21, 0x43, 0xd4, 0x1f, 0x36, 0x43, 0x75, 0xbb, 0x2d, 0x43, - 
0xd8, 0x8d, 0x18, 0x43, 0xd9, 0x94, 0xe6, 0x42, 0xb4, 0x9c, 0xfd, 0x42, 0x73, 0x68, 0xef, 0x42, - 0x2a, 0xa1, 0x07, 0x43, 0x61, 0xff, 0xb3, 0x42, 0xb1, 0x27, 0xc7, 0x42, 0xf3, 0x17, 0x04, 0x43, - 0x23, 0xf9, 0xd1, 0x42, 0xfc, 0x13, 0xde, 0x42, 0xed, 0x10, 0x1a, 0x43, 0x24, 0x1a, 0x0d, 0x43, - 0x5b, 0xe3, 0x1c, 0x43, 0x62, 0x8c, 0x1f, 0x43, 0x20, 0xc3, 0xfd, 0x42, 0x21, 0x8b, 0xc9, 0x42, - 0x6e, 0xd4, 0xfe, 0x42, 0x64, 0xba, 0x02, 0x43, 0x64, 0xd9, 0x04, 0x43, 0x51, 0x5e, 0xb9, 0x42, - 0x0d, 0xa3, 0xd7, 0x42, 0xf9, 0x50, 0x08, 0x43, 0x09, 0x9c, 0x0c, 0x43, 0xcf, 0x1e, 0x02, 0x43, - 0x87, 0xfa, 0x05, 0x43, 0x45, 0xb9, 0xf1, 0x42, 0x34, 0x9b, 0x0c, 0x43, 0xa2, 0x3b, 0x13, 0x43, - 0x30, 0x44, 0xec, 0x42, 0xd0, 0xd2, 0xc9, 0x42, 0xd0, 0xb9, 0xd6, 0x42, 0x58, 0x42, 0x08, 0x43, - 0x86, 0xc7, 0x08, 0x43, 0x59, 0x14, 0xb4, 0x42, 0x36, 0x6c, 0xd1, 0x42, 0xd6, 0xed, 0x0a, 0x43, - 0x73, 0xb5, 0x1c, 0x43, 0x04, 0x9e, 0x2b, 0x43, 0x0a, 0xd6, 0x00, 0x43, 0x94, 0xd0, 0x11, 0x43, - 0x62, 0xd9, 0x03, 0x43, 0xa8, 0x01, 0x12, 0x43, 0x5c, 0x9c, 0x0f, 0x43, 0x29, 0xac, 0x13, 0x43, - 0x9e, 0x06, 0xed, 0x42, 0x9e, 0xe6, 0xf3, 0x42, 0x8c, 0x5d, 0x22, 0x43, 0x56, 0x3a, 0xdd, 0x42, - 0x63, 0x97, 0xa0, 0x42, 0x63, 0xa8, 0x16, 0x43, 0x62, 0xac, 0x19, 0x43, 0x58, 0x5b, 0x25, 0x43, - 0xf4, 0x25, 0xff, 0x42, 0x32, 0x04, 0x17, 0x43, 0x5a, 0x67, 0x1a, 0x43, 0x02, 0x75, 0x17, 0x43, - 0xd5, 0x6a, 0x14, 0x43, 0x60, 0x44, 0x06, 0x43, 0x81, 0xf5, 0x25, 0x43, 0x96, 0x17, 0x25, 0x43, - 0x70, 0x61, 0x2c, 0x43, 0xdf, 0xcb, 0xd1, 0x42, 0xf9, 0x9c, 0xb0, 0x42, 0xf4, 0x2e, 0x0a, 0x43, - 0xaf, 0x0e, 0xd0, 0x42, 0x3a, 0x38, 0x01, 0x43, 0x10, 0xb6, 0xea, 0x42, 0x3e, 0x69, 0x05, 0x43, - 0x37, 0x9f, 0xf8, 0x42, 0x2b, 0x84, 0x16, 0x43, 0x5a, 0x22, 0x06, 0x43, 0x2f, 0xae, 0x1c, 0x43, - 0x32, 0x7e, 0x1f, 0x43, 0x6e, 0x54, 0x29, 0x43, 0x99, 0xf0, 0x18, 0x43, 0xb0, 0xd4, 0xe7, 0x42, - 0x74, 0x96, 0xa1, 0x42, 0x92, 0x06, 0xe8, 0x42, 0x3d, 0xc4, 0xd5, 0x42, 0x81, 0x8c, 0xda, 0x42, - 0x0a, 0x31, 0xcf, 0x42, 0xfd, 0x1b, 
0xee, 0x42, 0x96, 0xdd, 0xec, 0x42, 0x70, 0xcc, 0x11, 0x43, - 0x5f, 0x09, 0x17, 0x43, 0xea, 0xdf, 0x2b, 0x43, 0xeb, 0x0e, 0x1e, 0x43, 0xea, 0xab, 0x1f, 0x43, - 0x59, 0xf1, 0xf9, 0x42, 0xf3, 0x5f, 0xbe, 0x42, 0x3f, 0xb9, 0x4f, 0x42, 0x7e, 0x74, 0xae, 0x42, - 0x8f, 0x9e, 0xa0, 0x42, 0xa4, 0x7e, 0xac, 0x42, 0xe5, 0x59, 0xa4, 0x42, 0x99, 0xe1, 0x8d, 0x42, - 0x1c, 0x35, 0xbb, 0x42, 0x1c, 0x02, 0xe1, 0x42, 0xe1, 0xcc, 0xe9, 0x42, 0xd1, 0xcb, 0x00, 0x43, - 0xe4, 0xe0, 0xcb, 0x42, 0xcd, 0xc2, 0xc5, 0x42, 0x73, 0x0d, 0x88, 0x42, 0x46, 0xdc, 0x24, 0x42, - 0xcb, 0xe2, 0x50, 0x42, 0x89, 0x2e, 0xa3, 0x42, 0xb7, 0x8a, 0x94, 0x42, 0x4d, 0x4e, 0xa8, 0x42, - 0x6d, 0x30, 0xbd, 0x42, 0xe3, 0x45, 0xca, 0x42, 0xef, 0xf9, 0xdf, 0x42, 0xd2, 0x71, 0xd3, 0x42, - 0x47, 0x08, 0xd2, 0x42, 0xef, 0xdc, 0xb4, 0x42, 0xe1, 0x3b, 0xd6, 0x42, 0xcb, 0x03, 0xc4, 0x42, - 0x6b, 0x20, 0xc6, 0x42, 0xa1, 0xd5, 0x60, 0x42, 0xd5, 0x5f, 0x9d, 0x42, 0xf2, 0x11, 0x05, 0x43, - 0xb5, 0xc1, 0xeb, 0x42, 0xa2, 0x87, 0x02, 0x43, 0x49, 0x2e, 0x0f, 0x43, 0x7e, 0x2a, 0x12, 0x43, - 0xa1, 0x35, 0x25, 0x43, 0xf2, 0x36, 0x1a, 0x43, 0xfc, 0xb0, 0x36, 0x43, 0x0c, 0x54, 0xfa, 0x42, - 0xd2, 0x74, 0x1f, 0x43, 0x55, 0xdb, 0x18, 0x43, 0xa9, 0x01, 0x28, 0x43, 0x3e, 0xa5, 0xc6, 0x42, - 0xdf, 0x25, 0xd5, 0x42, 0x09, 0x24, 0x05, 0x43, 0x1a, 0xd2, 0xbe, 0x42, 0xd8, 0xe1, 0x01, 0x43, - 0xfa, 0x7d, 0x19, 0x43, 0x4d, 0x0d, 0x1c, 0x43, 0xf8, 0x44, 0x38, 0x43, 0xe1, 0xa1, 0x30, 0x43, - 0x85, 0x73, 0x32, 0x43, 0x2a, 0x53, 0x1d, 0x43, 0xb3, 0x09, 0x32, 0x43, 0xa2, 0x2f, 0x1a, 0x43, - 0xd3, 0x67, 0x28, 0x43, 0xc9, 0xcf, 0xd2, 0x42, 0x42, 0xe2, 0xca, 0x42, 0x2b, 0xcf, 0x08, 0x43, - 0x6d, 0x71, 0xea, 0x42, 0xb2, 0xd6, 0x19, 0x43, 0x33, 0x65, 0x13, 0x43, 0x9f, 0xab, 0x11, 0x43, - 0xc5, 0x0b, 0x32, 0x43, 0xbd, 0x93, 0x3f, 0x43, 0x5f, 0x2e, 0x32, 0x43, 0xd8, 0x30, 0x26, 0x43, - 0xf2, 0xd3, 0x2e, 0x43, 0xfe, 0x6d, 0x1f, 0x43, 0x99, 0xb9, 0x21, 0x43, 0xde, 0x4f, 0xdb, 0x42, - 0xfb, 0x46, 0xd9, 0x42, 0xed, 0xc1, 0x0a, 0x43, 0xe6, 0xbd, 0xfb, 0x42, 
0xa2, 0xf0, 0x10, 0x43, - 0x97, 0xa9, 0x0c, 0x43, 0x9e, 0x3d, 0x1c, 0x43, 0x3b, 0xb2, 0x3c, 0x43, 0xf3, 0x04, 0x4e, 0x43, - 0xd7, 0x24, 0x40, 0x43, 0x79, 0x1c, 0x24, 0x43, 0x24, 0x3b, 0x27, 0x43, 0x68, 0xaf, 0x07, 0x43, - 0x03, 0x44, 0x11, 0x43, 0x4b, 0x14, 0xc6, 0x42, 0x39, 0xcd, 0xd2, 0x42, 0x05, 0x7c, 0x15, 0x43, - 0x98, 0xe0, 0x00, 0x43, 0x55, 0xa8, 0x1c, 0x43, 0x15, 0xe6, 0x09, 0x43, 0xcf, 0x2e, 0x16, 0x43, - 0x16, 0xb4, 0x48, 0x43, 0x0e, 0x33, 0x4f, 0x43, 0xb7, 0x9b, 0x47, 0x43, 0xf3, 0x4d, 0x24, 0x43, - 0x80, 0x97, 0x12, 0x43, 0x11, 0x30, 0x0f, 0x43, 0x55, 0x78, 0x11, 0x43, 0xcb, 0xb4, 0xdd, 0x42, - 0xd2, 0xd8, 0xfa, 0x42, 0x75, 0xe7, 0x1d, 0x43, 0x95, 0xfa, 0x0b, 0x43, 0xe6, 0x7d, 0x17, 0x43, - 0xe5, 0x54, 0x18, 0x43, 0xba, 0xc6, 0x1d, 0x43, 0x76, 0x6a, 0x44, 0x43, 0x85, 0xf0, 0x41, 0x43, - 0x3b, 0xee, 0x20, 0x43, 0x6d, 0x49, 0x0d, 0x43, 0x55, 0x9d, 0x05, 0x43, 0x62, 0x36, 0x06, 0x43, - 0x05, 0x0b, 0x1a, 0x43, 0xb9, 0x06, 0xca, 0x42, 0x7a, 0x0a, 0xdf, 0x42, 0x7a, 0x01, 0x13, 0x43, - 0xba, 0x30, 0x06, 0x43, 0x0e, 0xfa, 0x16, 0x43, 0x4c, 0x14, 0x1f, 0x43, 0x05, 0xa5, 0x10, 0x43, - 0x94, 0x27, 0x2a, 0x43, 0x81, 0x83, 0x30, 0x43, 0x3c, 0xfd, 0x0c, 0x43, 0xcb, 0x09, 0x08, 0x43, - 0xf6, 0x56, 0xf6, 0x42, 0x73, 0x90, 0x11, 0x43, 0xf3, 0xab, 0x30, 0x43, 0xd9, 0x89, 0xee, 0x42, - 0x1d, 0xbf, 0xce, 0x42, 0xc5, 0x12, 0x13, 0x43, 0xed, 0x7f, 0x19, 0x43, 0xfb, 0xda, 0x0f, 0x43, - 0x18, 0xfd, 0x11, 0x43, 0xc8, 0xbf, 0x26, 0x43, 0x5b, 0xa8, 0x27, 0x43, 0xf2, 0xbf, 0x1c, 0x43, - 0xf5, 0xa2, 0x0d, 0x43, 0x73, 0xa5, 0x08, 0x43, 0x80, 0x39, 0x05, 0x43, 0x05, 0x12, 0x12, 0x43, - 0xcb, 0x6b, 0x23, 0x43, 0x46, 0x10, 0xd4, 0x42, 0x35, 0x30, 0xce, 0x42, 0x93, 0x17, 0x3d, 0x43, - 0x6b, 0xac, 0x2b, 0x43, 0x1d, 0xa9, 0x32, 0x43, 0x71, 0x82, 0x14, 0x43, 0x84, 0x93, 0x29, 0x43, - 0xe3, 0x91, 0x21, 0x43, 0x35, 0x12, 0x29, 0x43, 0x1b, 0xaf, 0x21, 0x43, 0xd9, 0xb9, 0x18, 0x43, - 0xa0, 0x54, 0x0d, 0x43, 0x9e, 0xe4, 0x10, 0x43, 0x67, 0x1f, 0x2e, 0x43, 0x73, 0xe2, 0xf4, 0x42, - 0xcd, 0xe6, 
0xd0, 0x42, 0xa7, 0xd5, 0x26, 0x43, 0xf3, 0xd9, 0x28, 0x43, 0x22, 0x97, 0x25, 0x43, - 0xfb, 0x22, 0x11, 0x43, 0x57, 0x03, 0x2b, 0x43, 0x07, 0x57, 0x18, 0x43, 0x5a, 0xf6, 0x2a, 0x43, - 0xcb, 0xc6, 0x21, 0x43, 0xcd, 0xd5, 0x21, 0x43, 0xbd, 0x9c, 0x27, 0x43, 0x73, 0x85, 0x31, 0x43, - 0x11, 0xa6, 0x3f, 0x43, 0xa6, 0x67, 0xf4, 0x42, 0x75, 0x46, 0xb9, 0x42, 0x28, 0x3c, 0x0b, 0x43, - 0x45, 0x9b, 0x0d, 0x43, 0x80, 0x23, 0x07, 0x43, 0x7a, 0x05, 0x11, 0x43, 0x44, 0x96, 0x1b, 0x43, - 0x15, 0x7d, 0x14, 0x43, 0x8b, 0x6c, 0x23, 0x43, 0xa3, 0xa5, 0x23, 0x43, 0x1b, 0x40, 0x2c, 0x43, - 0x91, 0x0a, 0x41, 0x43, 0xca, 0xa0, 0x41, 0x43, 0x75, 0x1a, 0x2a, 0x43, 0xb5, 0xd4, 0xe1, 0x42, - 0xba, 0x35, 0xb6, 0x42, 0x47, 0xc1, 0xf1, 0x42, 0xb0, 0x87, 0x06, 0x43, 0x6b, 0xd8, 0xdb, 0x42, - 0x39, 0x4a, 0xf9, 0x42, 0xad, 0x71, 0x00, 0x43, 0x5c, 0x4a, 0x0c, 0x43, 0xc3, 0xfb, 0x2c, 0x43, - 0xce, 0x20, 0x2b, 0x43, 0x7b, 0xd9, 0x3e, 0x43, 0xa3, 0x84, 0x29, 0x43, 0xa3, 0x7e, 0x33, 0x43, - 0xb5, 0x19, 0xf9, 0x42, 0x78, 0xfe, 0xbd, 0x42, 0x1f, 0x05, 0x88, 0x42, 0xc7, 0xea, 0x9f, 0x42, - 0xb8, 0xd3, 0xa1, 0x42, 0x63, 0xfe, 0xb6, 0x42, 0xb8, 0xe3, 0xba, 0x42, 0x3d, 0x8c, 0xc1, 0x42, - 0xfd, 0x7c, 0xc3, 0x42, 0xf0, 0xbd, 0xee, 0x42, 0xf2, 0x24, 0xeb, 0x42, 0xac, 0xe5, 0x0b, 0x43, - 0x79, 0xd6, 0xf6, 0x42, 0x9f, 0x33, 0xd6, 0x42, 0x85, 0x8c, 0xae, 0x42, 0x05, 0x1f, 0x56, 0x42, - 0xfc, 0xf8, 0x45, 0x42, 0x2d, 0x44, 0x80, 0x42, 0xb6, 0x40, 0x81, 0x42, 0x15, 0xf5, 0xab, 0x42, - 0x7a, 0x10, 0xb7, 0x42, 0x64, 0x7c, 0xc9, 0x42, 0x7f, 0x59, 0xcc, 0x42, 0xfe, 0x04, 0xd3, 0x42, - 0x6f, 0x8e, 0xd8, 0x42, 0xf8, 0x43, 0x97, 0x42, 0x5d, 0x88, 0xdb, 0x42, 0x23, 0x6d, 0xa4, 0x42, - 0x0d, 0x82, 0xa0, 0x42, 0xa1, 0x11, 0x73, 0x42, 0x1d, 0x1d, 0xbc, 0x42, 0x55, 0x0f, 0xd6, 0x42, - 0xbb, 0x1d, 0xbc, 0x42, 0x05, 0xcd, 0xf9, 0x42, 0xe9, 0xd3, 0x0c, 0x43, 0x32, 0xaf, 0xf1, 0x42, - 0xd6, 0xe5, 0x0f, 0x43, 0x70, 0x58, 0x20, 0x43, 0xb2, 0xea, 0x1c, 0x43, 0xcc, 0x61, 0xf1, 0x42, - 0x82, 0x89, 0x13, 0x43, 0x1a, 0x58, 0x1d, 0x43, 
0xc8, 0xa4, 0x14, 0x43, 0xa2, 0xbb, 0xaa, 0x42, - 0x4d, 0x92, 0xd0, 0x42, 0xa1, 0xf8, 0xdc, 0x42, 0x19, 0x3e, 0xe0, 0x42, 0x81, 0xc7, 0xfb, 0x42, - 0x06, 0xf0, 0x15, 0x43, 0x3a, 0x91, 0x23, 0x43, 0x84, 0x89, 0x27, 0x43, 0xf5, 0x80, 0x0a, 0x43, - 0xf4, 0xdb, 0x15, 0x43, 0x85, 0x53, 0xfa, 0x42, 0x44, 0xf5, 0x18, 0x43, 0x96, 0xc6, 0x13, 0x43, - 0x0a, 0xac, 0x1a, 0x43, 0x80, 0xc8, 0xe1, 0x42, 0xf3, 0x5e, 0xc9, 0x42, 0x3a, 0x03, 0x07, 0x43, - 0x66, 0x58, 0x04, 0x43, 0xe7, 0xde, 0xfc, 0x42, 0x7e, 0x1f, 0x09, 0x43, 0x4e, 0x3e, 0x06, 0x43, - 0x24, 0xf3, 0x3a, 0x43, 0xe8, 0x34, 0x3b, 0x43, 0xa6, 0x57, 0x27, 0x43, 0xda, 0x29, 0x17, 0x43, - 0x1e, 0x05, 0x1a, 0x43, 0xfc, 0x6c, 0x1d, 0x43, 0x5a, 0x36, 0x0d, 0x43, 0x5d, 0x21, 0xad, 0x42, - 0x1b, 0xbc, 0xc5, 0x42, 0x3a, 0xf2, 0x06, 0x43, 0xe3, 0xa1, 0xe5, 0x42, 0x26, 0x4d, 0x0e, 0x43, - 0x87, 0xf9, 0x09, 0x43, 0x06, 0x17, 0x22, 0x43, 0x32, 0xb5, 0x16, 0x43, 0x8e, 0xfb, 0x3a, 0x43, - 0xac, 0x56, 0x2d, 0x43, 0x6a, 0xa4, 0x21, 0x43, 0xb8, 0xce, 0x17, 0x43, 0xfc, 0xb6, 0x16, 0x43, - 0x21, 0x43, 0xfa, 0x42, 0xf2, 0x0e, 0xc1, 0x42, 0xb7, 0x78, 0xd5, 0x42, 0xbc, 0x63, 0x18, 0x43, - 0x24, 0x7f, 0xf8, 0x42, 0x4c, 0xe5, 0xfa, 0x42, 0xcb, 0xea, 0xf9, 0x42, 0x10, 0x9b, 0x1d, 0x43, - 0xae, 0xab, 0x3b, 0x43, 0xf6, 0x37, 0x48, 0x43, 0x5c, 0x32, 0x4a, 0x43, 0xd8, 0x00, 0x1b, 0x43, - 0xb2, 0x6a, 0x0e, 0x43, 0xba, 0x72, 0x10, 0x43, 0xe4, 0x44, 0x0f, 0x43, 0x7b, 0x01, 0xbb, 0x42, - 0xae, 0x87, 0xc8, 0x42, 0x8a, 0x44, 0x0e, 0x43, 0x72, 0x14, 0x0b, 0x43, 0x81, 0xd5, 0xf5, 0x42, - 0xda, 0xa7, 0x0f, 0x43, 0xa2, 0xd3, 0x18, 0x43, 0x12, 0x9d, 0x38, 0x43, 0x02, 0xec, 0x1a, 0x43, - 0xe0, 0x18, 0x0f, 0x43, 0xd6, 0xf2, 0xfd, 0x42, 0x80, 0x18, 0x0d, 0x43, 0xd8, 0xb7, 0x03, 0x43, - 0x0a, 0xb9, 0x16, 0x43, 0x21, 0xe3, 0xd6, 0x42, 0x1a, 0xb3, 0xbe, 0x42, 0x92, 0x98, 0x1d, 0x43, - 0xbd, 0x89, 0x0b, 0x43, 0x28, 0x2e, 0x07, 0x43, 0x92, 0x68, 0x0e, 0x43, 0x76, 0x9d, 0x2b, 0x43, - 0xe0, 0xaa, 0x2f, 0x43, 0xa4, 0xde, 0x20, 0x43, 0x56, 0x2c, 0x1c, 0x43, 0x93, 0xff, 
0xe9, 0x42, - 0x93, 0x4f, 0xf3, 0x42, 0x96, 0x8f, 0x02, 0x43, 0xe4, 0xe2, 0x0f, 0x43, 0xa9, 0xac, 0xdb, 0x42, - 0x95, 0x97, 0xbf, 0x42, 0xc4, 0x2c, 0x25, 0x43, 0x92, 0x06, 0x17, 0x43, 0x40, 0x91, 0x08, 0x43, - 0x54, 0x83, 0x1d, 0x43, 0x84, 0x6d, 0x1c, 0x43, 0xa6, 0xc6, 0x1e, 0x43, 0x4a, 0xc9, 0x09, 0x43, - 0x88, 0x73, 0xfb, 0x42, 0xe4, 0x34, 0x12, 0x43, 0x36, 0xba, 0x16, 0x43, 0x12, 0xd1, 0x06, 0x43, - 0x42, 0xa3, 0x10, 0x43, 0xef, 0x33, 0xd8, 0x42, 0x88, 0x37, 0xd4, 0x42, 0xf6, 0x01, 0x28, 0x43, - 0x98, 0xe0, 0x0e, 0x43, 0xfa, 0xd4, 0x20, 0x43, 0x7a, 0xc9, 0x10, 0x43, 0xd4, 0x22, 0x29, 0x43, - 0x08, 0x45, 0x21, 0x43, 0x14, 0x40, 0x30, 0x43, 0xa6, 0x71, 0x22, 0x43, 0xea, 0x06, 0x10, 0x43, - 0xe4, 0xfc, 0x08, 0x43, 0x50, 0xb9, 0x14, 0x43, 0xba, 0x24, 0x2e, 0x43, 0x8f, 0xa3, 0xf1, 0x42, - 0xe9, 0x0f, 0xb3, 0x42, 0x8c, 0x78, 0x1a, 0x43, 0x5e, 0x49, 0x2e, 0x43, 0x0c, 0x1f, 0x30, 0x43, - 0x7c, 0x12, 0x09, 0x43, 0x4a, 0x21, 0x18, 0x43, 0x6a, 0x02, 0x1c, 0x43, 0xde, 0x87, 0x1a, 0x43, - 0xae, 0x69, 0x20, 0x43, 0xd2, 0xf4, 0x06, 0x43, 0xd2, 0x50, 0x22, 0x43, 0xfe, 0x1e, 0x2f, 0x43, - 0xac, 0x57, 0x28, 0x43, 0x55, 0xb9, 0xce, 0x42, 0x9a, 0x05, 0xc5, 0x42, 0xa1, 0x81, 0xf7, 0x42, - 0xf6, 0x4e, 0xeb, 0x42, 0xbc, 0xf8, 0x18, 0x43, 0xe2, 0x01, 0x02, 0x43, 0xe6, 0xb1, 0x19, 0x43, - 0x92, 0x84, 0x16, 0x43, 0xa4, 0x0d, 0x24, 0x43, 0x72, 0xa6, 0x1a, 0x43, 0x4c, 0x4b, 0x26, 0x43, - 0x40, 0x68, 0x34, 0x43, 0xb0, 0x77, 0x45, 0x43, 0xc2, 0xaa, 0x16, 0x43, 0x2c, 0x45, 0xc2, 0x42, - 0xc7, 0x6d, 0xc5, 0x42, 0x02, 0x48, 0xdd, 0x42, 0xcb, 0xa9, 0xf2, 0x42, 0xc3, 0xc1, 0xef, 0x42, - 0x3e, 0x4e, 0xff, 0x42, 0x87, 0x27, 0xde, 0x42, 0xb6, 0x7f, 0x00, 0x43, 0x36, 0x5b, 0x2a, 0x43, - 0xd8, 0x7b, 0x20, 0x43, 0x64, 0xa4, 0x2e, 0x43, 0xfe, 0xcf, 0x20, 0x43, 0xfe, 0x62, 0x16, 0x43, - 0x06, 0x1d, 0x20, 0x43, 0x87, 0xce, 0xa6, 0x42, 0x9c, 0x57, 0x7c, 0x42, 0x65, 0xa3, 0x9a, 0x42, - 0xe5, 0x96, 0xa5, 0x42, 0xf1, 0x25, 0xbc, 0x42, 0x6b, 0x38, 0xc8, 0x42, 0x3b, 0x7c, 0xaa, 0x42, - 0x99, 0x9e, 0xc9, 0x42, 
0xd9, 0x41, 0xee, 0x42, 0xc6, 0x2c, 0x01, 0x43, 0xd3, 0x25, 0x0d, 0x43, - 0xcc, 0x93, 0xdd, 0x42, 0xf9, 0xa5, 0xa9, 0x42, 0x6d, 0x3b, 0x8b, 0x42, 0xff, 0xb0, 0x80, 0x42, - 0x17, 0x80, 0x36, 0x42, 0x79, 0x25, 0x87, 0x42, 0x12, 0xc8, 0x64, 0x42, 0x21, 0x02, 0x9a, 0x42, - 0x68, 0xc2, 0xba, 0x42, 0x36, 0x67, 0xb2, 0x42, 0x86, 0xd6, 0xb8, 0x42, 0xbf, 0xcc, 0xab, 0x42, - 0xba, 0xad, 0xb7, 0x42, 0x25, 0x9f, 0x87, 0x42, 0xf6, 0xe1, 0x95, 0x42, 0xc6, 0x1a, 0xbd, 0x42, - 0xa6, 0xce, 0x9f, 0x42, 0x4a, 0xa0, 0x4d, 0x42, 0x4f, 0xf0, 0x93, 0x42, 0xcf, 0x5b, 0xc6, 0x42, - 0xae, 0x87, 0xc7, 0x42, 0x99, 0xb9, 0xd9, 0x42, 0xda, 0xbf, 0xfd, 0x42, 0x58, 0x8a, 0xe9, 0x42, - 0x2e, 0x11, 0x0d, 0x43, 0x89, 0xbe, 0x13, 0x43, 0xbb, 0x88, 0x15, 0x43, 0x7b, 0x9e, 0xea, 0x42, - 0x0b, 0xf5, 0x0d, 0x43, 0xed, 0x16, 0x10, 0x43, 0x3a, 0x7b, 0x10, 0x43, 0x62, 0xdb, 0xbb, 0x42, - 0xdc, 0x1b, 0xaa, 0x42, 0x36, 0x29, 0xe1, 0x42, 0x8a, 0xaf, 0x9b, 0x42, 0xe0, 0x69, 0xe3, 0x42, - 0x38, 0xe8, 0xf7, 0x42, 0xc1, 0x3e, 0x09, 0x43, 0x98, 0xa9, 0x1f, 0x43, 0x41, 0x1d, 0x1e, 0x43, - 0x40, 0x7d, 0x0f, 0x43, 0x90, 0x94, 0x08, 0x43, 0x1e, 0xf8, 0x01, 0x43, 0x16, 0x53, 0x16, 0x43, - 0x3e, 0xc2, 0x15, 0x43, 0x10, 0x86, 0xb0, 0x42, 0x4b, 0x74, 0xb3, 0x42, 0x40, 0x30, 0xea, 0x42, - 0x30, 0x20, 0xc0, 0x42, 0xce, 0xe8, 0xfa, 0x42, 0xf2, 0xbc, 0xe7, 0x42, 0xa0, 0xf9, 0x02, 0x43, - 0x9c, 0xb5, 0x2a, 0x43, 0x56, 0xa6, 0x2f, 0x43, 0xf4, 0xf8, 0x35, 0x43, 0x42, 0x97, 0x0c, 0x43, - 0x61, 0x64, 0x05, 0x43, 0xa9, 0x61, 0x18, 0x43, 0xf1, 0x9e, 0x04, 0x43, 0x9f, 0xfe, 0xa1, 0x42, - 0x8f, 0xb6, 0x8a, 0x42, 0x3c, 0x0d, 0xde, 0x42, 0xff, 0x42, 0xde, 0x42, 0x72, 0x2a, 0xf4, 0x42, - 0x45, 0xea, 0x0b, 0x43, 0x9c, 0xc5, 0x04, 0x43, 0xa6, 0x39, 0x21, 0x43, 0x01, 0x34, 0x2e, 0x43, - 0xbd, 0x9d, 0x29, 0x43, 0x19, 0xed, 0x10, 0x43, 0x64, 0x2a, 0x11, 0x43, 0xcc, 0xbe, 0x06, 0x43, - 0xa2, 0x46, 0xeb, 0x42, 0xc8, 0xbc, 0x9a, 0x42, 0x7e, 0x67, 0xb1, 0x42, 0x8b, 0xcf, 0x0a, 0x43, - 0xe7, 0x1c, 0xe4, 0x42, 0x58, 0xc5, 0xfb, 0x42, 0xea, 0xac, 
0xee, 0x42, 0x8b, 0x84, 0x17, 0x43, - 0xdd, 0xf4, 0x2e, 0x43, 0xfb, 0xe5, 0x29, 0x43, 0x3e, 0xb2, 0x3c, 0x43, 0x3e, 0x98, 0x0b, 0x43, - 0xd6, 0x37, 0x04, 0x43, 0x79, 0x5b, 0xc5, 0x42, 0xb6, 0xcb, 0x00, 0x43, 0x10, 0x06, 0xae, 0x42, - 0x69, 0xdc, 0xbe, 0x42, 0x77, 0x58, 0x13, 0x43, 0x78, 0x2d, 0x00, 0x43, 0xc2, 0x60, 0xdc, 0x42, - 0x66, 0xd8, 0x03, 0x43, 0xc2, 0xc5, 0x04, 0x43, 0xa7, 0x16, 0x25, 0x43, 0x57, 0x57, 0x11, 0x43, - 0x9e, 0x08, 0x1a, 0x43, 0x82, 0x7f, 0xe4, 0x42, 0x94, 0x6f, 0xe5, 0x42, 0x7b, 0x52, 0x02, 0x43, - 0x70, 0xeb, 0x08, 0x43, 0x89, 0x11, 0xb7, 0x42, 0xd4, 0xe4, 0xba, 0x42, 0x6b, 0x95, 0x0d, 0x43, - 0x4e, 0x94, 0xea, 0x42, 0x53, 0x8b, 0xf3, 0x42, 0x9a, 0x28, 0x06, 0x43, 0xb2, 0x4f, 0x0f, 0x43, - 0x6d, 0x68, 0x25, 0x43, 0x15, 0x43, 0xf5, 0x42, 0x6e, 0xe4, 0xf9, 0x42, 0x8e, 0x17, 0xdc, 0x42, - 0x59, 0x7c, 0xb3, 0x42, 0xb9, 0xa7, 0xe4, 0x42, 0xe8, 0x6a, 0xf5, 0x42, 0xf4, 0x10, 0xc2, 0x42, - 0xb3, 0x62, 0xa1, 0x42, 0xa7, 0xba, 0x08, 0x43, 0xc6, 0xa0, 0x03, 0x43, 0x8f, 0x90, 0x1c, 0x43, - 0xa9, 0x37, 0x23, 0x43, 0x64, 0x8f, 0x14, 0x43, 0x76, 0xd0, 0x0a, 0x43, 0xf2, 0x51, 0xfd, 0x42, - 0x6c, 0x57, 0xe2, 0x42, 0xdf, 0x0a, 0xe3, 0x42, 0x9c, 0xe8, 0xed, 0x42, 0x8e, 0xdf, 0xea, 0x42, - 0x0c, 0x31, 0x0e, 0x43, 0x26, 0xa4, 0xc6, 0x42, 0x97, 0x38, 0xab, 0x42, 0xe4, 0x88, 0x0a, 0x43, - 0x47, 0xda, 0x0c, 0x43, 0x7a, 0x9f, 0x10, 0x43, 0xb6, 0x4b, 0x09, 0x43, 0x38, 0x22, 0x16, 0x43, - 0x9b, 0x5a, 0x1d, 0x43, 0x38, 0x48, 0x1b, 0x43, 0x2d, 0x96, 0x16, 0x43, 0xa8, 0x66, 0xf8, 0x42, - 0x43, 0xbd, 0x03, 0x43, 0xa7, 0xbd, 0x17, 0x43, 0xba, 0x24, 0x18, 0x43, 0xa3, 0x1c, 0xce, 0x42, - 0xea, 0x34, 0xbe, 0x42, 0x35, 0x42, 0x16, 0x43, 0xff, 0xbd, 0x0b, 0x43, 0x35, 0x47, 0x14, 0x43, - 0x5e, 0xd8, 0x06, 0x43, 0xc2, 0xf2, 0x02, 0x43, 0xfe, 0x70, 0x0e, 0x43, 0x22, 0x89, 0x1a, 0x43, - 0x92, 0x81, 0x07, 0x43, 0x82, 0xd0, 0x01, 0x43, 0xf7, 0x5c, 0x1b, 0x43, 0x7b, 0x8f, 0x11, 0x43, - 0xc0, 0xc5, 0x29, 0x43, 0xd0, 0x5c, 0xe9, 0x42, 0x05, 0x59, 0x92, 0x42, 0x16, 0x05, 0x03, 0x43, - 
0x64, 0xc1, 0xd2, 0x42, 0xc0, 0x81, 0x05, 0x43, 0xc8, 0x5d, 0xf5, 0x42, 0xa4, 0x46, 0xf0, 0x42, - 0x29, 0x7d, 0xe9, 0x42, 0x51, 0x7d, 0x14, 0x43, 0xbc, 0xcd, 0x10, 0x43, 0x04, 0x53, 0x13, 0x43, - 0x92, 0x86, 0x1d, 0x43, 0x46, 0x7f, 0x33, 0x43, 0x30, 0xd8, 0x09, 0x43, 0xf4, 0x71, 0xb4, 0x42, - 0x28, 0x02, 0x8c, 0x42, 0xd9, 0x85, 0xf5, 0x42, 0xae, 0x08, 0xc8, 0x42, 0xe7, 0x09, 0xc2, 0x42, - 0x9a, 0x44, 0xc9, 0x42, 0x54, 0x82, 0xea, 0x42, 0x9b, 0x2e, 0xef, 0x42, 0x60, 0xf8, 0x13, 0x43, - 0x0b, 0x08, 0x0e, 0x43, 0x80, 0x73, 0x1f, 0x43, 0x45, 0x7f, 0x30, 0x43, 0xcc, 0xab, 0x14, 0x43, - 0xc0, 0xd6, 0xf3, 0x42, 0x58, 0x7d, 0xa7, 0x42, 0x13, 0x6f, 0x39, 0x42, 0x0a, 0x75, 0x82, 0x42, - 0x7d, 0x01, 0x89, 0x42, 0xc0, 0xdf, 0x89, 0x42, 0x26, 0xf9, 0x9b, 0x42, 0x29, 0x72, 0xa4, 0x42, - 0xce, 0xab, 0xa5, 0x42, 0x74, 0xc7, 0xc5, 0x42, 0x11, 0xf7, 0xcd, 0x42, 0xc2, 0x37, 0xf1, 0x42, - 0x0b, 0xcf, 0xaf, 0x42, 0xb1, 0x5d, 0xa2, 0x42, 0xc7, 0xa3, 0x24, 0x42, 0x51, 0x2e, 0x2e, 0x42, - 0x71, 0xa7, 0x5f, 0x42, 0x3e, 0x43, 0x96, 0x42, 0xfe, 0x56, 0x8e, 0x42, 0x9e, 0xc3, 0xa9, 0x42, - 0x9d, 0x94, 0xd4, 0x42, 0xed, 0x4e, 0xb8, 0x42, 0xda, 0x74, 0xd7, 0x42, 0xeb, 0xca, 0xc0, 0x42, - 0xaf, 0xc7, 0xec, 0x42, 0xd9, 0x2c, 0x8e, 0x42, 0x32, 0x60, 0xab, 0x42, 0xba, 0xfd, 0xce, 0x42, - 0xbc, 0x9a, 0xb7, 0x42, 0x45, 0x35, 0x49, 0x42, 0x6b, 0xb2, 0xbb, 0x42, 0xc8, 0xae, 0x02, 0x43, - 0x77, 0x74, 0xac, 0x42, 0x03, 0x77, 0xdc, 0x42, 0x5f, 0xa8, 0x01, 0x43, 0xef, 0x79, 0xde, 0x42, - 0x71, 0xee, 0x1b, 0x43, 0x69, 0xcf, 0x20, 0x43, 0xf4, 0xbf, 0x30, 0x43, 0x1f, 0x66, 0xfb, 0x42, - 0xf1, 0xae, 0x1c, 0x43, 0x66, 0x6e, 0x0f, 0x43, 0x00, 0x98, 0x13, 0x43, 0xd1, 0xfa, 0xc1, 0x42, - 0xd7, 0x67, 0xc3, 0x42, 0xc7, 0x1a, 0xe0, 0x42, 0xf1, 0xfe, 0xbd, 0x42, 0xd7, 0xdc, 0x08, 0x43, - 0x58, 0x72, 0x15, 0x43, 0x58, 0xd5, 0x11, 0x43, 0x92, 0x57, 0x23, 0x43, 0xc2, 0x9f, 0x27, 0x43, - 0x1e, 0xca, 0x29, 0x43, 0xe2, 0xbf, 0x07, 0x43, 0x05, 0x82, 0x1a, 0x43, 0x0c, 0x67, 0x1c, 0x43, - 0xae, 0xa2, 0x1a, 0x43, 0x8c, 0xb9, 
0xbf, 0x42, 0x73, 0xf9, 0xcf, 0x42, 0x0c, 0x0b, 0x02, 0x43, - 0x46, 0xb0, 0xe3, 0x42, 0xbd, 0xdc, 0xde, 0x42, 0xf5, 0x1e, 0x03, 0x43, 0x3c, 0xf4, 0x09, 0x43, - 0x7e, 0x74, 0x47, 0x43, 0x02, 0x44, 0x37, 0x43, 0x56, 0x50, 0x33, 0x43, 0xbf, 0x77, 0x16, 0x43, - 0xeb, 0x9a, 0x1f, 0x43, 0x8a, 0x9f, 0x1f, 0x43, 0x8d, 0xbb, 0x0f, 0x43, 0x98, 0x19, 0xb4, 0x42, - 0x0b, 0x1c, 0xb0, 0x42, 0x3b, 0xf9, 0xf0, 0x42, 0x70, 0xbc, 0xe4, 0x42, 0xfc, 0x5f, 0x06, 0x43, - 0xb7, 0x5f, 0x03, 0x43, 0x8a, 0xf0, 0x15, 0x43, 0x58, 0xc6, 0x43, 0x43, 0x06, 0x20, 0x3a, 0x43, - 0x23, 0xe3, 0x1b, 0x43, 0x21, 0xba, 0x21, 0x43, 0x00, 0xbd, 0x22, 0x43, 0x41, 0x5e, 0x12, 0x43, - 0x0b, 0x07, 0x05, 0x43, 0x25, 0xa7, 0xa0, 0x42, 0xb5, 0xd0, 0xce, 0x42, 0xf2, 0x04, 0x0a, 0x43, - 0x88, 0xe8, 0xfd, 0x42, 0xf0, 0xab, 0x10, 0x43, 0x4e, 0x2e, 0x05, 0x43, 0x20, 0xfa, 0x23, 0x43, - 0x75, 0x3b, 0x3b, 0x43, 0x5a, 0x30, 0x4e, 0x43, 0x5a, 0xd4, 0x3a, 0x43, 0xdb, 0x30, 0x11, 0x43, - 0xa7, 0x31, 0x11, 0x43, 0x5f, 0xdf, 0x04, 0x43, 0x3b, 0xcb, 0xe7, 0x42, 0xdb, 0x76, 0xaa, 0x42, - 0x82, 0xbd, 0xe0, 0x42, 0xc1, 0xfc, 0x10, 0x43, 0x13, 0x5d, 0xfd, 0x42, 0xcd, 0x26, 0x02, 0x43, - 0x2e, 0x8b, 0x15, 0x43, 0xc3, 0x45, 0x20, 0x43, 0x51, 0x07, 0x30, 0x43, 0x5a, 0xb6, 0x40, 0x43, - 0x02, 0xca, 0x19, 0x43, 0x40, 0xfc, 0xf1, 0x42, 0x57, 0xcd, 0xee, 0x42, 0x5e, 0x1f, 0x0d, 0x43, - 0x2a, 0x26, 0x0e, 0x43, 0x1b, 0x02, 0xcf, 0x42, 0x43, 0xfc, 0xd3, 0x42, 0xc8, 0xca, 0x0d, 0x43, - 0x33, 0xb2, 0xf6, 0x42, 0x23, 0xc6, 0xfe, 0x42, 0x56, 0x6f, 0x04, 0x43, 0x24, 0xdf, 0x2d, 0x43, - 0x8d, 0xf3, 0x27, 0x43, 0x6b, 0xec, 0x15, 0x43, 0x9a, 0x97, 0xfe, 0x42, 0x89, 0x20, 0xe2, 0x42, - 0x0a, 0x93, 0xdd, 0x42, 0xcf, 0xb1, 0xfe, 0x42, 0x16, 0xa4, 0x10, 0x43, 0x4c, 0x28, 0xcf, 0x42, - 0x5c, 0x01, 0xbe, 0x42, 0xed, 0xc5, 0x07, 0x43, 0x55, 0x13, 0x1c, 0x43, 0x75, 0xca, 0x18, 0x43, - 0x3e, 0x35, 0x0f, 0x43, 0x4d, 0xab, 0x14, 0x43, 0xf5, 0xaa, 0x15, 0x43, 0x36, 0x75, 0x14, 0x43, - 0x4b, 0xeb, 0x0a, 0x43, 0x46, 0x27, 0x0e, 0x43, 0xee, 0xfe, 0x00, 0x43, 
0xc0, 0x58, 0x01, 0x43, - 0xe4, 0xcd, 0x0d, 0x43, 0x46, 0x63, 0xc1, 0x42, 0x85, 0xc6, 0xd2, 0x42, 0x8e, 0x4b, 0x14, 0x43, - 0xa1, 0x69, 0x18, 0x43, 0x45, 0xbd, 0x22, 0x43, 0xa0, 0x62, 0x15, 0x43, 0x7e, 0x3c, 0x22, 0x43, - 0x5e, 0xd7, 0x1b, 0x43, 0xe0, 0x18, 0x2c, 0x43, 0x6a, 0x9b, 0x22, 0x43, 0xc0, 0xbf, 0x12, 0x43, - 0xf4, 0xbd, 0x0d, 0x43, 0x98, 0x54, 0x1b, 0x43, 0xdc, 0x3a, 0x23, 0x43, 0x86, 0xbb, 0xe2, 0x42, - 0x6f, 0x8e, 0xc7, 0x42, 0x71, 0x56, 0x1f, 0x43, 0xba, 0xe9, 0x13, 0x43, 0x62, 0xb3, 0x1f, 0x43, - 0xee, 0xae, 0x1b, 0x43, 0xe6, 0x36, 0x1e, 0x43, 0xfa, 0x59, 0x15, 0x43, 0x44, 0xe1, 0x1f, 0x43, - 0x96, 0x33, 0x18, 0x43, 0xc0, 0x35, 0x18, 0x43, 0x81, 0x48, 0x20, 0x43, 0xc0, 0xd3, 0x1b, 0x43, - 0xfe, 0x3f, 0x42, 0x43, 0x8f, 0xf9, 0xf7, 0x42, 0x16, 0xd7, 0xa6, 0x42, 0xca, 0x49, 0x07, 0x43, - 0x6d, 0x59, 0xde, 0x42, 0x4b, 0x50, 0x0d, 0x43, 0xa6, 0x80, 0xf4, 0x42, 0x34, 0xac, 0xe7, 0x42, - 0x50, 0x0b, 0x08, 0x43, 0x22, 0x74, 0x1b, 0x43, 0x9a, 0xee, 0x1f, 0x43, 0x3a, 0x1f, 0x2b, 0x43, - 0x2f, 0x6f, 0x27, 0x43, 0x48, 0x7b, 0x3d, 0x43, 0x73, 0x5c, 0x18, 0x43, 0xe3, 0xd0, 0xc1, 0x42, - 0xa9, 0x29, 0xc3, 0x42, 0x31, 0x61, 0xe6, 0x42, 0xc1, 0x8d, 0xa6, 0x42, 0xb4, 0x30, 0xf4, 0x42, - 0xe3, 0x90, 0x02, 0x43, 0x18, 0x53, 0x04, 0x43, 0xc5, 0x3f, 0xfe, 0x42, 0x78, 0x89, 0x16, 0x43, - 0x9d, 0x49, 0x25, 0x43, 0x49, 0xe9, 0x39, 0x43, 0xea, 0x85, 0x40, 0x43, 0xaa, 0x0e, 0x22, 0x43, - 0xf3, 0x35, 0xe8, 0x42, 0x89, 0x36, 0xa6, 0x42, 0xf3, 0x0a, 0x72, 0x42, 0xc9, 0x7e, 0x8b, 0x42, - 0x89, 0x25, 0x99, 0x42, 0xa2, 0xd7, 0x9a, 0x42, 0x3f, 0x01, 0xb6, 0x42, 0x0d, 0x75, 0xb9, 0x42, - 0x41, 0xe7, 0xb4, 0x42, 0x95, 0xf9, 0xd2, 0x42, 0xf1, 0x91, 0xe3, 0x42, 0xb6, 0x0d, 0x06, 0x43, - 0x99, 0xc3, 0xcd, 0x42, 0x93, 0x43, 0xa1, 0x42, 0xeb, 0x50, 0x76, 0x42, 0xe3, 0x82, 0x6d, 0x42, - 0x92, 0x15, 0x36, 0x42, 0x70, 0x82, 0x8a, 0x42, 0x9f, 0x24, 0x7f, 0x42, 0xda, 0x5f, 0x9f, 0x42, - 0xd0, 0x1c, 0xc9, 0x42, 0x92, 0x36, 0xc4, 0x42, 0x86, 0x27, 0xc1, 0x42, 0x2a, 0xac, 0xbc, 0x42, - 0x58, 0xc1, 
0xc3, 0x42, 0x62, 0x7d, 0x88, 0x42, 0x3c, 0x6a, 0xd6, 0x42, 0xdc, 0xda, 0xa9, 0x42, - 0x52, 0xbb, 0xab, 0x42, 0x09, 0x51, 0x34, 0x42, 0x06, 0x65, 0x9f, 0x42, 0xda, 0x70, 0xcd, 0x42, - 0x40, 0x31, 0xd5, 0x42, 0x48, 0x53, 0xfc, 0x42, 0xc2, 0x32, 0x0b, 0x43, 0x52, 0x85, 0xfb, 0x42, - 0x4b, 0xc0, 0x17, 0x43, 0x1b, 0xfc, 0x11, 0x43, 0x64, 0xe7, 0x19, 0x43, 0xc4, 0xd5, 0xd7, 0x42, - 0xba, 0x06, 0x19, 0x43, 0x63, 0xa7, 0x05, 0x43, 0xa7, 0xf8, 0x18, 0x43, 0xf8, 0x9e, 0xaa, 0x42, - 0x32, 0xbf, 0xba, 0x42, 0x50, 0x7d, 0xb7, 0x42, 0x16, 0xd3, 0xbd, 0x42, 0xcc, 0xcc, 0x00, 0x43, - 0xd3, 0xd6, 0x09, 0x43, 0x71, 0xca, 0x06, 0x43, 0x87, 0x8c, 0x20, 0x43, 0xf3, 0x21, 0x23, 0x43, - 0xa7, 0x0c, 0x13, 0x43, 0xa0, 0xd4, 0x01, 0x43, 0x97, 0x68, 0x0d, 0x43, 0x66, 0xdd, 0x07, 0x43, - 0xca, 0x1d, 0x0f, 0x43, 0xc0, 0xdd, 0xc4, 0x42, 0xb8, 0xf1, 0xa0, 0x42, 0x1e, 0x48, 0xf6, 0x42, - 0x3e, 0x9f, 0xd9, 0x42, 0x32, 0xfe, 0x06, 0x43, 0x38, 0x3e, 0xfa, 0x42, 0x49, 0x11, 0x15, 0x43, - 0xab, 0x3f, 0x1b, 0x43, 0xc7, 0xfd, 0x27, 0x43, 0x21, 0xfc, 0x1f, 0x43, 0x50, 0xaf, 0x1d, 0x43, - 0x29, 0xad, 0x02, 0x43, 0x49, 0xe3, 0x16, 0x43, 0xe0, 0x1a, 0xfb, 0x42, 0xa6, 0x32, 0xbd, 0x42, - 0x90, 0xd9, 0xcd, 0x42, 0xce, 0x5a, 0xea, 0x42, 0xe4, 0xbb, 0xd2, 0x42, 0xf4, 0x73, 0x01, 0x43, - 0x26, 0x9a, 0xda, 0x42, 0x7a, 0x81, 0x17, 0x43, 0x7b, 0x8d, 0x28, 0x43, 0xf1, 0x59, 0x23, 0x43, - 0x51, 0xf3, 0x28, 0x43, 0xdf, 0x50, 0x19, 0x43, 0x73, 0xae, 0x09, 0x43, 0x9a, 0x7c, 0xf8, 0x42, - 0x66, 0x04, 0xf2, 0x42, 0x20, 0x5b, 0x9f, 0x42, 0xec, 0x3c, 0xdb, 0x42, 0x0d, 0xc4, 0x04, 0x43, - 0x8c, 0xac, 0xeb, 0x42, 0x72, 0x47, 0x0b, 0x43, 0x2c, 0xba, 0xf5, 0x42, 0x73, 0xd7, 0x06, 0x43, - 0x15, 0x6a, 0x36, 0x43, 0xdd, 0xb7, 0x35, 0x43, 0x57, 0x89, 0x33, 0x43, 0x6f, 0xf0, 0x0c, 0x43, - 0xd1, 0x77, 0x16, 0x43, 0x3c, 0x21, 0x00, 0x43, 0xe3, 0x6a, 0x09, 0x43, 0xaa, 0xb1, 0xa8, 0x42, - 0x18, 0x9c, 0xd8, 0x42, 0x9f, 0xe6, 0x0b, 0x43, 0xea, 0x77, 0xe7, 0x42, 0xa8, 0xc4, 0xfb, 0x42, - 0x35, 0xb3, 0x0f, 0x43, 0xe8, 0xc9, 0x12, 0x43, 
0x5b, 0x2d, 0x33, 0x43, 0x51, 0xfc, 0x1e, 0x43, - 0xeb, 0x43, 0x03, 0x43, 0x06, 0x11, 0xcf, 0x42, 0x62, 0x1a, 0xed, 0x42, 0xa2, 0xe5, 0x02, 0x43, - 0xa0, 0x6b, 0x0d, 0x43, 0x32, 0x25, 0xa3, 0x42, 0x58, 0x7b, 0xcd, 0x42, 0x3b, 0x7e, 0x12, 0x43, - 0xb4, 0x6a, 0xdc, 0x42, 0x20, 0x02, 0xf6, 0x42, 0x9e, 0x4d, 0xfc, 0x42, 0x94, 0xab, 0x20, 0x43, - 0xcb, 0xdb, 0x1d, 0x43, 0x0c, 0x19, 0x13, 0x43, 0xc7, 0xd8, 0x00, 0x43, 0xe6, 0xc5, 0xd9, 0x42, - 0xe2, 0xae, 0xc9, 0x42, 0x28, 0x70, 0x01, 0x43, 0x93, 0x22, 0x0e, 0x43, 0xf2, 0xbc, 0xb7, 0x42, - 0xba, 0x29, 0xaa, 0x42, 0xe1, 0x49, 0x1a, 0x43, 0xa0, 0xde, 0x00, 0x43, 0xac, 0x00, 0x02, 0x43, - 0x59, 0x3f, 0x01, 0x43, 0x25, 0x1f, 0x20, 0x43, 0x38, 0x32, 0x1c, 0x43, 0x55, 0x7b, 0x05, 0x43, - 0x6a, 0x15, 0x06, 0x43, 0x9b, 0xa0, 0x05, 0x43, 0x5c, 0x86, 0xf0, 0x42, 0xaa, 0xa6, 0xfa, 0x42, - 0x69, 0x51, 0x16, 0x43, 0x54, 0xb6, 0xc9, 0x42, 0x94, 0x73, 0xc5, 0x42, 0x31, 0x68, 0x19, 0x43, - 0x4c, 0xf1, 0x20, 0x43, 0xd8, 0xda, 0x16, 0x43, 0x19, 0x29, 0x0b, 0x43, 0xf1, 0x45, 0x21, 0x43, - 0x38, 0x2f, 0x0c, 0x43, 0xcd, 0xa2, 0x20, 0x43, 0xab, 0xb1, 0x0f, 0x43, 0x02, 0xf4, 0x01, 0x43, - 0x27, 0x9e, 0x02, 0x43, 0x2b, 0x67, 0x12, 0x43, 0x7b, 0x2d, 0x1f, 0x43, 0xfc, 0x3a, 0xde, 0x42, - 0xdc, 0xca, 0xd8, 0x42, 0x52, 0x88, 0x00, 0x43, 0x42, 0x53, 0x22, 0x43, 0x5f, 0xd1, 0x09, 0x43, - 0x9c, 0x0b, 0x07, 0x43, 0x54, 0x98, 0x0c, 0x43, 0xa1, 0xe0, 0x07, 0x43, 0x23, 0x25, 0x26, 0x43, - 0x33, 0x1c, 0x0b, 0x43, 0x3b, 0x39, 0x04, 0x43, 0xd1, 0xcc, 0x11, 0x43, 0x70, 0xae, 0x17, 0x43, - 0x09, 0x5e, 0x2c, 0x43, 0x4a, 0x81, 0xbf, 0x42, 0x52, 0x5f, 0xad, 0x42, 0xc0, 0x89, 0xe5, 0x42, - 0xea, 0xf0, 0x0a, 0x43, 0x9e, 0x70, 0xfc, 0x42, 0xc8, 0x95, 0xe3, 0x42, 0xf8, 0x98, 0xf5, 0x42, - 0xb1, 0xcc, 0x09, 0x43, 0x47, 0x10, 0x11, 0x43, 0x64, 0xd6, 0x0d, 0x43, 0x18, 0x19, 0x19, 0x43, - 0x80, 0xb2, 0x2a, 0x43, 0x2f, 0x18, 0x2b, 0x43, 0xe6, 0xcd, 0x13, 0x43, 0xd0, 0x9f, 0xa5, 0x42, - 0xd4, 0x99, 0xaa, 0x42, 0x7a, 0x76, 0xc2, 0x42, 0xd6, 0xe5, 0xe2, 0x42, 0x5c, 0x4a, 
0x03, 0x43, - 0x14, 0x51, 0xc9, 0x42, 0x0c, 0xf1, 0xce, 0x42, 0xa9, 0x85, 0x09, 0x43, 0x12, 0xd6, 0x1d, 0x43, - 0xa2, 0x30, 0x15, 0x43, 0xdd, 0xe0, 0x2e, 0x43, 0x5f, 0x78, 0x13, 0x43, 0x35, 0x50, 0x08, 0x43, - 0xa4, 0x61, 0xfc, 0x42, 0x8c, 0x96, 0x97, 0x42, 0x79, 0x23, 0x61, 0x42, 0xfe, 0x55, 0x87, 0x42, - 0x94, 0xa3, 0x8b, 0x42, 0x06, 0xf9, 0xb2, 0x42, 0xba, 0xb3, 0xb1, 0x42, 0xde, 0x1a, 0x8c, 0x42, - 0xba, 0x0b, 0xa1, 0x42, 0x5c, 0xab, 0xd3, 0x42, 0x64, 0x98, 0xed, 0x42, 0x10, 0x97, 0xfd, 0x42, - 0x66, 0xfd, 0xc9, 0x42, 0x9c, 0xbc, 0x8a, 0x42, 0xea, 0xed, 0x97, 0x42, 0x17, 0xcd, 0x4c, 0x42, - 0x32, 0xcb, 0xb6, 0x41, 0xb5, 0x7d, 0x60, 0x42, 0x23, 0xc4, 0x86, 0x42, 0x4c, 0xb5, 0x92, 0x42, - 0xd3, 0xf7, 0xab, 0x42, 0x90, 0x26, 0x9e, 0x42, 0x82, 0x0f, 0xbd, 0x42, 0x0a, 0x00, 0xa7, 0x42, - 0x08, 0x96, 0xc0, 0x42, 0xc5, 0x33, 0x8c, 0x42, 0x04, 0xcc, 0xa6, 0x42, 0xf6, 0x85, 0x92, 0x42, - 0xae, 0x54, 0xb9, 0x42, 0xb5, 0x5c, 0x37, 0x42, 0xc3, 0x69, 0xb1, 0x42, 0x73, 0x78, 0xd0, 0x42, - 0x16, 0xc4, 0xa6, 0x42, 0x8c, 0x65, 0xd0, 0x42, 0x3c, 0x2d, 0x0f, 0x43, 0x42, 0x7c, 0xf1, 0x42, - 0x63, 0x70, 0x1c, 0x43, 0xb5, 0xec, 0x10, 0x43, 0x9f, 0x30, 0x19, 0x43, 0x53, 0xf2, 0xed, 0x42, - 0x0b, 0xc2, 0x0d, 0x43, 0x9b, 0x83, 0x1b, 0x43, 0xf6, 0xc6, 0x0a, 0x43, 0x68, 0xc9, 0x97, 0x42, - 0x31, 0xc0, 0xb8, 0x42, 0x3a, 0xd1, 0xd1, 0x42, 0x57, 0x5f, 0xe1, 0x42, 0x44, 0x6e, 0xf5, 0x42, - 0x32, 0x3b, 0x1a, 0x43, 0xee, 0x35, 0x19, 0x43, 0x4d, 0x67, 0x1e, 0x43, 0x87, 0xd1, 0x23, 0x43, - 0x5f, 0x47, 0x14, 0x43, 0x22, 0xff, 0x0a, 0x43, 0x87, 0x46, 0x18, 0x43, 0x2f, 0xbb, 0x0f, 0x43, - 0xdf, 0xa4, 0x12, 0x43, 0xaf, 0xf7, 0xbc, 0x42, 0xb2, 0x53, 0xdb, 0x42, 0x59, 0xd2, 0xe8, 0x42, - 0x38, 0xdd, 0xc4, 0x42, 0x00, 0xdb, 0xe4, 0x42, 0x7b, 0x9f, 0x01, 0x43, 0x02, 0x67, 0x01, 0x43, - 0x90, 0x79, 0x3f, 0x43, 0xa4, 0x6e, 0x33, 0x43, 0x3f, 0x2f, 0x34, 0x43, 0x7e, 0x67, 0x11, 0x43, - 0x69, 0x0b, 0x1e, 0x43, 0x15, 0x70, 0x20, 0x43, 0x4f, 0xc7, 0x06, 0x43, 0x7c, 0x5c, 0xaa, 0x42, - 0x6c, 0x80, 0xad, 0x42, 
0x00, 0x1f, 0xe4, 0x42, 0x56, 0x69, 0xf4, 0x42, 0xcb, 0xbb, 0xf6, 0x42, - 0x61, 0x45, 0x06, 0x43, 0x40, 0x83, 0x1b, 0x43, 0x8a, 0xbe, 0x1d, 0x43, 0x23, 0xd9, 0x40, 0x43, - 0xca, 0xbd, 0x29, 0x43, 0x53, 0x64, 0x10, 0x43, 0x7d, 0x59, 0x14, 0x43, 0x2f, 0x9e, 0x19, 0x43, - 0x7e, 0xb4, 0xfc, 0x42, 0x96, 0x91, 0x96, 0x42, 0x6f, 0xf6, 0xcf, 0x42, 0xf5, 0x17, 0x13, 0x43, - 0x65, 0x53, 0xe8, 0x42, 0x40, 0xf5, 0xfc, 0x42, 0x67, 0xc2, 0x08, 0x43, 0xc9, 0x39, 0x0a, 0x43, - 0x5d, 0x71, 0x36, 0x43, 0xe3, 0xd0, 0x4b, 0x43, 0x45, 0x41, 0x3c, 0x43, 0xee, 0xfd, 0x12, 0x43, - 0x67, 0xaf, 0x0d, 0x43, 0xe7, 0xfe, 0x05, 0x43, 0x6d, 0xfe, 0x00, 0x43, 0x6c, 0xf7, 0xa4, 0x42, - 0xc9, 0x10, 0xd0, 0x42, 0x2b, 0xf1, 0x0f, 0x43, 0xfe, 0x3d, 0xfd, 0x42, 0xdc, 0xc8, 0xfa, 0x42, - 0xdf, 0xa4, 0x0f, 0x43, 0x54, 0x08, 0x16, 0x43, 0x2f, 0x0a, 0x2a, 0x43, 0x3e, 0x13, 0x2c, 0x43, - 0xd8, 0x7f, 0x19, 0x43, 0x25, 0x04, 0xf3, 0x42, 0x27, 0x86, 0xe1, 0x42, 0x51, 0xb9, 0xf3, 0x42, - 0xf5, 0x35, 0x18, 0x43, 0x74, 0xb9, 0xb0, 0x42, 0x34, 0x2e, 0xc8, 0x42, 0xdc, 0x39, 0x05, 0x43, - 0x50, 0x0b, 0xf5, 0x42, 0x5c, 0x63, 0x0b, 0x43, 0x1c, 0x45, 0xf9, 0x42, 0x03, 0x4b, 0x1c, 0x43, - 0x8c, 0xf5, 0x2c, 0x43, 0xfc, 0x67, 0x29, 0x43, 0xff, 0x60, 0x21, 0x43, 0xe6, 0x4b, 0xcb, 0x42, - 0x1f, 0x99, 0xcb, 0x42, 0xb0, 0x24, 0x0f, 0x43, 0x7b, 0x9b, 0x1c, 0x43, 0x83, 0x6f, 0xb7, 0x42, - 0x51, 0xd7, 0xc8, 0x42, 0x79, 0xd8, 0x23, 0x43, 0x3e, 0x5c, 0x0e, 0x43, 0x3b, 0x82, 0xf0, 0x42, - 0x77, 0x13, 0x03, 0x43, 0x7f, 0x8e, 0x12, 0x43, 0xe7, 0x62, 0x11, 0x43, 0x72, 0xa1, 0x07, 0x43, - 0x11, 0xdd, 0x16, 0x43, 0x8f, 0x6f, 0xef, 0x42, 0x19, 0x29, 0x05, 0x43, 0x4e, 0x2f, 0xe8, 0x42, - 0x9b, 0x32, 0x16, 0x43, 0x33, 0x9c, 0xd7, 0x42, 0xee, 0x05, 0xb7, 0x42, 0x83, 0x9b, 0x20, 0x43, - 0x34, 0xe0, 0x12, 0x43, 0xb4, 0xc2, 0x23, 0x43, 0xe3, 0x37, 0x1e, 0x43, 0xa3, 0xc0, 0x09, 0x43, - 0x39, 0xf4, 0x17, 0x43, 0x05, 0xf9, 0x1f, 0x43, 0xf5, 0xad, 0x17, 0x43, 0xf4, 0xed, 0x15, 0x43, - 0x78, 0x60, 0xfa, 0x42, 0xb5, 0x9c, 0x07, 0x43, 0x49, 0xa8, 
0x26, 0x43, 0x59, 0xa4, 0xe6, 0x42, - 0xb4, 0x29, 0xa6, 0x42, 0xca, 0x81, 0x1c, 0x43, 0x50, 0x63, 0x18, 0x43, 0xef, 0x23, 0x1b, 0x43, - 0x47, 0x01, 0x1b, 0x43, 0x11, 0x17, 0x19, 0x43, 0x2d, 0xfc, 0x18, 0x43, 0x33, 0x66, 0x10, 0x43, - 0x81, 0x5e, 0x0e, 0x43, 0xbc, 0xb7, 0x09, 0x43, 0xac, 0x63, 0x25, 0x43, 0xec, 0xf6, 0x20, 0x43, - 0xbf, 0xb5, 0x1f, 0x43, 0x56, 0xcf, 0xd7, 0x42, 0x80, 0xb3, 0x98, 0x42, 0x66, 0x90, 0x0d, 0x43, - 0xf8, 0x0f, 0xf9, 0x42, 0x9f, 0x7a, 0x05, 0x43, 0x34, 0x07, 0xed, 0x42, 0xb3, 0x1f, 0x05, 0x43, - 0xc6, 0x38, 0x17, 0x43, 0x5c, 0x1c, 0x2d, 0x43, 0xe1, 0xf8, 0x0b, 0x43, 0x9f, 0xfe, 0x25, 0x43, - 0xb6, 0xb7, 0x1d, 0x43, 0x1b, 0xb5, 0x39, 0x43, 0xdf, 0xde, 0x1c, 0x43, 0x1b, 0x7f, 0xc4, 0x42, - 0xaf, 0x61, 0xa9, 0x42, 0xd2, 0x23, 0xdd, 0x42, 0x06, 0x1a, 0xe6, 0x42, 0x72, 0xd4, 0xf6, 0x42, - 0x01, 0x1f, 0xcb, 0x42, 0xd8, 0x79, 0xdd, 0x42, 0x3d, 0x05, 0xdc, 0x42, 0xac, 0xdb, 0x28, 0x43, - 0x55, 0x02, 0x24, 0x43, 0xb9, 0xdd, 0x2c, 0x43, 0x51, 0xbc, 0x1c, 0x43, 0x99, 0xc3, 0x1c, 0x43, - 0x70, 0x4d, 0x05, 0x43, 0xf2, 0xd9, 0xac, 0x42, 0xfd, 0xac, 0x2a, 0x42, 0x19, 0x32, 0x9c, 0x42, - 0xa4, 0x19, 0x85, 0x42, 0xc3, 0xe3, 0x98, 0x42, 0xb2, 0xa7, 0xb1, 0x42, 0x36, 0xac, 0x8c, 0x42, - 0x15, 0x0b, 0xa6, 0x42, 0xdd, 0xdf, 0xcd, 0x42, 0xcc, 0x82, 0xed, 0x42, 0x08, 0x66, 0x05, 0x43, - 0x21, 0xf0, 0xd2, 0x42, 0xa3, 0x24, 0xa7, 0x42, 0xb5, 0xf1, 0x45, 0x42, 0xdc, 0x76, 0x52, 0x42, - 0x66, 0x8a, 0x49, 0x42, 0x56, 0x70, 0x9b, 0x42, 0x66, 0x61, 0x60, 0x42, 0xb6, 0xa1, 0xa5, 0x42, - 0x5b, 0x5f, 0xbe, 0x42, 0xc9, 0x3a, 0xc3, 0x42, 0xc4, 0x26, 0xc9, 0x42, 0x5e, 0x81, 0xb2, 0x42, - 0x0b, 0x47, 0xd4, 0x42, 0x6b, 0xd2, 0xae, 0x42, 0x4f, 0x8a, 0xb5, 0x42, 0x22, 0x7a, 0xa8, 0x42, - 0x97, 0xc9, 0xa2, 0x42, 0x85, 0xb0, 0x23, 0x42, 0xea, 0xe8, 0xb0, 0x42, 0xe8, 0xa0, 0xcc, 0x42, - 0x49, 0x0f, 0xd2, 0x42, 0x5c, 0xd2, 0xfd, 0x42, 0xb2, 0xc0, 0xef, 0x42, 0xe8, 0x3a, 0xf4, 0x42, - 0xf7, 0x51, 0x0d, 0x43, 0x76, 0x03, 0x0f, 0x43, 0xae, 0xfc, 0x18, 0x43, 0xba, 0x21, 0xdc, 0x42, - 
0x2f, 0x93, 0x08, 0x43, 0x90, 0x30, 0x18, 0x43, 0xce, 0x79, 0x15, 0x43, 0x86, 0x70, 0xb2, 0x42, - 0x04, 0xa4, 0x99, 0x42, 0xfe, 0xf0, 0xe0, 0x42, 0x20, 0xbc, 0xe0, 0x42, 0x5e, 0x23, 0xdc, 0x42, - 0x22, 0xd9, 0x08, 0x43, 0xb2, 0x79, 0x08, 0x43, 0x89, 0xc7, 0x1d, 0x43, 0x94, 0x98, 0x1d, 0x43, - 0xd8, 0xc3, 0x1a, 0x43, 0x04, 0x0a, 0xf2, 0x42, 0x5c, 0xcf, 0x15, 0x43, 0x92, 0x8e, 0x11, 0x43, - 0x22, 0xd0, 0x1b, 0x43, 0x24, 0x30, 0xbe, 0x42, 0x3a, 0x9b, 0xbb, 0x42, 0xf9, 0xaa, 0x04, 0x43, - 0xdb, 0x74, 0xf4, 0x42, 0x43, 0xc3, 0x01, 0x43, 0x71, 0xfe, 0x00, 0x43, 0xfe, 0x2b, 0x0e, 0x43, - 0x56, 0xf6, 0x1b, 0x43, 0xc3, 0xf5, 0x3a, 0x43, 0xe7, 0xa6, 0x31, 0x43, 0x24, 0xd0, 0x24, 0x43, - 0x21, 0x67, 0x17, 0x43, 0x49, 0x04, 0x17, 0x43, 0x1f, 0xb0, 0x0b, 0x43, 0x1c, 0x32, 0x9f, 0x42, - 0x56, 0x49, 0xb4, 0x42, 0xa8, 0x62, 0xe6, 0x42, 0x14, 0xb4, 0xd8, 0x42, 0x2c, 0xa1, 0xe9, 0x42, - 0x6f, 0x3e, 0x01, 0x43, 0x91, 0x47, 0x14, 0x43, 0xbb, 0x17, 0x21, 0x43, 0x6a, 0x13, 0x3d, 0x43, - 0x4b, 0x56, 0x2e, 0x43, 0x34, 0x5a, 0x1d, 0x43, 0x2c, 0xed, 0x0b, 0x43, 0xa2, 0xf6, 0x0d, 0x43, - 0xa0, 0xb7, 0xfb, 0x42, 0xbe, 0x88, 0xb2, 0x42, 0x24, 0x91, 0xba, 0x42, 0x16, 0xc2, 0xf8, 0x42, - 0xe0, 0xf1, 0xfb, 0x42, 0x6f, 0x7c, 0x0b, 0x43, 0x18, 0xcb, 0xea, 0x42, 0xad, 0xf4, 0x14, 0x43, - 0x3a, 0xeb, 0x3e, 0x43, 0xf5, 0x76, 0x40, 0x43, 0x6c, 0xf9, 0x42, 0x43, 0x15, 0x36, 0x17, 0x43, - 0x92, 0x62, 0x02, 0x43, 0x47, 0xc6, 0xf7, 0x42, 0xc9, 0xcc, 0x03, 0x43, 0x7a, 0x56, 0xa8, 0x42, - 0x9e, 0x52, 0xd5, 0x42, 0x75, 0x8a, 0x09, 0x43, 0x75, 0x17, 0xfc, 0x42, 0x57, 0x17, 0xfe, 0x42, - 0x98, 0x84, 0x05, 0x43, 0xf0, 0x43, 0x19, 0x43, 0xe4, 0xc1, 0x27, 0x43, 0x40, 0xd8, 0x11, 0x43, - 0x47, 0x72, 0x18, 0x43, 0x86, 0xcb, 0xea, 0x42, 0x55, 0x31, 0x05, 0x43, 0xac, 0xf4, 0xfa, 0x42, - 0xa0, 0x09, 0x06, 0x43, 0x6d, 0x81, 0xc6, 0x42, 0x98, 0x56, 0xca, 0x42, 0xdb, 0x4b, 0x10, 0x43, - 0x0e, 0xa3, 0xf4, 0x42, 0x1c, 0x0d, 0x00, 0x43, 0x68, 0xb6, 0x05, 0x43, 0x71, 0xc2, 0x08, 0x43, - 0x09, 0xf1, 0x2b, 0x43, 0x0d, 0x1f, 
0x10, 0x43, 0x46, 0x21, 0x0a, 0x43, 0x08, 0x5c, 0xea, 0x42, - 0xe3, 0x2b, 0xf8, 0x42, 0x3c, 0x26, 0x04, 0x43, 0xd4, 0x43, 0x04, 0x43, 0xba, 0x6a, 0xce, 0x42, - 0x64, 0xd2, 0xc2, 0x42, 0x96, 0xde, 0x14, 0x43, 0x81, 0xee, 0x01, 0x43, 0x48, 0xe2, 0xf2, 0x42, - 0xd6, 0x50, 0x12, 0x43, 0xc1, 0x08, 0x0a, 0x43, 0xc1, 0x63, 0x1e, 0x43, 0x98, 0xe2, 0x06, 0x43, - 0x03, 0x86, 0xee, 0x42, 0xf6, 0x4e, 0xff, 0x42, 0x84, 0x5e, 0xf7, 0x42, 0xc6, 0x54, 0xfe, 0x42, - 0x16, 0xde, 0x19, 0x43, 0x00, 0x73, 0xc5, 0x42, 0x58, 0xab, 0xb0, 0x42, 0x19, 0x32, 0x20, 0x43, - 0x64, 0xa9, 0x1c, 0x43, 0xd8, 0xcb, 0x1e, 0x43, 0x58, 0x6e, 0x1c, 0x43, 0x1e, 0x82, 0x21, 0x43, - 0xdf, 0x4e, 0x1e, 0x43, 0xea, 0x0d, 0x1e, 0x43, 0x48, 0x71, 0x13, 0x43, 0x02, 0xb8, 0xfb, 0x42, - 0xa8, 0xaa, 0xfd, 0x42, 0x25, 0x6d, 0x1a, 0x43, 0xc0, 0xb9, 0x28, 0x43, 0x27, 0xd9, 0xc6, 0x42, - 0xca, 0x69, 0xb3, 0x42, 0x1a, 0xa5, 0x19, 0x43, 0x64, 0xa7, 0x17, 0x43, 0xe0, 0xcf, 0x0c, 0x43, - 0x45, 0xb3, 0xfc, 0x42, 0xbe, 0x6c, 0x0d, 0x43, 0x24, 0xcf, 0x11, 0x43, 0xfe, 0x89, 0x1a, 0x43, - 0xf6, 0x27, 0x13, 0x43, 0xbb, 0xd7, 0x06, 0x43, 0x3c, 0xc5, 0x1c, 0x43, 0xa4, 0x8c, 0x1a, 0x43, - 0x60, 0x6c, 0x2e, 0x43, 0x5a, 0x77, 0xdd, 0x42, 0x8d, 0x46, 0x9e, 0x42, 0xe8, 0xd5, 0xfa, 0x42, - 0x81, 0x60, 0xe8, 0x42, 0x25, 0xa3, 0x04, 0x43, 0xbc, 0x0f, 0xf9, 0x42, 0x74, 0x4f, 0x04, 0x43, - 0xf1, 0x3c, 0x03, 0x43, 0x56, 0xe8, 0x16, 0x43, 0xcc, 0x1c, 0x10, 0x43, 0xb5, 0xb0, 0x1c, 0x43, - 0x8e, 0x8e, 0x19, 0x43, 0x28, 0xd0, 0x32, 0x43, 0x30, 0x71, 0x19, 0x43, 0xb7, 0xf4, 0xbe, 0x42, - 0x67, 0x0f, 0x99, 0x42, 0x23, 0x3b, 0xeb, 0x42, 0xd8, 0x80, 0xec, 0x42, 0x85, 0xb6, 0xdf, 0x42, - 0x4b, 0x7d, 0xf9, 0x42, 0x21, 0x00, 0xde, 0x42, 0xe4, 0x7f, 0xfb, 0x42, 0x01, 0xc9, 0x17, 0x43, - 0x5c, 0x6f, 0x1d, 0x43, 0xfc, 0x28, 0x32, 0x43, 0x47, 0xc3, 0x1d, 0x43, 0xc4, 0xdb, 0x0f, 0x43, - 0x16, 0x01, 0x06, 0x43, 0xfa, 0x3f, 0xa3, 0x42, 0xe2, 0x2d, 0x6d, 0x42, 0x83, 0x79, 0x94, 0x42, - 0xc2, 0x7f, 0x96, 0x42, 0xf1, 0x10, 0xa1, 0x42, 0x9b, 0xea, 0xa0, 0x42, 
0xb4, 0x79, 0x97, 0x42, - 0x2c, 0xf8, 0xa1, 0x42, 0xac, 0x97, 0xd0, 0x42, 0x2e, 0xba, 0xdb, 0x42, 0xb6, 0x0b, 0xfc, 0x42, - 0xd6, 0x52, 0xd2, 0x42, 0x0c, 0xfd, 0xb2, 0x42, 0x6c, 0xa5, 0x83, 0x42, 0x65, 0x4b, 0x69, 0x42, - 0xe1, 0x3f, 0x7a, 0x42, 0x59, 0x6c, 0xbf, 0x42, 0x1c, 0xd6, 0x9c, 0x42, 0x13, 0x33, 0xb5, 0x42, - 0xbc, 0x23, 0xe1, 0x42, 0x31, 0x9f, 0xbf, 0x42, 0x7a, 0x37, 0x03, 0x43, 0xd6, 0xb9, 0xd1, 0x42, - 0xfb, 0x0f, 0xed, 0x42, 0x43, 0x14, 0xc0, 0x42, 0x8d, 0xb0, 0xde, 0x42, 0xdf, 0x7f, 0xc9, 0x42, - 0x6f, 0x4e, 0xf5, 0x42, 0x10, 0xb4, 0x68, 0x42, 0xb5, 0x8f, 0xe9, 0x42, 0x0f, 0x35, 0xf9, 0x42, - 0xf0, 0xd9, 0xbc, 0x42, 0xd3, 0x00, 0x03, 0x43, 0xf8, 0x67, 0x0a, 0x43, 0x2e, 0xa5, 0x07, 0x43, - 0x20, 0x2c, 0x2c, 0x43, 0x9c, 0x88, 0x20, 0x43, 0xf2, 0xfb, 0x27, 0x43, 0x9c, 0x95, 0x0a, 0x43, - 0xaa, 0xbb, 0x1f, 0x43, 0x5a, 0xe4, 0x17, 0x43, 0x9a, 0x18, 0x13, 0x43, 0x29, 0xd3, 0xb6, 0x42, - 0xb8, 0xed, 0xbe, 0x42, 0xb0, 0x31, 0xff, 0x42, 0xcb, 0x76, 0xf5, 0x42, 0x82, 0x45, 0x15, 0x43, - 0x6a, 0xd2, 0x18, 0x43, 0x6a, 0xe0, 0x14, 0x43, 0xb6, 0xe4, 0x3a, 0x43, 0x3a, 0x8b, 0x28, 0x43, - 0x5c, 0x85, 0x33, 0x43, 0x6c, 0x5d, 0x2a, 0x43, 0x6c, 0x7a, 0x1e, 0x43, 0x7a, 0x63, 0x22, 0x43, - 0x10, 0x9d, 0x22, 0x43, 0x1b, 0x21, 0xe5, 0x42, 0xe8, 0xfd, 0xde, 0x42, 0xb5, 0xec, 0xfb, 0x42, - 0x31, 0x8a, 0xdc, 0x42, 0xe4, 0x1a, 0x05, 0x43, 0xbe, 0x56, 0x01, 0x43, 0xbe, 0x10, 0x13, 0x43, - 0x14, 0xef, 0x31, 0x43, 0x48, 0xf0, 0x26, 0x43, 0xac, 0x62, 0x43, 0x43, 0xd2, 0x8f, 0x23, 0x43, - 0x8a, 0x5e, 0x1a, 0x43, 0xa0, 0x5d, 0x1d, 0x43, 0xa0, 0x9b, 0x0f, 0x43, 0x20, 0x4a, 0xd9, 0x42, - 0x19, 0x1c, 0xbb, 0x42, 0x02, 0xc3, 0x05, 0x43, 0x96, 0xe1, 0x12, 0x43, 0x4a, 0x5e, 0x06, 0x43, - 0x8e, 0x0b, 0x17, 0x43, 0x4c, 0xb0, 0x27, 0x43, 0xd0, 0x6e, 0x3f, 0x43, 0xb0, 0x07, 0x3c, 0x43, - 0x36, 0xfe, 0x45, 0x43, 0x5a, 0x42, 0x2e, 0x43, 0xea, 0x02, 0x25, 0x43, 0xaa, 0x46, 0x10, 0x43, - 0x52, 0xa2, 0x15, 0x43, 0x2e, 0xd2, 0xab, 0x42, 0xed, 0xa2, 0xcd, 0x42, 0x58, 0x5d, 0x14, 0x43, - 0xa2, 0x6c, 
0x07, 0x43, 0x68, 0xfd, 0x18, 0x43, 0x42, 0x0b, 0x15, 0x43, 0xc0, 0x6f, 0x26, 0x43, - 0x94, 0xb5, 0x4a, 0x43, 0x4e, 0xd8, 0x4f, 0x43, 0xc8, 0x9b, 0x3c, 0x43, 0x96, 0x73, 0x2a, 0x43, - 0xe4, 0xab, 0x0c, 0x43, 0x3b, 0x9e, 0xf5, 0x42, 0xb0, 0x32, 0x0c, 0x43, 0x2d, 0x40, 0xcf, 0x42, - 0xdf, 0x27, 0xd2, 0x42, 0x2e, 0x88, 0x1c, 0x43, 0xb0, 0xeb, 0x12, 0x43, 0x32, 0xa2, 0x0d, 0x43, - 0x0a, 0xdf, 0x02, 0x43, 0x6e, 0x9c, 0x2c, 0x43, 0x84, 0xf5, 0x40, 0x43, 0xf0, 0x02, 0x30, 0x43, - 0x10, 0x90, 0x28, 0x43, 0xe0, 0xc6, 0x03, 0x43, 0x9a, 0x4a, 0xfd, 0x42, 0x57, 0x6b, 0x0e, 0x43, - 0x4a, 0xb9, 0x14, 0x43, 0x8a, 0x3b, 0xcc, 0x42, 0xc1, 0x8e, 0xc6, 0x42, 0x20, 0xa5, 0x23, 0x43, - 0xf8, 0x72, 0x11, 0x43, 0x2a, 0x55, 0x0a, 0x43, 0xda, 0xfa, 0x1a, 0x43, 0xf8, 0xfa, 0x1f, 0x43, - 0x98, 0x66, 0x2c, 0x43, 0x94, 0xf9, 0x14, 0x43, 0xde, 0x7e, 0x12, 0x43, 0x2c, 0x09, 0x00, 0x43, - 0x9d, 0x8b, 0xfc, 0x42, 0xa8, 0x33, 0x21, 0x43, 0xbc, 0x1e, 0x18, 0x43, 0x39, 0xe4, 0xe2, 0x42, - 0xf1, 0xa2, 0xdb, 0x42, 0xb6, 0x59, 0x25, 0x43, 0xce, 0x1a, 0x19, 0x43, 0x98, 0xa5, 0x0d, 0x43, - 0x46, 0x00, 0x15, 0x43, 0xfe, 0x60, 0x29, 0x43, 0xca, 0xe4, 0x20, 0x43, 0x9a, 0x55, 0x1f, 0x43, - 0xc0, 0x08, 0x17, 0x43, 0xfc, 0xdf, 0x0e, 0x43, 0x1b, 0x68, 0x05, 0x43, 0xb2, 0xa4, 0x05, 0x43, - 0xa8, 0x1a, 0x17, 0x43, 0x7b, 0x8d, 0xdb, 0x42, 0xff, 0xd6, 0xe0, 0x42, 0xde, 0x18, 0x1b, 0x43, - 0xae, 0xa5, 0x24, 0x43, 0x84, 0x65, 0x2b, 0x43, 0x9c, 0xa0, 0x2b, 0x43, 0x8c, 0x2f, 0x34, 0x43, - 0x96, 0xe9, 0x24, 0x43, 0x14, 0xbb, 0x3a, 0x43, 0x16, 0x17, 0x1a, 0x43, 0x10, 0xea, 0x06, 0x43, - 0x48, 0xe0, 0x0c, 0x43, 0xe2, 0xd6, 0x1d, 0x43, 0xc4, 0x66, 0x3a, 0x43, 0x37, 0xe4, 0xe4, 0x42, - 0x6a, 0xda, 0xc7, 0x42, 0x02, 0x0e, 0x27, 0x43, 0x40, 0x04, 0x18, 0x43, 0xb8, 0x61, 0x29, 0x43, - 0x9c, 0x9c, 0x0b, 0x43, 0x98, 0xb9, 0x12, 0x43, 0x76, 0x90, 0x22, 0x43, 0xe6, 0x16, 0x27, 0x43, - 0xaa, 0x13, 0x1c, 0x43, 0xf0, 0x33, 0x23, 0x43, 0xd0, 0x45, 0x31, 0x43, 0x18, 0xe3, 0x38, 0x43, - 0x20, 0x7b, 0x3f, 0x43, 0xe9, 0xb7, 0xe6, 0x42, 
0x97, 0x1c, 0xc0, 0x42, 0x7f, 0x5b, 0x11, 0x43, - 0x24, 0x17, 0xff, 0x42, 0xf4, 0x04, 0x1b, 0x43, 0xfa, 0xc2, 0x0b, 0x43, 0x02, 0xf7, 0x0a, 0x43, - 0xb8, 0x9a, 0x17, 0x43, 0x8e, 0x15, 0x28, 0x43, 0xd0, 0x45, 0x2e, 0x43, 0xac, 0x1d, 0x2a, 0x43, - 0x80, 0x82, 0x2d, 0x43, 0x0e, 0x65, 0x42, 0x43, 0xbe, 0x63, 0x1c, 0x43, 0x78, 0x4c, 0xdd, 0x42, - 0xea, 0x8f, 0xa9, 0x42, 0xfd, 0x2b, 0xfb, 0x42, 0x73, 0x23, 0xf5, 0x42, 0xc0, 0xbd, 0x06, 0x43, - 0x30, 0x12, 0xfe, 0x42, 0x04, 0x8c, 0x09, 0x43, 0x1a, 0x72, 0x09, 0x43, 0x30, 0x6d, 0x26, 0x43, - 0xec, 0x79, 0x33, 0x43, 0x1c, 0x9e, 0x4b, 0x43, 0xac, 0xcf, 0x25, 0x43, 0xa4, 0x4b, 0x1a, 0x43, - 0xf0, 0x0d, 0x03, 0x43, 0xd1, 0x08, 0xbe, 0x42, 0x05, 0x5e, 0x85, 0x42, 0x7b, 0xe3, 0xb3, 0x42, - 0x95, 0xdc, 0xb0, 0x42, 0x03, 0x35, 0xbb, 0x42, 0x8e, 0x2b, 0xcc, 0x42, 0x0a, 0xdc, 0xd2, 0x42, - 0x3b, 0xd8, 0xc2, 0x42, 0x62, 0xef, 0xf1, 0x42, 0x9f, 0x54, 0xea, 0x42, 0x58, 0x1e, 0x0c, 0x43, - 0xba, 0x43, 0xd6, 0x42, 0x9e, 0xa3, 0xd4, 0x42, 0x8d, 0xb0, 0xa8, 0x42, 0x6b, 0xd7, 0x84, 0x42, - 0xde, 0xe2, 0x4b, 0x42, 0x1e, 0x3e, 0x99, 0x42, 0xa7, 0x7e, 0x93, 0x42, 0x28, 0x5f, 0xd2, 0x42, - 0x98, 0x53, 0xdf, 0x42, 0x52, 0x91, 0xd4, 0x42, 0xb6, 0x76, 0xd9, 0x42, 0x82, 0x53, 0xe4, 0x42, - 0x5a, 0xf1, 0xca, 0x42, 0x6a, 0x8d, 0xa7, 0x42, 0x86, 0x4d, 0xc1, 0x42, 0x50, 0x34, 0xd2, 0x42, - 0xe2, 0x53, 0xaa, 0x42, 0x3e, 0xa7, 0x6d, 0x42, 0x36, 0xc4, 0xcd, 0x42, 0x58, 0x28, 0xce, 0x42, - 0x12, 0xb9, 0xca, 0x42, 0xdf, 0xb4, 0x00, 0x43, 0x57, 0xa2, 0x12, 0x43, 0x4f, 0xa9, 0x13, 0x43, - 0x1a, 0x74, 0x25, 0x43, 0xe5, 0xa9, 0x3d, 0x43, 0x66, 0x7b, 0x44, 0x43, 0x1e, 0xbd, 0x07, 0x43, - 0x97, 0xfc, 0x20, 0x43, 0x27, 0xd6, 0x24, 0x43, 0xbc, 0xc5, 0x23, 0x43, 0x82, 0x03, 0xc2, 0x42, - 0x28, 0x4e, 0xe9, 0x42, 0xf4, 0xab, 0xea, 0x42, 0x58, 0xb6, 0xbf, 0x42, 0xfc, 0xa4, 0xf5, 0x42, - 0x26, 0x8a, 0x25, 0x43, 0x0d, 0xd5, 0x0e, 0x43, 0xc0, 0xd6, 0x3b, 0x43, 0xed, 0x5a, 0x39, 0x43, - 0x86, 0x54, 0x39, 0x43, 0x82, 0x6a, 0x12, 0x43, 0x2a, 0xb5, 0x22, 0x43, 0x4a, 0x7e, 
0x23, 0x43, - 0xc0, 0x1b, 0x29, 0x43, 0xb8, 0x23, 0xe0, 0x42, 0x7a, 0x0e, 0xcc, 0x42, 0x36, 0xcf, 0x13, 0x43, - 0xf0, 0x80, 0x04, 0x43, 0x58, 0xd9, 0xfc, 0x42, 0xf6, 0xfe, 0x0e, 0x43, 0x23, 0x9f, 0x1d, 0x43, - 0x55, 0x6d, 0x27, 0x43, 0xcc, 0xa1, 0x46, 0x43, 0x60, 0x15, 0x3a, 0x43, 0x3c, 0x48, 0x28, 0x43, - 0xd2, 0xc9, 0x23, 0x43, 0xce, 0x45, 0x2f, 0x43, 0xe2, 0x4c, 0x26, 0x43, 0x2a, 0xce, 0xd9, 0x42, - 0x58, 0x8b, 0xe3, 0x42, 0x58, 0x5f, 0xfe, 0x42, 0x10, 0x99, 0x0a, 0x43, 0xf7, 0x2a, 0x08, 0x43, - 0xd1, 0x73, 0x1e, 0x43, 0x60, 0xf6, 0x33, 0x43, 0xf1, 0x15, 0x30, 0x43, 0x43, 0x73, 0x47, 0x43, - 0x1b, 0x43, 0x38, 0x43, 0x1f, 0x86, 0x20, 0x43, 0xaf, 0x93, 0x15, 0x43, 0x58, 0xc0, 0x22, 0x43, - 0x06, 0x8b, 0x08, 0x43, 0xda, 0x45, 0xc3, 0x42, 0x72, 0x8c, 0xf3, 0x42, 0x3f, 0x76, 0x2e, 0x43, - 0x2f, 0x7f, 0x10, 0x43, 0x7d, 0xbf, 0x19, 0x43, 0x7c, 0x17, 0x17, 0x43, 0xb4, 0x29, 0x47, 0x43, - 0xe0, 0x5e, 0x55, 0x43, 0xd6, 0xa5, 0x4f, 0x43, 0xce, 0x52, 0x58, 0x43, 0x11, 0xb4, 0x1d, 0x43, - 0x88, 0x41, 0x12, 0x43, 0x9e, 0x67, 0x0b, 0x43, 0xd5, 0xee, 0x11, 0x43, 0x78, 0xea, 0xd2, 0x42, - 0xac, 0x5d, 0xc6, 0x42, 0xc6, 0x1e, 0x24, 0x43, 0x1e, 0xad, 0x17, 0x43, 0x46, 0x47, 0x06, 0x43, - 0x09, 0x0a, 0x18, 0x43, 0x43, 0x85, 0x3a, 0x43, 0x7c, 0xfe, 0x3f, 0x43, 0xc6, 0x58, 0x36, 0x43, - 0x70, 0x11, 0x30, 0x43, 0x00, 0x37, 0xf7, 0x42, 0xec, 0x34, 0x06, 0x43, 0x81, 0xc5, 0x0a, 0x43, - 0x56, 0x86, 0x1f, 0x43, 0x02, 0xf3, 0xee, 0x42, 0x1a, 0xf9, 0xee, 0x42, 0xd0, 0x32, 0x1c, 0x43, - 0xd2, 0xa8, 0x02, 0x43, 0xb7, 0x09, 0x09, 0x43, 0x54, 0x5e, 0x1f, 0x43, 0x02, 0x66, 0x2b, 0x43, - 0x5e, 0xb6, 0x42, 0x43, 0x76, 0x34, 0x23, 0x43, 0x2c, 0x69, 0x1b, 0x43, 0xae, 0xce, 0x0b, 0x43, - 0x36, 0xfd, 0xe9, 0x42, 0x9b, 0x59, 0x07, 0x43, 0x7e, 0x19, 0x1c, 0x43, 0x08, 0xea, 0xfc, 0x42, - 0x5e, 0x3f, 0xdd, 0x42, 0x1d, 0x9b, 0x22, 0x43, 0xe8, 0xfc, 0x20, 0x43, 0xeb, 0xaf, 0x19, 0x43, - 0xfb, 0x23, 0x28, 0x43, 0x79, 0x8b, 0x2f, 0x43, 0x5a, 0xd6, 0x22, 0x43, 0xb8, 0x21, 0x29, 0x43, - 0x13, 0x94, 0x15, 0x43, 
0x15, 0x5c, 0x04, 0x43, 0x97, 0x2e, 0x11, 0x43, 0x2e, 0xe1, 0x11, 0x43, - 0x72, 0x05, 0x2c, 0x43, 0x12, 0xde, 0xf4, 0x42, 0xca, 0x5a, 0xcf, 0x42, 0x94, 0x19, 0x3b, 0x43, - 0x67, 0x2e, 0x1d, 0x43, 0xa1, 0x30, 0x1b, 0x43, 0xb7, 0xc9, 0x22, 0x43, 0xca, 0x8b, 0x35, 0x43, - 0x3d, 0x4f, 0x2b, 0x43, 0x72, 0x5f, 0x34, 0x43, 0x72, 0x71, 0x2d, 0x43, 0x05, 0xec, 0x18, 0x43, - 0x1c, 0x64, 0x1d, 0x43, 0x17, 0x42, 0x17, 0x43, 0x72, 0x3f, 0x2b, 0x43, 0xc6, 0x09, 0x0d, 0x43, - 0x78, 0xf5, 0xe1, 0x42, 0xe0, 0xae, 0x20, 0x43, 0x12, 0x35, 0x2a, 0x43, 0xa0, 0x21, 0x41, 0x43, - 0x0b, 0x8a, 0x1c, 0x43, 0xdf, 0xd8, 0x13, 0x43, 0x2a, 0x9d, 0x20, 0x43, 0x04, 0xa8, 0x2e, 0x43, - 0xe1, 0x5f, 0x28, 0x43, 0x4a, 0xf3, 0x16, 0x43, 0x31, 0x5d, 0x2c, 0x43, 0xe6, 0x4d, 0x3b, 0x43, - 0x06, 0x91, 0x2c, 0x43, 0x04, 0xd7, 0xfe, 0x42, 0xba, 0xf8, 0xa7, 0x42, 0xe4, 0x72, 0x0d, 0x43, - 0x21, 0x8d, 0x0f, 0x43, 0xa4, 0x09, 0x21, 0x43, 0x9f, 0x6e, 0x0f, 0x43, 0xbc, 0xac, 0x0e, 0x43, - 0xbe, 0x5d, 0x1b, 0x43, 0xf5, 0xc6, 0x1e, 0x43, 0xca, 0x01, 0x2e, 0x43, 0xe7, 0x60, 0x2c, 0x43, - 0xd2, 0x74, 0x36, 0x43, 0x74, 0xca, 0x41, 0x43, 0x4e, 0x0a, 0x2c, 0x43, 0x28, 0x39, 0xb1, 0x42, - 0x46, 0x1f, 0xaa, 0x42, 0x1a, 0xc1, 0xed, 0x42, 0x4a, 0x9c, 0x00, 0x43, 0xb0, 0x02, 0x0e, 0x43, - 0x08, 0x4e, 0xf3, 0x42, 0x42, 0xb7, 0xfc, 0x42, 0xc7, 0x6f, 0x1c, 0x43, 0x5d, 0xda, 0x31, 0x43, - 0xc6, 0xe6, 0x27, 0x43, 0x0a, 0x88, 0x41, 0x43, 0x52, 0x92, 0x37, 0x43, 0x74, 0xf5, 0x30, 0x43, - 0x52, 0xba, 0x0f, 0x43, 0xcc, 0x93, 0xd8, 0x42, 0x4c, 0xd6, 0x94, 0x42, 0xc4, 0x73, 0x89, 0x42, - 0xe2, 0x7c, 0xad, 0x42, 0xf8, 0x99, 0xc9, 0x42, 0x96, 0xe8, 0xdc, 0x42, 0xc6, 0xaf, 0xb9, 0x42, - 0xf6, 0x6f, 0x95, 0x42, 0x4e, 0xda, 0xf0, 0x42, 0x1b, 0x91, 0x0b, 0x43, 0x79, 0x6b, 0x0c, 0x43, - 0x5c, 0xc4, 0xea, 0x42, 0x4c, 0x44, 0xbe, 0x42, 0x48, 0x19, 0xa9, 0x42, 0xdd, 0x92, 0x51, 0x42, - 0xb2, 0x13, 0x6d, 0x42, 0xd6, 0x6a, 0x98, 0x42, 0x65, 0x83, 0x8e, 0x42, 0x31, 0x08, 0x93, 0x42, - 0x7c, 0x98, 0xbc, 0x42, 0x88, 0x63, 0xbc, 0x42, 0x65, 0x26, 
0xd5, 0x42, 0x90, 0xb9, 0xcd, 0x42, - 0x08, 0x86, 0xaf, 0x42, 0x05, 0x15, 0x93, 0x42, 0x86, 0xc6, 0xc7, 0x42, 0x96, 0x1b, 0xac, 0x42, - 0x8c, 0xaa, 0xc5, 0x42, 0xa8, 0xb0, 0x5b, 0x42, 0xc7, 0x70, 0xac, 0x42, 0xac, 0x19, 0xef, 0x42, - 0xac, 0xd8, 0xd2, 0x42, 0x03, 0x6d, 0x07, 0x43, 0x1a, 0x11, 0x16, 0x43, 0xe2, 0x8b, 0x14, 0x43, - 0xa0, 0x84, 0x30, 0x43, 0xac, 0xec, 0x22, 0x43, 0xbf, 0x23, 0x27, 0x43, 0x40, 0xb5, 0xf4, 0x42, - 0x62, 0x2c, 0x15, 0x43, 0x26, 0x41, 0x17, 0x43, 0x2e, 0x1d, 0x1f, 0x43, 0x34, 0x7d, 0x9b, 0x42, - 0x5e, 0x56, 0xd9, 0x42, 0x1e, 0xca, 0xd7, 0x42, 0x9d, 0xab, 0xd7, 0x42, 0x19, 0xaa, 0x06, 0x43, - 0xf1, 0xca, 0x07, 0x43, 0xb1, 0x86, 0x11, 0x43, 0xd5, 0xf5, 0x35, 0x43, 0x90, 0xae, 0x30, 0x43, - 0x8c, 0x4a, 0x2a, 0x43, 0x50, 0xa3, 0x0f, 0x43, 0x7c, 0x6e, 0x17, 0x43, 0xd2, 0xfe, 0x24, 0x43, - 0x74, 0x80, 0x1d, 0x43, 0x74, 0x30, 0xd1, 0x42, 0xda, 0x22, 0xc9, 0x42, 0x58, 0x48, 0xfa, 0x42, - 0x4d, 0x77, 0xc6, 0x42, 0x64, 0xce, 0x0c, 0x43, 0xaf, 0x03, 0x17, 0x43, 0x5b, 0x88, 0x0b, 0x43, - 0xaf, 0x6d, 0x3c, 0x43, 0x55, 0xb1, 0x27, 0x43, 0x62, 0x4f, 0x31, 0x43, 0xdc, 0x4e, 0x22, 0x43, - 0x1a, 0x95, 0x1a, 0x43, 0x1c, 0x9e, 0x23, 0x43, 0xda, 0x91, 0x12, 0x43, 0x0a, 0x8e, 0xdc, 0x42, - 0x42, 0xfc, 0xb5, 0x42, 0xf9, 0x91, 0xf7, 0x42, 0xf9, 0x19, 0xf7, 0x42, 0xf3, 0x07, 0x09, 0x43, - 0x09, 0x88, 0x0f, 0x43, 0xea, 0xa2, 0x22, 0x43, 0xb8, 0x65, 0x1f, 0x43, 0xdb, 0xbb, 0x3f, 0x43, - 0xf3, 0x0f, 0x2d, 0x43, 0xf2, 0x99, 0x1c, 0x43, 0xd0, 0xc8, 0x1c, 0x43, 0x8b, 0xd3, 0x04, 0x43, - 0x38, 0x8b, 0x07, 0x43, 0x9e, 0x73, 0x9a, 0x42, 0x97, 0xe3, 0xd0, 0x42, 0xf8, 0xe2, 0x0e, 0x43, - 0x33, 0xeb, 0x04, 0x43, 0x61, 0x16, 0x0b, 0x43, 0x86, 0x59, 0x05, 0x43, 0x85, 0xd0, 0x1b, 0x43, - 0x9b, 0x56, 0x3f, 0x43, 0x34, 0x66, 0x43, 0x43, 0xaa, 0xf8, 0x49, 0x43, 0xe9, 0xa0, 0x1c, 0x43, - 0xed, 0xa6, 0x02, 0x43, 0x38, 0x92, 0xfd, 0x42, 0xc2, 0x98, 0x13, 0x43, 0x55, 0x05, 0xc7, 0x42, - 0x10, 0x44, 0xe0, 0x42, 0x0c, 0xa2, 0x1f, 0x43, 0x3e, 0x2d, 0x07, 0x43, 0x24, 0xae, 0x10, 0x43, - 
0x22, 0x02, 0x1b, 0x43, 0x01, 0xaf, 0x24, 0x43, 0x50, 0x77, 0x4c, 0x43, 0x3f, 0x08, 0x33, 0x43, - 0x83, 0xd2, 0x11, 0x43, 0x5e, 0xc0, 0x01, 0x43, 0xfa, 0x51, 0xe8, 0x42, 0x28, 0xcc, 0x01, 0x43, - 0xbc, 0x87, 0x17, 0x43, 0x98, 0x72, 0xb9, 0x42, 0x30, 0xda, 0xd7, 0x42, 0x50, 0x31, 0x16, 0x43, - 0x8e, 0xb6, 0x09, 0x43, 0xc9, 0xba, 0x12, 0x43, 0x37, 0x7b, 0x1a, 0x43, 0x07, 0xe9, 0x24, 0x43, - 0xae, 0x60, 0x1f, 0x43, 0x54, 0xd8, 0x1f, 0x43, 0x9c, 0xf8, 0x0b, 0x43, 0xd1, 0xc1, 0xe7, 0x42, - 0xce, 0xa8, 0xe8, 0x42, 0x3c, 0x87, 0x08, 0x43, 0x24, 0xce, 0x17, 0x43, 0xc9, 0xfb, 0xdc, 0x42, - 0x48, 0xb2, 0xdb, 0x42, 0xad, 0x32, 0x1d, 0x43, 0x66, 0x5c, 0x11, 0x43, 0xfd, 0x61, 0x02, 0x43, - 0xac, 0x2b, 0x15, 0x43, 0x19, 0x8a, 0x1d, 0x43, 0x97, 0x4e, 0x23, 0x43, 0xb0, 0x0d, 0x20, 0x43, - 0xa4, 0x22, 0x07, 0x43, 0x56, 0x9c, 0xfe, 0x42, 0xeb, 0x67, 0x03, 0x43, 0x24, 0xa6, 0x0a, 0x43, - 0x18, 0x8c, 0x1f, 0x43, 0x6c, 0x6b, 0xcd, 0x42, 0xd4, 0x5d, 0xd1, 0x42, 0x38, 0x8a, 0x2e, 0x43, - 0xa4, 0xf0, 0x25, 0x43, 0xa8, 0x11, 0x21, 0x43, 0x23, 0x07, 0x29, 0x43, 0x42, 0xd7, 0x2f, 0x43, - 0xd1, 0x58, 0x20, 0x43, 0xb9, 0x00, 0x26, 0x43, 0x1d, 0xe4, 0x18, 0x43, 0x79, 0x6a, 0x0b, 0x43, - 0xf6, 0x6e, 0x0c, 0x43, 0x65, 0x9a, 0x12, 0x43, 0x3e, 0xe5, 0x2c, 0x43, 0x42, 0x17, 0xf9, 0x42, - 0x31, 0xc0, 0xd4, 0x42, 0x86, 0xeb, 0x27, 0x43, 0x60, 0x37, 0x28, 0x43, 0xfc, 0xae, 0x28, 0x43, - 0x66, 0xbb, 0x07, 0x43, 0x76, 0x2f, 0x1f, 0x43, 0xcd, 0x3b, 0x11, 0x43, 0xfe, 0xaa, 0x2f, 0x43, - 0xad, 0xf9, 0x08, 0x43, 0x1f, 0x6c, 0x13, 0x43, 0xd1, 0x14, 0x25, 0x43, 0x0e, 0x63, 0x33, 0x43, - 0x06, 0xa7, 0x33, 0x43, 0xa2, 0x74, 0xf7, 0x42, 0x80, 0xd2, 0xaf, 0x42, 0xa2, 0x42, 0x0e, 0x43, - 0xf1, 0x57, 0x0c, 0x43, 0x70, 0x43, 0x0f, 0x43, 0x7f, 0xe2, 0xef, 0x42, 0xcc, 0x11, 0x05, 0x43, - 0x67, 0xaa, 0x15, 0x43, 0x20, 0xfd, 0x1d, 0x43, 0x89, 0xfd, 0x25, 0x43, 0x14, 0xa5, 0x22, 0x43, - 0xea, 0x28, 0x30, 0x43, 0x78, 0xec, 0x40, 0x43, 0x34, 0xc3, 0x21, 0x43, 0x88, 0xd9, 0xcd, 0x42, - 0xda, 0xb0, 0xa9, 0x42, 0x16, 0x3b, 
0xe1, 0x42, 0xf8, 0x5c, 0x05, 0x43, 0x2f, 0x39, 0xf7, 0x42, - 0xae, 0x31, 0xf0, 0x42, 0x9a, 0xbd, 0xf2, 0x42, 0x04, 0xb2, 0x0a, 0x43, 0x69, 0xb0, 0x1e, 0x43, - 0xdf, 0xc4, 0x30, 0x43, 0x8c, 0x7f, 0x35, 0x43, 0x79, 0x5a, 0x2c, 0x43, 0x40, 0x43, 0x1b, 0x43, - 0x12, 0xf9, 0xed, 0x42, 0xcb, 0xde, 0xa6, 0x42, 0xa4, 0x2c, 0x82, 0x42, 0xfc, 0xfe, 0x99, 0x42, - 0xd0, 0x83, 0xaa, 0x42, 0xf4, 0xc4, 0xb7, 0x42, 0x8f, 0xb3, 0xb1, 0x42, 0xd6, 0x0c, 0xb9, 0x42, - 0x6a, 0x1a, 0xc4, 0x42, 0x56, 0x75, 0xe0, 0x42, 0x94, 0x2b, 0xf7, 0x42, 0xe0, 0xeb, 0x08, 0x43, - 0xf3, 0xf5, 0xd0, 0x42, 0xc6, 0x78, 0xc6, 0x42, 0x2c, 0xf4, 0xa0, 0x42, 0x7a, 0x33, 0x5d, 0x42, - 0xee, 0xf4, 0x13, 0x42, 0x30, 0xb3, 0x66, 0x42, 0x3e, 0x45, 0x61, 0x42, 0xf4, 0x84, 0x7f, 0x42, - 0xe1, 0x9a, 0x8c, 0x42, 0x8d, 0x34, 0x99, 0x42, 0x5e, 0x82, 0xa5, 0x42, 0x3c, 0x22, 0xbf, 0x42, - 0x1b, 0xaf, 0x9f, 0x42, 0xd2, 0xc8, 0x9b, 0x42, 0x63, 0x54, 0x90, 0x42, 0x52, 0x0c, 0x9b, 0x42, - 0x56, 0x22, 0xb4, 0x42, 0x66, 0x13, 0x1b, 0x42, 0xf8, 0xde, 0x9c, 0x42, 0x68, 0x3a, 0xc9, 0x42, - 0xba, 0x72, 0xb4, 0x42, 0xb5, 0x35, 0xb9, 0x42, 0xd5, 0x9a, 0xe9, 0x42, 0x19, 0xe7, 0xd2, 0x42, - 0x11, 0xd2, 0x11, 0x43, 0x29, 0xd3, 0xef, 0x42, 0xb4, 0x54, 0x10, 0x43, 0xdc, 0x52, 0xc2, 0x42, - 0x76, 0xcd, 0xdc, 0x42, 0xcb, 0x23, 0x0e, 0x43, 0xc6, 0x9f, 0xfb, 0x42, 0x42, 0xce, 0x96, 0x42, - 0x8c, 0xaa, 0xa0, 0x42, 0x2a, 0x2b, 0xed, 0x42, 0xfb, 0x73, 0xdf, 0x42, 0x26, 0x9a, 0xde, 0x42, - 0x57, 0xee, 0x0e, 0x43, 0xcb, 0xf6, 0x0c, 0x43, 0xa1, 0x8e, 0x11, 0x43, 0xe6, 0x30, 0x0c, 0x43, - 0x6b, 0x76, 0x18, 0x43, 0x28, 0xb9, 0xfe, 0x42, 0x69, 0xb6, 0x13, 0x43, 0xa4, 0xa7, 0x10, 0x43, - 0xc3, 0x30, 0x10, 0x43, 0x89, 0xc7, 0xde, 0x42, 0x3a, 0x2d, 0xc4, 0x42, 0xef, 0x50, 0xce, 0x42, - 0x66, 0xc9, 0x9c, 0x42, 0xd5, 0x94, 0xe3, 0x42, 0x60, 0xd3, 0x08, 0x43, 0x59, 0x9c, 0xe8, 0x42, - 0x0f, 0x4a, 0x1c, 0x43, 0x68, 0x81, 0x25, 0x43, 0x72, 0x47, 0x2f, 0x43, 0x6d, 0x1b, 0x0a, 0x43, - 0xf5, 0x62, 0x09, 0x43, 0xb3, 0x11, 0x08, 0x43, 0x21, 0x7f, 0x02, 0x43, 
0x86, 0xd0, 0x8b, 0x42, - 0x9c, 0xe1, 0x83, 0x42, 0x5c, 0x77, 0xc4, 0x42, 0xaa, 0xb4, 0xcd, 0x42, 0x12, 0xcf, 0xe0, 0x42, - 0x96, 0x16, 0xf9, 0x42, 0xbc, 0xe0, 0x07, 0x43, 0x3d, 0xb8, 0x19, 0x43, 0x5c, 0x3f, 0x35, 0x43, - 0x05, 0xab, 0x22, 0x43, 0x37, 0x42, 0x06, 0x43, 0x82, 0x68, 0x04, 0x43, 0xdd, 0x20, 0x01, 0x43, - 0xaa, 0x28, 0xd8, 0x42, 0xd1, 0x67, 0x94, 0x42, 0x84, 0xe7, 0xa9, 0x42, 0xde, 0x15, 0xdd, 0x42, - 0x21, 0x0f, 0xd0, 0x42, 0x2e, 0x8f, 0xc6, 0x42, 0x37, 0x33, 0xe6, 0x42, 0x46, 0x04, 0xf6, 0x42, - 0xac, 0x0e, 0x33, 0x43, 0xe5, 0x7a, 0x3d, 0x43, 0x5f, 0x95, 0x1d, 0x43, 0xa5, 0xb1, 0xf0, 0x42, - 0xd7, 0xc1, 0x05, 0x43, 0xd0, 0xc9, 0xe8, 0x42, 0xce, 0x14, 0xea, 0x42, 0xea, 0xe0, 0x8c, 0x42, - 0xe4, 0x08, 0xb9, 0x42, 0xa8, 0xf4, 0x07, 0x43, 0xbb, 0x58, 0xc8, 0x42, 0x7b, 0x74, 0xf0, 0x42, - 0xd7, 0x37, 0x04, 0x43, 0x76, 0xd3, 0x0b, 0x43, 0x37, 0x43, 0x21, 0x43, 0x96, 0x7e, 0x06, 0x43, - 0x46, 0xf6, 0xf5, 0x42, 0x5c, 0xca, 0xe0, 0x42, 0xce, 0xf2, 0xfa, 0x42, 0xa4, 0x95, 0x07, 0x43, - 0x5a, 0x7d, 0xfb, 0x42, 0x46, 0x4d, 0xa6, 0x42, 0x73, 0xbd, 0xd3, 0x42, 0x52, 0x21, 0x01, 0x43, - 0xf7, 0x35, 0xcc, 0x42, 0x18, 0xa8, 0xe8, 0x42, 0x39, 0x93, 0x07, 0x43, 0x83, 0x4c, 0x16, 0x43, - 0x01, 0xf1, 0x12, 0x43, 0x88, 0x2c, 0x15, 0x43, 0x5e, 0x23, 0xf2, 0x42, 0xa8, 0x52, 0xbf, 0x42, - 0x6b, 0xc7, 0xbf, 0x42, 0x2e, 0x86, 0xfb, 0x42, 0xf9, 0x63, 0x08, 0x43, 0xfd, 0xbc, 0xb8, 0x42, - 0x82, 0x25, 0xc1, 0x42, 0xaf, 0xd3, 0x0b, 0x43, 0x15, 0x3a, 0xe9, 0x42, 0x60, 0x46, 0xeb, 0x42, - 0xcb, 0xe0, 0xec, 0x42, 0x12, 0x9a, 0x0e, 0x43, 0x2f, 0xb5, 0x0d, 0x43, 0x1b, 0x7d, 0x12, 0x43, - 0xde, 0x97, 0xe3, 0x42, 0x79, 0xf5, 0xc7, 0x42, 0x79, 0xb0, 0xe4, 0x42, 0xa2, 0xd2, 0xcf, 0x42, - 0xfa, 0x3c, 0xf3, 0x42, 0xef, 0x01, 0x9e, 0x42, 0x0e, 0x25, 0xb0, 0x42, 0xd9, 0xbe, 0x05, 0x43, - 0x00, 0x72, 0x0f, 0x43, 0xf8, 0x72, 0x29, 0x43, 0xfe, 0x3c, 0x0e, 0x43, 0xd3, 0x8a, 0x08, 0x43, - 0x17, 0xd0, 0x08, 0x43, 0xc7, 0xe0, 0x15, 0x43, 0x74, 0xb8, 0x0a, 0x43, 0x90, 0xf5, 0xda, 0x42, - 0xfb, 0xd2, 
0xf1, 0x42, 0x1d, 0x9a, 0x10, 0x43, 0xef, 0x9c, 0x1e, 0x43, 0x42, 0x6e, 0xbd, 0x42, - 0xb9, 0xa0, 0x85, 0x42, 0xdf, 0x9c, 0x10, 0x43, 0xad, 0x00, 0x0d, 0x43, 0xcd, 0x01, 0x12, 0x43, - 0xf0, 0x9e, 0xc2, 0x42, 0x34, 0x3f, 0x06, 0x43, 0x8f, 0x46, 0x0c, 0x43, 0xe7, 0x58, 0x07, 0x43, - 0x82, 0x24, 0x00, 0x43, 0xc0, 0xa3, 0x04, 0x43, 0xef, 0x84, 0x1a, 0x43, 0x94, 0xf3, 0x1e, 0x43, - 0x39, 0xc6, 0x16, 0x43, 0x0b, 0x1c, 0xe3, 0x42, 0x13, 0xc2, 0x9f, 0x42, 0x46, 0x36, 0xe7, 0x42, - 0xb2, 0xe7, 0xe3, 0x42, 0x49, 0xd1, 0xea, 0x42, 0x57, 0x47, 0xd8, 0x42, 0xde, 0xdc, 0xf3, 0x42, - 0xaa, 0x16, 0xf5, 0x42, 0x03, 0x47, 0x19, 0x43, 0xa9, 0xb3, 0x16, 0x43, 0x02, 0x3a, 0x1e, 0x43, - 0xa6, 0x2d, 0x1c, 0x43, 0x9b, 0xdf, 0x21, 0x43, 0x7e, 0xc3, 0x15, 0x43, 0x78, 0x93, 0xb7, 0x42, - 0xb0, 0xf2, 0x9b, 0x42, 0xad, 0xdd, 0xdc, 0x42, 0xe2, 0x68, 0xdd, 0x42, 0xc2, 0x61, 0xc7, 0x42, - 0x24, 0xb6, 0xc8, 0x42, 0x56, 0xf7, 0xc9, 0x42, 0x96, 0xc0, 0xd4, 0x42, 0x78, 0x58, 0x04, 0x43, - 0x33, 0x0e, 0x0f, 0x43, 0x81, 0x82, 0x21, 0x43, 0x1f, 0x59, 0x0c, 0x43, 0xf4, 0xdd, 0x01, 0x43, - 0x52, 0xe7, 0xee, 0x42, 0x04, 0xc8, 0x86, 0x42, 0xa1, 0x7e, 0x54, 0x42, 0x68, 0x63, 0x6f, 0x42, - 0x3c, 0xf8, 0x63, 0x42, 0xf8, 0xd5, 0x7b, 0x42, 0xf2, 0x8e, 0x84, 0x42, 0x4a, 0x7b, 0x96, 0x42, - 0x5d, 0x49, 0xac, 0x42, 0xb6, 0x7c, 0xc0, 0x42, 0xa9, 0x8f, 0xbe, 0x42, 0xae, 0x9e, 0xcf, 0x42, - 0x44, 0x57, 0xb2, 0x42, 0x39, 0xef, 0xaf, 0x42, 0xec, 0xa4, 0x4a, 0x42, 0x96, 0x71, 0x46, 0x42, - 0x38, 0xf8, 0x70, 0x42, 0xb1, 0x2c, 0x86, 0x42, 0x9a, 0xde, 0xa0, 0x42, 0x19, 0x05, 0xae, 0x42, - 0x70, 0x85, 0xc3, 0x42, 0x1a, 0xa9, 0xc7, 0x42, 0x8e, 0x52, 0xda, 0x42, 0x6d, 0x50, 0xda, 0x42, - 0x49, 0x6d, 0xd4, 0x42, 0xc0, 0x4f, 0xaa, 0x42, 0x99, 0x3e, 0xcd, 0x42, 0x23, 0x8b, 0xd6, 0x42, - 0x12, 0x8e, 0xbf, 0x42, 0x7c, 0x70, 0x6b, 0x42, 0x9f, 0xe3, 0xc5, 0x42, 0xdf, 0xdb, 0xf8, 0x42, - 0xcf, 0xce, 0xe3, 0x42, 0x1b, 0x12, 0xf3, 0x42, 0xad, 0xd0, 0x14, 0x43, 0x37, 0xea, 0x0c, 0x43, - 0x23, 0x92, 0x2a, 0x43, 0x5e, 0x19, 0x1d, 0x43, 
0xdd, 0x1b, 0x2a, 0x43, 0xf6, 0x06, 0x0b, 0x43, - 0xa7, 0xfc, 0x26, 0x43, 0x55, 0xf6, 0x11, 0x43, 0x63, 0x49, 0x36, 0x43, 0xf6, 0xca, 0xc8, 0x42, - 0xeb, 0x08, 0xc8, 0x42, 0x1e, 0x9f, 0x03, 0x43, 0xf0, 0xbf, 0xd9, 0x42, 0x88, 0x0c, 0x0d, 0x43, - 0xac, 0x0d, 0x1f, 0x43, 0x6f, 0xa2, 0x1f, 0x43, 0xdb, 0xa2, 0x47, 0x43, 0x6f, 0x62, 0x37, 0x43, - 0x2c, 0x63, 0x2b, 0x43, 0x59, 0x79, 0x0b, 0x43, 0x17, 0xa5, 0x22, 0x43, 0x20, 0xc9, 0x24, 0x43, - 0xc5, 0x1b, 0x20, 0x43, 0x12, 0x48, 0xdd, 0x42, 0x24, 0x5d, 0xd0, 0x42, 0xec, 0x10, 0x04, 0x43, - 0xdb, 0xa9, 0xda, 0x42, 0x92, 0xd8, 0x06, 0x43, 0xc3, 0x22, 0x19, 0x43, 0xa7, 0xe5, 0x11, 0x43, - 0xdc, 0xd1, 0x2f, 0x43, 0x17, 0x6f, 0x51, 0x43, 0xe9, 0xa6, 0x4e, 0x43, 0x80, 0x3b, 0x1d, 0x43, - 0x13, 0xa0, 0x1f, 0x43, 0xf3, 0xb5, 0x1c, 0x43, 0xb6, 0x5a, 0x0f, 0x43, 0xbd, 0xbc, 0xb8, 0x42, - 0x3d, 0x79, 0xc9, 0x42, 0x56, 0xfd, 0x07, 0x43, 0x24, 0x9e, 0x02, 0x43, 0x64, 0xed, 0x12, 0x43, - 0xfa, 0xb7, 0x1d, 0x43, 0x2c, 0x40, 0x1a, 0x43, 0xa5, 0x37, 0x42, 0x43, 0x1e, 0xed, 0x3f, 0x43, - 0x3b, 0x4a, 0x45, 0x43, 0x4d, 0x09, 0x1f, 0x43, 0x73, 0x3d, 0x1c, 0x43, 0x8c, 0xaa, 0x14, 0x43, - 0x29, 0xe6, 0xf6, 0x42, 0x57, 0x51, 0xc9, 0x42, 0x4b, 0x59, 0xcd, 0x42, 0x41, 0x39, 0x1f, 0x43, - 0x75, 0x0b, 0x0b, 0x43, 0xd5, 0x1c, 0x17, 0x43, 0xad, 0x94, 0x11, 0x43, 0xb8, 0x07, 0x24, 0x43, - 0xe5, 0xe9, 0x49, 0x43, 0x3b, 0xdf, 0x5e, 0x43, 0x7b, 0x7f, 0x42, 0x43, 0xd8, 0x40, 0x1b, 0x43, - 0xea, 0x7a, 0x1d, 0x43, 0x93, 0xf5, 0x0a, 0x43, 0x41, 0x91, 0x15, 0x43, 0x35, 0xe8, 0xb2, 0x42, - 0x4f, 0x39, 0xe8, 0x42, 0xff, 0xcb, 0x1c, 0x43, 0xc9, 0x3d, 0x01, 0x43, 0xb1, 0x85, 0x10, 0x43, - 0xde, 0x62, 0x26, 0x43, 0xe1, 0x97, 0x23, 0x43, 0x51, 0x37, 0x3a, 0x43, 0xf7, 0xac, 0x31, 0x43, - 0x68, 0x02, 0x11, 0x43, 0xf1, 0xcf, 0xec, 0x42, 0x9a, 0xc5, 0x00, 0x43, 0xc5, 0x20, 0x06, 0x43, - 0x9b, 0x91, 0x21, 0x43, 0x3f, 0xbc, 0xd4, 0x42, 0x7d, 0x29, 0xe0, 0x42, 0xf9, 0x72, 0x22, 0x43, - 0x15, 0xe9, 0xfd, 0x42, 0x8c, 0x7f, 0x11, 0x43, 0x76, 0x23, 0x23, 0x43, 0xdd, 0x70, 
0x29, 0x43, - 0x4f, 0x92, 0x2c, 0x43, 0x8f, 0x2e, 0x2a, 0x43, 0x27, 0xcf, 0x1b, 0x43, 0xa3, 0x60, 0xfe, 0x42, - 0x3e, 0xee, 0xe1, 0x42, 0xd9, 0x41, 0x08, 0x43, 0x2f, 0xb5, 0x1b, 0x43, 0xaa, 0x6e, 0xee, 0x42, - 0x10, 0x4b, 0xc5, 0x42, 0x93, 0x46, 0x22, 0x43, 0xb8, 0xa2, 0x14, 0x43, 0x14, 0xe8, 0x22, 0x43, - 0x83, 0x2e, 0x19, 0x43, 0x41, 0x0d, 0x2a, 0x43, 0x3d, 0x94, 0x28, 0x43, 0x7f, 0x7a, 0x26, 0x43, - 0xcd, 0x1c, 0x07, 0x43, 0xdf, 0x39, 0x05, 0x43, 0x57, 0xda, 0x04, 0x43, 0xa3, 0x98, 0x0a, 0x43, - 0xdb, 0x40, 0x1a, 0x43, 0xdd, 0x43, 0xd7, 0x42, 0x9a, 0xd0, 0xce, 0x42, 0x2d, 0x1f, 0x23, 0x43, - 0x0a, 0x7e, 0x23, 0x43, 0x86, 0x54, 0x37, 0x43, 0x0b, 0x35, 0x2b, 0x43, 0x68, 0xf0, 0x2b, 0x43, - 0x6b, 0xdf, 0x1e, 0x43, 0x27, 0x4e, 0x1f, 0x43, 0x06, 0x74, 0x19, 0x43, 0x74, 0x45, 0x0e, 0x43, - 0x5d, 0x68, 0x13, 0x43, 0x8d, 0xf2, 0x16, 0x43, 0x41, 0x7d, 0x3c, 0x43, 0x8f, 0xa1, 0x0a, 0x43, - 0xab, 0xd3, 0xc5, 0x42, 0x6c, 0x88, 0x23, 0x43, 0xed, 0xed, 0x2a, 0x43, 0x94, 0x0c, 0x18, 0x43, - 0x24, 0x68, 0x08, 0x43, 0xd7, 0x70, 0x1b, 0x43, 0xed, 0x30, 0x20, 0x43, 0x30, 0x0f, 0x34, 0x43, - 0xf8, 0x3a, 0x14, 0x43, 0x77, 0x0f, 0x14, 0x43, 0x9a, 0xf1, 0x30, 0x43, 0x1d, 0xd3, 0x33, 0x43, - 0x45, 0x35, 0x3b, 0x43, 0x4f, 0xe5, 0xe6, 0x42, 0x72, 0x58, 0xc6, 0x42, 0x21, 0xff, 0x13, 0x43, - 0xd0, 0xe1, 0x04, 0x43, 0x32, 0x02, 0x0e, 0x43, 0x65, 0x72, 0xf6, 0x42, 0x09, 0xe2, 0x0e, 0x43, - 0xf1, 0xe4, 0x14, 0x43, 0xc5, 0x4b, 0x33, 0x43, 0x99, 0xde, 0x29, 0x43, 0xf7, 0x6c, 0x37, 0x43, - 0x9f, 0xde, 0x31, 0x43, 0xbc, 0xf7, 0x40, 0x43, 0x5e, 0x4a, 0x29, 0x43, 0x6b, 0x14, 0xe5, 0x42, - 0xb3, 0x32, 0xb9, 0x42, 0x50, 0xd7, 0x03, 0x43, 0x95, 0xca, 0xf0, 0x42, 0xbe, 0xf0, 0x00, 0x43, - 0xf3, 0x62, 0xfe, 0x42, 0x82, 0xdd, 0x00, 0x43, 0xf3, 0x07, 0x08, 0x43, 0xa3, 0x5e, 0x28, 0x43, - 0xc3, 0xfd, 0x32, 0x43, 0x20, 0xff, 0x39, 0x43, 0xc0, 0xc6, 0x28, 0x43, 0xec, 0x59, 0x1c, 0x43, - 0xde, 0xfa, 0x12, 0x43, 0x0e, 0x75, 0xbe, 0x42, 0x1a, 0xe3, 0x64, 0x42, 0x3d, 0x9c, 0x9d, 0x42, - 0xc9, 0xd9, 0x98, 0x42, 
0x3b, 0x1a, 0xa0, 0x42, 0xd6, 0x79, 0xaf, 0x42, 0xd0, 0xfa, 0xa1, 0x42, - 0xb9, 0x9c, 0xc7, 0x42, 0xf9, 0xea, 0xe3, 0x42, 0x96, 0xd9, 0xf2, 0x42, 0x13, 0x88, 0x07, 0x43, - 0xc5, 0x59, 0xc8, 0x42, 0x70, 0xd9, 0xc1, 0x42, 0xaf, 0xd3, 0x98, 0x42, 0xe0, 0xae, 0x85, 0x42}; - -unsigned char conv2d_winograd_fp16_in[] = { - 0x3a, 0xb9, 0xc0, 0x30, 0x28, 0xbc, 0x72, 0xc1, 0x3c, 0xbe, 0xee, 0xc0, 0x1b, 0x3d, 0xf5, 0xbf, - 0x77, 0xbd, 0x05, 0xbd, 0x12, 0x2b, 0x5f, 0xb8, 0x73, 0xa2, 0xac, 0xbc, 0x19, 0xbf, 0x62, 0xc2, - 0xc5, 0xb7, 0x84, 0x3a, 0x70, 0xb4, 0xe9, 0xbd, 0xcf, 0xb9, 0x9b, 0xbe, 0xad, 0xb8, 0x4c, 0x39, - 0xaa, 0xc1, 0x50, 0xad, 0x4c, 0xbf, 0x8b, 0xb9, 0x9e, 0xbe, 0xbe, 0xb8, 0x05, 0xbf, 0x1c, 0xbc, - 0x7c, 0xbb, 0xce, 0xb3, 0x8a, 0x2c, 0xe7, 0xc1, 0xca, 0xb4, 0xde, 0x38, 0xe0, 0xbc, 0x46, 0xb9, - 0x37, 0xbf, 0xe0, 0x36, 0xef, 0xbd, 0xe9, 0xc0, 0x97, 0xc0, 0x5e, 0xbd, 0x5b, 0xbb, 0xf9, 0x2a, - 0x23, 0xb8, 0x6c, 0xbe, 0x09, 0xba, 0xd4, 0xbc, 0x39, 0xc0, 0x9d, 0xbd, 0xf8, 0xba, 0x7c, 0xb2, - 0x05, 0xc0, 0x14, 0xb5, 0xd0, 0x2e, 0x67, 0xb5, 0x20, 0xb9, 0x91, 0xb9, 0x3e, 0xa6, 0x78, 0xc0, - 0xcc, 0xbc, 0x10, 0xc1, 0x2f, 0xbd, 0x4a, 0xc1, 0x38, 0xbe, 0x2f, 0xb3, 0x01, 0xbc, 0x8d, 0x3b, - 0xcb, 0xc0, 0xa2, 0xbc, 0xb4, 0x22, 0x7c, 0xbe, 0x82, 0xbf, 0xa7, 0xbb, 0xf6, 0xbd, 0xd8, 0xbf, - 0x30, 0xb2, 0xb4, 0xb8, 0xe2, 0xbb, 0x5a, 0xbc, 0x93, 0xab, 0xb1, 0x3a, 0x08, 0xb8, 0x92, 0xbd, - 0xa7, 0xbc, 0x1a, 0xb8, 0x6f, 0xbe, 0xc8, 0xc1, 0xac, 0xbd, 0x32, 0xc0, 0x42, 0xbb, 0x60, 0x3c, - 0x3f, 0x34, 0x04, 0xbe, 0xed, 0xbe, 0x3e, 0x33, 0xbb, 0xbc, 0x4e, 0xbf, 0x48, 0xba, 0xaf, 0xbd, - 0x89, 0xb9, 0x06, 0x2b, 0x49, 0x38, 0x2d, 0xb9, 0x4f, 0xc0, 0xc7, 0xbd, 0xeb, 0x30, 0x47, 0x34, - 0x03, 0xbe, 0x47, 0xbe, 0x6d, 0xbf, 0x9a, 0xbe, 0x33, 0xbe, 0x89, 0xbf, 0x3b, 0x3a, 0xbc, 0x37, - 0xfb, 0xbd, 0xe4, 0xb9, 0x80, 0xb9, 0xd4, 0xbc, 0xe4, 0xc1, 0x63, 0xbb, 0xe6, 0x39, 0x0c, 0xc1, - 0x16, 0xbd, 0xdc, 0xaa, 0x06, 0xb5, 0x3b, 0xc0, 0xd4, 0xc4, 0x85, 0x28, 0x5c, 0xbf, 0x36, 0xbb, - 0x10, 0xbc, 
0x3b, 0xbc, 0x28, 0x35, 0xe0, 0xb6, 0x99, 0xc0, 0x6f, 0xbe, 0xae, 0xbc, 0xe2, 0xac, - 0x21, 0xc0, 0x52, 0xc0, 0x7e, 0xb6, 0x0f, 0xc0, 0x9c, 0xb7, 0x44, 0xba, 0xb0, 0xb9, 0xd9, 0xc0, - 0xb9, 0xc0, 0x9f, 0xb9, 0x99, 0xaf, 0x71, 0xbd, 0x32, 0xc0, 0x53, 0x3b, 0x19, 0xc0, 0x78, 0x3a, - 0x6f, 0xb9, 0x43, 0xb9, 0x67, 0xbb, 0x20, 0xba, 0xf3, 0xb8, 0x1a, 0xb0, 0x45, 0xc2, 0x38, 0xaf, - 0x03, 0xbe, 0xbf, 0xb9, 0xae, 0xba, 0xc9, 0xb2, 0xb3, 0xbc, 0x1f, 0xbc, 0x35, 0xbc, 0x39, 0xc0, - 0x2a, 0xbe, 0x2f, 0xbd, 0x8c, 0xc0, 0xd4, 0xc1, 0x4e, 0x38, 0x13, 0xc1, 0x4c, 0xba, 0x31, 0xb9, - 0xa7, 0xbe, 0x7e, 0xc0, 0x1e, 0xb8, 0x86, 0xb4, 0xce, 0xbc, 0x51, 0xb7, 0x9d, 0xb0, 0xd7, 0xc1, - 0x89, 0xb4, 0xc4, 0x39, 0x55, 0xbc, 0x44, 0x33, 0x84, 0x3a, 0x29, 0xb9, 0x61, 0xb5, 0x8e, 0xbd, - 0xe2, 0xb2, 0x54, 0xa1, 0x46, 0xb5, 0xb5, 0x34, 0x4b, 0xc0, 0x84, 0xb8, 0x0d, 0x38, 0x31, 0xc4, - 0xe1, 0xbe, 0x40, 0x34, 0x47, 0xc0, 0xf4, 0xba, 0x4a, 0x39, 0x92, 0x2d, 0x62, 0x38, 0x44, 0xbd, - 0x72, 0xbc, 0xf1, 0xbc, 0x01, 0xbf, 0xed, 0xbb, 0xbd, 0x40, 0xa6, 0xc1, 0x2c, 0x40, 0xec, 0x2f, - 0x5f, 0xc1, 0x96, 0xbc, 0xfc, 0xba, 0xef, 0xbc, 0x3f, 0xbd, 0x0f, 0xbc, 0x9d, 0xba, 0x2b, 0xc2, - 0xda, 0xbd, 0x9c, 0xc2, 0x39, 0xb1, 0xd3, 0xbf, 0x59, 0xc1, 0xac, 0xc0, 0x01, 0xb4, 0x32, 0xb8, - 0xac, 0xb4, 0xfa, 0xbb, 0x44, 0xbd, 0xa8, 0xb5, 0x8a, 0xbd, 0x10, 0xbb, 0x34, 0xb8, 0x0c, 0x3d, - 0xfd, 0xac, 0x69, 0xbc, 0xd8, 0xc0, 0x60, 0xbc, 0x1c, 0x33, 0x16, 0xb7, 0x58, 0xc0, 0xad, 0xb8, - 0x35, 0xc3, 0xba, 0xbe, 0xec, 0xb5, 0x95, 0xc2, 0xeb, 0xbd, 0x72, 0xb5, 0x97, 0x38, 0x24, 0x30, - 0xc8, 0xba, 0xab, 0x3a, 0x4c, 0xbf, 0xef, 0xba, 0xe9, 0xb6, 0xa2, 0xb8, 0x64, 0xbe, 0x0e, 0xc0, - 0xfb, 0xbd, 0x06, 0x32, 0xd2, 0xbe, 0x65, 0xb8, 0xd4, 0x3a, 0xa4, 0xbb, 0x0d, 0x39, 0x7a, 0xbc, - 0x9d, 0x2a, 0x92, 0xb3, 0x02, 0xc0, 0x54, 0xbe, 0x12, 0x2e, 0x84, 0xc0, 0x44, 0xc3, 0x8a, 0xbc, - 0xfb, 0xbc, 0x8b, 0xba, 0x91, 0xbc, 0x74, 0xba, 0x25, 0xab, 0xb3, 0xba, 0xd0, 0xbc, 0x8e, 0x3a, - 0xb9, 0xb8, 0x6f, 0x22, 0x92, 0xbc, 0xdc, 0xc1, 
0x58, 0xc1, 0xea, 0xba, 0xbf, 0xa4, 0xaf, 0x40, - 0x10, 0xbb, 0x93, 0xbf, 0x33, 0xb5, 0x8b, 0xbe, 0xbe, 0xc1, 0x3b, 0xb9, 0x1e, 0xbe, 0xb0, 0x37, - 0x7e, 0xc1, 0x5c, 0xb9, 0x26, 0xc0, 0x0c, 0xbd, 0x18, 0xbe, 0x37, 0x3c, 0xdb, 0x2d, 0xea, 0xb4, - 0x18, 0xbc, 0x09, 0xba, 0xee, 0xb2, 0xc0, 0xc0, 0xae, 0xbd, 0x73, 0xbc, 0x12, 0xc0, 0x69, 0x3b, - 0x14, 0xbc, 0x46, 0xc0, 0x8d, 0x38, 0xd8, 0xbb, 0x31, 0xbb, 0x88, 0xbc, 0x2e, 0x39, 0x22, 0xc0, - 0x67, 0xba, 0x14, 0x32, 0x24, 0xb7, 0x20, 0xc1, 0x72, 0xc0, 0xc8, 0x33, 0x0e, 0xbe, 0xab, 0x3a, - 0x95, 0xbd, 0x93, 0xb4, 0xf1, 0xb8, 0x72, 0xc0, 0x13, 0xc0, 0x2e, 0xc0, 0x2c, 0xbd, 0x4b, 0xc1, - 0x0a, 0x31, 0x34, 0xb3, 0x13, 0xb5, 0x4c, 0xb9, 0x45, 0xbe, 0x5d, 0xba, 0x4d, 0xbe, 0x15, 0x36, - 0xcb, 0xbe, 0x55, 0xc0, 0x53, 0xbd, 0x48, 0xb4, 0x39, 0xbc, 0xbd, 0xbc, 0x9a, 0x2d, 0x2c, 0xbc, - 0x84, 0x3b, 0xb4, 0xba, 0x32, 0xb2, 0x9b, 0xba, 0xba, 0xbc, 0x9f, 0xbc, 0xca, 0xb6, 0x32, 0xbe, - 0x36, 0x37, 0x3f, 0xbe, 0xe9, 0xbb, 0x51, 0xbc, 0x96, 0xb8, 0xb0, 0xbc, 0x4c, 0xbf, 0xad, 0xbc, - 0x03, 0xb6, 0x9d, 0xbe, 0xcc, 0xbf, 0x62, 0x29, 0x59, 0xbe, 0xaa, 0xb6, 0xcb, 0xbf, 0x1c, 0xb8, - 0x59, 0x3c, 0x8e, 0xb4, 0x2d, 0xb6, 0xb7, 0xac, 0x0b, 0xba, 0x91, 0xbe, 0x3a, 0xb5, 0xd7, 0xbe, - 0xea, 0xbe, 0x92, 0xb5, 0x40, 0xaf, 0x90, 0xb9, 0xa2, 0xbe, 0xab, 0x35, 0x22, 0xbc, 0xa0, 0xb8, - 0x10, 0x2e, 0xce, 0xbb, 0xd6, 0xbe, 0x2e, 0x32, 0x64, 0x32, 0x52, 0xb4, 0xe2, 0xc0, 0x95, 0xbd, - 0xb5, 0xc0, 0x33, 0xbe, 0x52, 0xb4, 0x5b, 0xbd, 0x77, 0x38, 0xe1, 0xbf, 0x2f, 0xbd, 0x94, 0xb9, - 0xd0, 0xb8, 0x47, 0xbc, 0xc2, 0xb5, 0xa0, 0x39, 0x0b, 0x42, 0xb1, 0xbc, 0x35, 0xbb, 0xd7, 0xb3, - 0xc1, 0xbe, 0xe7, 0xc0, 0x27, 0xb7, 0x7c, 0xb6, 0x57, 0x35, 0x93, 0xbd, 0x23, 0xb6, 0x5f, 0xbe, - 0xa7, 0xbc, 0x49, 0xb9, 0x5b, 0xb8, 0x36, 0xb6, 0xb8, 0xba, 0xc3, 0x33, 0x24, 0xb3, 0xef, 0xb8, - 0xba, 0xc0, 0x57, 0x39, 0x9c, 0xb6, 0xcf, 0xbe, 0x4c, 0xba, 0x4e, 0x34, 0x55, 0xbc, 0xaa, 0xb9, - 0xd8, 0xbe, 0xfc, 0x3a, 0xb9, 0xc1, 0x7b, 0x30, 0xb2, 0xbc, 0x0e, 0xa9, 0xb0, 0xb7, 
0x31, 0xbc, - 0x13, 0xb1, 0x15, 0x3a, 0xbf, 0x32, 0x2f, 0x39, 0xb9, 0xc2, 0xb9, 0xbf, 0x04, 0xba, 0xf7, 0xbd, - 0x61, 0x37, 0x99, 0xbe, 0x8d, 0xb8, 0x5c, 0xb5, 0xc3, 0xc2, 0xb8, 0x32, 0xc5, 0xb4, 0xb1, 0xb6, - 0xe2, 0x2e, 0xb9, 0xbb, 0x95, 0x39, 0xc9, 0xbf, 0x58, 0xb4, 0xa3, 0xb9, 0xeb, 0xb5, 0x09, 0xc0, - 0x9f, 0xc1, 0x10, 0xba, 0x28, 0xbf, 0x09, 0xc0, 0x64, 0xb9, 0xd7, 0x3d, 0xad, 0xbc, 0xf6, 0xb8, - 0xa5, 0xba, 0x16, 0xbe, 0xec, 0x3c, 0xf8, 0xbb, 0x42, 0xbe, 0x90, 0xb8, 0x89, 0xb8, 0x91, 0xb8, - 0xa5, 0xbd, 0x63, 0xbb, 0xe8, 0xb3, 0x22, 0xb8, 0x8c, 0xba, 0x17, 0xbd, 0xc4, 0xba, 0x84, 0xbc, - 0x2f, 0xbf, 0xb2, 0xbc, 0x2c, 0xb6, 0xfe, 0xbc, 0x0b, 0xb9, 0xb7, 0xb3, 0x8f, 0xbe, 0xe9, 0xbd, - 0xe7, 0xbe, 0x78, 0xb8, 0x3c, 0x3d, 0xf8, 0xba, 0x7c, 0xb0, 0x3d, 0xbd, 0x62, 0xc0, 0xdf, 0xbc, - 0xc7, 0xb8, 0x5c, 0xc1, 0x3b, 0xbe, 0x9d, 0xb8, 0x63, 0xba, 0x26, 0xbb, 0x3c, 0xbf, 0x24, 0xbf, - 0x83, 0xbd, 0xb3, 0xc0, 0x89, 0x34, 0xf5, 0xb0, 0xf1, 0x32, 0xa0, 0xbb, 0xaf, 0xbf, 0x31, 0xbe, - 0xe3, 0x2f, 0x56, 0x36, 0x3d, 0xb4, 0x7a, 0x9b, 0x77, 0xbd, 0x9f, 0x31, 0xf1, 0xb8, 0xb3, 0x34, - 0xc4, 0xbe, 0xbd, 0x2d, 0xfc, 0xbb, 0xbb, 0xba, 0xc5, 0xbc, 0xa4, 0xb5, 0xd7, 0xb9, 0x1b, 0xbc, - 0x8b, 0xbd, 0x0e, 0xb8, 0x18, 0xbe, 0x6b, 0xb6, 0xee, 0x2d, 0xd2, 0xb1, 0xbf, 0xba, 0x36, 0xbf, - 0xc3, 0xba, 0xa7, 0x3b, 0x9f, 0xbd, 0x91, 0xbf, 0x3e, 0x2f, 0x55, 0xb9, 0x24, 0xbe, 0xb4, 0xbe, - 0x2d, 0x32, 0x42, 0xbe, 0x7a, 0x3d, 0x5b, 0xbf, 0x97, 0xc0, 0x69, 0xbc, 0xf9, 0xb2, 0xd5, 0xbf, - 0xe8, 0x39, 0xb4, 0xb3, 0xbb, 0xbe, 0xc9, 0xb7, 0x62, 0xbc, 0xd2, 0xbc, 0x1c, 0x38, 0xac, 0x3b, - 0xd2, 0x34, 0x58, 0xaf, 0x8c, 0xbc, 0xda, 0xbf, 0xb6, 0xb1, 0x21, 0xbf, 0x77, 0xb9, 0x70, 0xbe, - 0xbe, 0x38, 0xc3, 0x35, 0xe2, 0xbc, 0xa4, 0xb8, 0x7c, 0xb9, 0xad, 0xbc, 0x50, 0xc0, 0xcd, 0xba, - 0x3c, 0x35, 0x4e, 0xbf, 0x3f, 0xc0, 0xd2, 0xbe, 0xaa, 0xbc, 0x2e, 0xb9, 0x57, 0xb9, 0x04, 0xb3, - 0x47, 0xc0, 0x46, 0x30, 0xa6, 0x3e, 0x52, 0x39, 0x13, 0x3e, 0x4f, 0x36, 0x99, 0xbd, 0xf9, 0xbc, - 0x61, 0x38, 0x8a, 0xbc, 
0xf6, 0xbb, 0x07, 0xaa, 0x27, 0xb3, 0x26, 0xbe, 0xfa, 0xbd, 0x8a, 0xbb, - 0xb1, 0xb0, 0x44, 0xc3, 0x71, 0xb6, 0x34, 0xc0, 0xfe, 0xbd, 0x23, 0xc0, 0xde, 0x2e, 0x68, 0xc0, - 0x74, 0xbd, 0xeb, 0xb2, 0x9e, 0xbb, 0xd7, 0xb3, 0x44, 0xbe, 0x8b, 0xc1, 0x35, 0xba, 0xfd, 0x30, - 0xc0, 0xbd, 0x7f, 0xc0, 0xb7, 0xc1, 0xb7, 0xbe, 0x25, 0xb9, 0xd0, 0xc0, 0xcb, 0xbd, 0x41, 0xc0, - 0x2e, 0x3b, 0x01, 0xbe, 0x72, 0xbc, 0xf4, 0x2f, 0x56, 0xb2, 0xc9, 0xbe, 0xfa, 0x3d, 0xc6, 0xba, - 0x33, 0xc0, 0xdf, 0xaa, 0xf8, 0xb9, 0xe0, 0xc0, 0x7e, 0xbc, 0x5a, 0x3a, 0xbd, 0xc0, 0x06, 0xbe, - 0xe0, 0xbe, 0x6b, 0xbb, 0x2a, 0xc0, 0xee, 0xbe, 0x88, 0xb2, 0x7c, 0xb2, 0xb7, 0xbe, 0xea, 0xc0, - 0x2d, 0xb3, 0x97, 0xb9, 0xf1, 0xb9, 0x5c, 0x28, 0xc7, 0xbc, 0x4d, 0xbd, 0x63, 0xb5, 0x51, 0xb1, - 0x6b, 0xbf, 0xf9, 0xbf, 0x36, 0xbb, 0xad, 0xab, 0x8d, 0xbd, 0xe5, 0xbc, 0x9e, 0xbd, 0x14, 0xc0, - 0x05, 0xba, 0xbe, 0xbf, 0xfe, 0xad, 0xfd, 0xbe, 0x3e, 0x2f, 0x03, 0x37, 0x78, 0x38, 0xc6, 0xb9, - 0xd3, 0x35, 0x6f, 0xbe, 0x55, 0xbb, 0x61, 0xbe, 0xa8, 0xb3, 0xdf, 0xbf, 0x63, 0xbd, 0x28, 0xbb, - 0xda, 0xbe, 0xf2, 0xbc, 0x15, 0xa1, 0xfd, 0xb8, 0x0d, 0xbe, 0x0e, 0x2e, 0x91, 0x38, 0x75, 0xbc, - 0x64, 0xb2, 0x32, 0xbe, 0x10, 0xc4, 0x6b, 0xbe, 0xa9, 0x39, 0x18, 0xbe, 0x26, 0xaf, 0xc5, 0xb4, - 0x58, 0xc2, 0xe6, 0x3c, 0xaa, 0xbe, 0x15, 0xbe, 0xab, 0xbe, 0xda, 0xbe, 0x95, 0xbc, 0x38, 0xc0, - 0x27, 0xc0, 0x6d, 0xbc, 0x27, 0xbb, 0x59, 0xba, 0x7c, 0xb9, 0xd1, 0xba, 0x8a, 0xbf, 0xa5, 0x40, - 0x07, 0x3c, 0x53, 0xbf, 0x9f, 0xc2, 0x6a, 0x39, 0x6e, 0xc0, 0x81, 0xbf, 0x73, 0xbd, 0x37, 0xbf, - 0x50, 0x24, 0xfc, 0xbe, 0x1f, 0xc1, 0x07, 0x32, 0x42, 0xb0, 0xa8, 0x39, 0x73, 0x39, 0x07, 0xb9, - 0xce, 0xc0, 0xb4, 0xbc, 0xfd, 0xbd, 0xa6, 0x30, 0xb7, 0xbf, 0xf7, 0xbb, 0x64, 0xc1, 0x6f, 0x39, - 0xf2, 0xbe, 0x9a, 0x3a, 0xc5, 0xbe, 0x8d, 0xb4, 0xd3, 0x35, 0x67, 0xbf, 0x40, 0xb9, 0xcf, 0xbc, - 0x7c, 0xbd, 0x2b, 0x32, 0x4c, 0xbe, 0xaa, 0xbe, 0xea, 0xc0, 0x9c, 0xb2, 0xa6, 0x34, 0x1b, 0x9b, - 0xde, 0xbc, 0x30, 0xbc, 0x52, 0xbc, 0x7b, 0xbc, 0x11, 0xc0, 
0x03, 0xbb, 0x65, 0xbb, 0x8e, 0x3a, - 0x85, 0xba, 0x3f, 0x41, 0x84, 0xbd, 0xe0, 0xbf, 0x73, 0x35, 0xce, 0xb9, 0xac, 0x33, 0xcb, 0x3a, - 0x28, 0xb5, 0xd9, 0xbb, 0x7e, 0xbc, 0xe9, 0xbf, 0x33, 0xbc, 0x3c, 0xbf, 0x04, 0x36, 0xd4, 0xa0, - 0x76, 0xbe, 0x3c, 0x2d, 0x1e, 0xc0, 0x28, 0xbe, 0xcb, 0xc0, 0x41, 0x36, 0xcd, 0xba, 0x0d, 0xc0, - 0x6e, 0xc0, 0x58, 0xb8, 0x2b, 0xc0, 0x4d, 0xc4, 0x98, 0xbd, 0xa6, 0xbd, 0x16, 0x38, 0x6d, 0xb8, - 0x07, 0xbd, 0xd5, 0x3d, 0x2f, 0xbd, 0x0a, 0xba, 0x23, 0xba, 0x11, 0xb5, 0xf9, 0xbd, 0x67, 0xb6, - 0x60, 0xbc, 0x0e, 0xc0, 0xa9, 0xbc, 0x13, 0xba, 0xd1, 0xb4, 0xc4, 0xbe, 0xd1, 0xb1, 0x0e, 0xc0, - 0xa5, 0x2d, 0xd6, 0xb4, 0x68, 0xbb, 0xa3, 0xb9, 0x3d, 0xbd, 0x31, 0xbc, 0x11, 0xb4, 0xba, 0xb7, - 0xf2, 0x37, 0x91, 0xb6, 0x20, 0xbf, 0x0b, 0xc0, 0xd4, 0xbb, 0x0e, 0xb8, 0xad, 0xc1, 0x59, 0xbd, - 0xf9, 0xb7, 0x45, 0xc0, 0xe2, 0xba, 0x8f, 0xbf, 0xd1, 0x3a, 0xe2, 0xb9, 0x5b, 0xbc, 0x4d, 0xbe, - 0x75, 0xbd, 0x2e, 0xbc, 0xa2, 0x30, 0x4f, 0x28, 0xe3, 0xbf, 0x06, 0xb9, 0xd6, 0xbf, 0x18, 0xb8, - 0x2e, 0xc0, 0xc2, 0x38, 0x42, 0xb7, 0x08, 0xc1, 0xb3, 0xb8, 0xa7, 0xba, 0xc4, 0xb8, 0x31, 0xa6, - 0xbe, 0xc1, 0x79, 0xb4, 0x52, 0xb0, 0x43, 0xbb, 0x76, 0xba, 0x08, 0xba, 0x05, 0xc1, 0xfb, 0xc2, - 0x25, 0xc0, 0x9b, 0x3b, 0x49, 0x34, 0xda, 0x2d, 0xfd, 0xb9, 0xa8, 0x32, 0x05, 0x34, 0x59, 0xb8, - 0x5b, 0x33, 0x8f, 0xba, 0xd4, 0xb4, 0x60, 0xbd, 0x28, 0xc2, 0x31, 0xbb, 0xdf, 0xc0, 0x1c, 0xbf, - 0x23, 0xb6, 0x3a, 0xbd, 0x76, 0xb9, 0x43, 0xb9, 0xe8, 0xb7, 0x84, 0xbf, 0x8f, 0x34, 0xbf, 0xbb, - 0x4c, 0xc0, 0xfb, 0x3c, 0x6e, 0xbf, 0x82, 0xbd, 0xe1, 0xbd, 0x6d, 0xc1, 0x08, 0xbe, 0x01, 0xbc, - 0x28, 0xbc, 0xf4, 0xba, 0x77, 0xba, 0xa0, 0xc1, 0x64, 0xb8, 0xcc, 0xbc, 0x74, 0xc2, 0xed, 0xaf, - 0x26, 0xc0, 0x21, 0xbe, 0x07, 0xbd, 0x7b, 0xc1, 0xba, 0xba, 0x38, 0x39, 0xf7, 0xbc, 0xc1, 0xb4, - 0xc6, 0xc0, 0x92, 0xc0, 0x30, 0xbb, 0xdf, 0xbe, 0xcb, 0xb8, 0x91, 0xbd, 0x52, 0x3b, 0xa9, 0xb9, - 0x43, 0xba, 0xbd, 0xb8, 0xc3, 0xbd, 0x47, 0xbb, 0x93, 0xaa, 0xc8, 0xc1, 0xf6, 0x38, 0x62, 0xbb, - 
0xba, 0xb6, 0xb8, 0xb1, 0xe8, 0xb8, 0xb4, 0xc0, 0x61, 0xb1, 0x6b, 0xba, 0xc3, 0xbe, 0x1a, 0xbb, - 0x81, 0xc0, 0x21, 0xbd, 0x0d, 0xc2, 0x49, 0xac, 0x80, 0xbe, 0xc0, 0x34, 0xe7, 0xac, 0x09, 0xb1, - 0xc0, 0xb5, 0x17, 0xbd, 0x45, 0xb9, 0xba, 0x35, 0x6f, 0xbd, 0x91, 0xbd, 0x01, 0xbf, 0xca, 0xb9, - 0x2c, 0xad, 0xd7, 0x3d, 0x1a, 0xbb, 0x63, 0xbc, 0x1b, 0xc2, 0x46, 0xb0, 0xe2, 0xba, 0x06, 0xbc, - 0x2e, 0xba, 0xc0, 0xb8, 0xeb, 0xbc, 0xed, 0xbc, 0xe5, 0xb9, 0x47, 0xba, 0xd0, 0x37, 0xf7, 0xbc, - 0x72, 0xbe, 0x00, 0xbd, 0xdb, 0x2e, 0xbc, 0xb8, 0x5b, 0xbe, 0x3c, 0xbd, 0x69, 0xbe, 0x5d, 0x34, - 0xd2, 0xbf, 0x4f, 0xbf, 0xb2, 0xb9, 0x50, 0xbe, 0xfc, 0xbc, 0x5c, 0xb9, 0x9d, 0xc0, 0xc9, 0xbf, - 0x38, 0xc1, 0xfa, 0xc0, 0xa5, 0x3c, 0x67, 0xbc, 0xc6, 0xc0, 0x5a, 0x32, 0x92, 0xbd, 0x10, 0xc1, - 0x79, 0xc0, 0xe3, 0xbf, 0x0d, 0xba, 0xb0, 0xc1, 0x5f, 0xba, 0xb1, 0xbc, 0x42, 0xbc, 0x4e, 0x3f, - 0x4b, 0xb8, 0x77, 0x2f, 0x87, 0xc1, 0x89, 0xc0, 0xf9, 0xc0, 0x12, 0xbe, 0x19, 0xbe, 0x75, 0xb6, - 0xe1, 0xc2, 0xad, 0xbb, 0x3e, 0xbc, 0x23, 0xba, 0xcd, 0xbc, 0xe1, 0x37, 0x7c, 0xb9, 0xa8, 0xb1, - 0x07, 0xb4, 0xe9, 0x38, 0x12, 0xb7, 0x06, 0xbd, 0x2d, 0xb0, 0x4e, 0xc1, 0xc6, 0xc0, 0x9a, 0x39, - 0x49, 0x3c, 0x00, 0xbe, 0x24, 0xb5, 0x86, 0xbd, 0x9f, 0xb4, 0x64, 0xbf, 0xf7, 0xba, 0x5f, 0xbe, - 0x31, 0x36, 0x64, 0xbe, 0x41, 0x35, 0x35, 0xc1, 0x81, 0xbf, 0x7f, 0xbf, 0xb2, 0xbe, 0xf9, 0xbd, - 0x65, 0xc2, 0x09, 0xba, 0x20, 0x30, 0x10, 0xbd, 0xf2, 0xc1, 0x64, 0xc0, 0xab, 0xbc, 0x43, 0xc0, - 0xd1, 0xb8, 0xd0, 0xbe, 0x09, 0xb9, 0xac, 0xbd, 0x27, 0xb8, 0x14, 0xb8, 0x3b, 0xc0, 0x26, 0xb7, - 0x57, 0xbd, 0x3a, 0xbb, 0x20, 0x3b, 0xe7, 0xb9, 0xb3, 0x36, 0xeb, 0xbd, 0x4a, 0xb8, 0x6a, 0x34, - 0xae, 0x3d, 0xc4, 0xb6, 0x78, 0xbf, 0xa6, 0xbe, 0x3e, 0x2c, 0xb3, 0x3a, 0xcd, 0xbb, 0x71, 0xbe, - 0x69, 0xbc, 0x5a, 0x27, 0x90, 0xbd, 0x65, 0xbf, 0x9d, 0xbc, 0x76, 0xad, 0x28, 0xb7, 0x54, 0xbd, - 0xe7, 0xbe, 0x68, 0xb6, 0xe8, 0xaa, 0x46, 0xbe, 0xc4, 0xbd, 0x1e, 0xc0, 0x15, 0x2a, 0x7c, 0xba, - 0xf9, 0xbd, 0x6b, 0xbd, 0x55, 0x3b, 
0x07, 0xbd, 0x07, 0xc0, 0x85, 0xb8, 0xd5, 0xb4, 0x30, 0xc0, - 0x1c, 0x27, 0x27, 0xbb, 0xef, 0xbd, 0x37, 0xbb, 0x65, 0xb8, 0x76, 0x33, 0x9b, 0xbc, 0x89, 0xbc, - 0x64, 0xc2, 0x06, 0xba, 0x39, 0x3c, 0xd6, 0xb9, 0x35, 0xc0, 0xb9, 0xbf, 0xcf, 0xb6, 0x4d, 0xbf, - 0x72, 0xbb, 0x85, 0xbd, 0x34, 0xb0, 0xd1, 0xbe, 0x5c, 0xb9, 0x07, 0x35, 0x03, 0xb9, 0xea, 0xbc, - 0x00, 0xc0, 0x0d, 0xc1, 0x2f, 0xbc, 0x1b, 0xc0, 0x1f, 0xbf, 0x72, 0xbb, 0x83, 0xbc, 0x0e, 0xba, - 0xb0, 0xad, 0xd9, 0xb6, 0xc5, 0xbd, 0x80, 0xbf, 0xc6, 0xbc, 0x54, 0xb9, 0x8a, 0xbc, 0x95, 0xbc, - 0x67, 0xbe, 0x16, 0xa7, 0x9a, 0xbf, 0xc2, 0x33, 0xa6, 0xbd, 0xa3, 0xb9, 0x08, 0xc0, 0xe6, 0xbb, - 0xc5, 0x37, 0x12, 0xbc, 0xd8, 0xbf, 0x92, 0xbd, 0x71, 0xc0, 0xa7, 0x38, 0x43, 0xb8, 0x27, 0xbd, - 0x55, 0xbd, 0x21, 0xb8, 0xe8, 0xa9, 0x9e, 0x3d, 0x87, 0xbe, 0x43, 0xc0, 0xa8, 0xba, 0x66, 0xb2, - 0x0d, 0xb8, 0xa8, 0xb2, 0x50, 0xb4, 0x3b, 0xbe, 0xc0, 0xbe, 0xf4, 0x32, 0xda, 0xbd, 0x71, 0xbc, - 0x10, 0xbd, 0xc3, 0xb6, 0x0c, 0xbf, 0xb1, 0xbc, 0xbe, 0xbd, 0xf9, 0xba, 0xe5, 0x34, 0xfa, 0xbc, - 0x1e, 0xb9, 0xec, 0xb7, 0x72, 0xb8, 0x96, 0xbf, 0xa0, 0xbc, 0xea, 0xac, 0x36, 0x2c, 0xf8, 0xc0, - 0x5f, 0x38, 0xae, 0xc0, 0x80, 0x3c, 0xab, 0xc1, 0x3f, 0xbf, 0xde, 0xc1, 0x12, 0xb7, 0x85, 0xc0, - 0xc2, 0xbf, 0xa4, 0xba, 0x4d, 0xbd, 0x2e, 0x3a, 0x26, 0x30, 0x4e, 0xbe, 0x09, 0x38, 0x2d, 0xb9, - 0xa6, 0xbc, 0xe7, 0x38, 0x6c, 0xc0, 0x9e, 0x36, 0xd7, 0xbb, 0x86, 0xc0, 0xa1, 0xbd, 0xb9, 0xba, - 0x6c, 0xa4, 0x9b, 0xbe, 0x94, 0xbc, 0x91, 0xaa, 0x98, 0x3a, 0xb5, 0x3a, 0x1a, 0xc1, 0x36, 0xc2, - 0x28, 0xbd, 0x5d, 0xbc, 0x97, 0xbc, 0x2e, 0xbc, 0x55, 0xc0, 0x94, 0xbc, 0xa5, 0xbc, 0xcb, 0xa1, - 0x25, 0x9d, 0xe3, 0xbd, 0x19, 0xbf, 0x89, 0x1b, 0x9b, 0xbf, 0x9d, 0xbf, 0x59, 0xbc, 0xeb, 0xb2, - 0x4f, 0xb8, 0x6b, 0xbc, 0x20, 0xc2, 0xb6, 0xb4, 0xef, 0xc0, 0x72, 0xbe, 0xed, 0xba, 0xbd, 0xbe, - 0x5b, 0x32, 0x1a, 0xbd, 0x9c, 0xc2, 0xbd, 0xba, 0x19, 0xc0, 0x94, 0xc0, 0x75, 0x3b, 0x5f, 0xbe, - 0x8c, 0xbe, 0x8d, 0x32, 0xf2, 0xbd, 0xd1, 0xc0, 0xa8, 0xbd, 0xf7, 0x2e, 
0xad, 0x36, 0x9c, 0xbd, - 0x75, 0x3c, 0x7d, 0xb8, 0x9e, 0xbe, 0xde, 0x29, 0x3d, 0xbf, 0x29, 0xc0, 0x47, 0xbd, 0x39, 0xbf, - 0x71, 0xbd, 0x32, 0xc1, 0x25, 0xb8, 0xb2, 0xb5, 0x7e, 0xae, 0x7c, 0x38, 0x5f, 0xbc, 0xa0, 0xb6, - 0xc9, 0xc0, 0xf2, 0xbc, 0x74, 0xbc, 0x2f, 0x37, 0xa0, 0xb2, 0xfc, 0xbc, 0x09, 0xc2, 0xc6, 0x35, - 0x45, 0xc1, 0x62, 0xc1, 0x18, 0xc4, 0x25, 0xbb, 0x74, 0xba, 0x83, 0xb9, 0x6b, 0x36, 0x7b, 0xbc, - 0xa2, 0xb0, 0xf8, 0xbe, 0x20, 0xbe, 0xfc, 0xba, 0x35, 0xbe, 0x51, 0xbe, 0xbf, 0xbd, 0x4d, 0x3d, - 0x15, 0xb4, 0xd8, 0xbd, 0x37, 0xc0, 0x93, 0xbc, 0x9d, 0xbc, 0xdd, 0xbd, 0xd5, 0xc0, 0x1c, 0xbe, - 0x09, 0xc1, 0x97, 0xc0, 0xe9, 0xba, 0x22, 0xba, 0xc6, 0xbe, 0x27, 0xbe, 0x38, 0xb9, 0x99, 0xb6, - 0xca, 0x38, 0x1d, 0xc1, 0xdc, 0xb4, 0x9c, 0xbe, 0xeb, 0xbe, 0x63, 0xba, 0x9f, 0xbc, 0xef, 0xc1, - 0xa8, 0xae, 0x9d, 0xbc, 0x21, 0x31, 0x5e, 0xbc, 0x34, 0xc1, 0x3f, 0xbd, 0x2b, 0xb0, 0x4c, 0xba, - 0x55, 0xbe, 0x83, 0xc0, 0x6f, 0xc1, 0x92, 0xb6, 0x99, 0x35, 0x94, 0x35, 0x0a, 0xb2, 0x11, 0xbf, - 0x0f, 0xa1, 0xb8, 0x1e, 0x69, 0xbe, 0x49, 0xba, 0xd2, 0xbd, 0xa4, 0x37, 0xb8, 0xb8, 0x1b, 0xb9, - 0x37, 0xbc, 0x7c, 0xbe, 0xba, 0x2c, 0x1b, 0xc3, 0x2a, 0x32, 0x25, 0xbb, 0x35, 0xc1, 0x44, 0xbe, - 0x91, 0xba, 0x39, 0xc0, 0xee, 0x34, 0xd7, 0xc2, 0xd4, 0x94, 0x2c, 0xbe, 0xd3, 0xc0, 0x6a, 0xb1, - 0x21, 0x34, 0x65, 0xb9, 0x78, 0x35, 0x30, 0x3d, 0xdc, 0xbe, 0x71, 0xbf, 0xa2, 0xb9, 0x02, 0xbd, - 0x67, 0xbc, 0x06, 0xc0, 0x49, 0xaa, 0x7c, 0xbd, 0xc7, 0xb0, 0xdc, 0xbf, 0x9c, 0xb8, 0x3c, 0xb9, - 0x35, 0xbc, 0xf7, 0xb5, 0xfa, 0xbe, 0x0c, 0x34, 0x3d, 0xbd, 0x68, 0xbf, 0xba, 0xb9, 0x20, 0xb7, - 0x6e, 0xbf, 0x0b, 0xad, 0x5a, 0xbf, 0xf9, 0xbd, 0xe8, 0xbc, 0x77, 0xc0, 0x30, 0xbe, 0x0b, 0xbf, - 0xeb, 0xae, 0x1e, 0xb8, 0xd6, 0xc1, 0x06, 0xb9, 0xf2, 0xbe, 0x0c, 0xbc, 0x65, 0xbc, 0x95, 0xbc, - 0xb5, 0xba, 0x7d, 0xb9, 0x76, 0xb8, 0x95, 0x34, 0x88, 0xbe, 0x53, 0xbe, 0x49, 0xbe, 0xd8, 0xbd, - 0xa4, 0xb9, 0xf2, 0xb8, 0x68, 0x21, 0x39, 0xc2, 0x88, 0xc0, 0x8d, 0xb8, 0x90, 0x37, 0xa2, 0xb5, - 0xce, 0xba, 
0xa5, 0xbd, 0x27, 0xc0, 0x5a, 0xc0, 0x4a, 0xbd, 0x0c, 0xbf, 0x5c, 0xc0, 0x37, 0xb6, - 0x05, 0xc2, 0x58, 0xc1, 0xf5, 0xc1, 0xb4, 0xbb, 0xed, 0xb3, 0x5e, 0xbe, 0x17, 0xb6, 0xce, 0xb9, - 0xfb, 0xb6, 0x9f, 0xbc, 0xb6, 0xbc, 0xe1, 0x30, 0x82, 0xc0, 0x1d, 0xb9, 0xf0, 0xb9, 0x1e, 0xbd, - 0x11, 0xb2, 0x3e, 0x3b, 0x14, 0xb9, 0x93, 0xbd, 0xdf, 0xbd, 0x81, 0xbd, 0x6b, 0xbb, 0xbd, 0xbe, - 0xb9, 0xa5, 0x06, 0xbb, 0x43, 0xb4, 0x08, 0xbe, 0x5c, 0x34, 0x57, 0xc1, 0x2e, 0xc1, 0xb3, 0xb9, - 0xa3, 0xbc, 0xd7, 0xb8, 0x14, 0xc0, 0xff, 0xba, 0x4c, 0xc1, 0x47, 0xbd, 0xe3, 0x35, 0x6d, 0xbc, - 0xf5, 0xbd, 0x0f, 0xbd, 0x2d, 0x21, 0x9a, 0x36, 0x8d, 0xbf, 0x0b, 0xbe, 0x80, 0xb8, 0xec, 0xb8, - 0xba, 0xbf, 0x45, 0xc0, 0xd3, 0xb6, 0xfc, 0xbc, 0xff, 0xba, 0x2c, 0xc3, 0x5e, 0xb9, 0x56, 0xbd, - 0x75, 0xbc, 0x27, 0x34, 0x08, 0xbd, 0x1b, 0xbd, 0xf4, 0xb8, 0x43, 0xb9, 0x95, 0xb6, 0x79, 0xbf, - 0xbc, 0xba, 0x50, 0xbd, 0xc6, 0xbe, 0x79, 0xb7, 0xe9, 0xbc, 0xe1, 0xb8, 0x65, 0x2a, 0x07, 0xb1, - 0x66, 0x39, 0xbc, 0x38, 0xd7, 0xbe, 0xdc, 0xb8, 0x0e, 0x3a, 0x23, 0xbe, 0x8e, 0xbc, 0xa3, 0xbb, - 0x41, 0xbb, 0x56, 0x29, 0x58, 0x2b, 0xef, 0xbe, 0x69, 0xc0, 0xbd, 0xbd, 0x8c, 0xb5, 0x63, 0xbe, - 0xb1, 0xbf, 0x93, 0xbe, 0xf3, 0xb8, 0xbe, 0x36, 0x4b, 0xbd, 0x4f, 0x38, 0xb6, 0xbe, 0xe9, 0xbe, - 0xbb, 0xba, 0x5d, 0x3c, 0xdb, 0x25, 0x3e, 0xc1, 0x65, 0xbc, 0x41, 0xbd, 0x22, 0xbe, 0xfa, 0x31, - 0x32, 0xbd, 0x4e, 0x38, 0xb7, 0xbe, 0x3f, 0xbc, 0x81, 0xad, 0x82, 0xbb, 0x22, 0xba, 0xe2, 0xb3, - 0x39, 0xbc, 0x7d, 0xb4, 0x3e, 0xc0, 0x2b, 0xbc, 0xaf, 0xb9, 0x91, 0xbd, 0x51, 0xc0, 0x27, 0xc1}; -unsigned char conv2d_winograd_fp16_ker[] = { - 0x28, 0xbe, 0x1c, 0xc0, 0x38, 0xbe, 0xde, 0xbb, 0xad, 0xbf, 0x2a, 0xc1, 0x53, 0xc0, 0x29, 0xbd, - 0xea, 0xc0, 0xd5, 0xbc, 0x63, 0xba, 0x39, 0xbf, 0xe7, 0xc1, 0x9f, 0xbc, 0x45, 0xc4, 0x97, 0xc1, - 0xe0, 0xb9, 0x52, 0xc1, 0x1a, 0xc1, 0xa2, 0xc0, 0x6d, 0xc2, 0xb0, 0xbf, 0x7f, 0xc0, 0x4f, 0xb6, - 0x5d, 0xbc, 0x61, 0xbc, 0x0e, 0xbf, 0x43, 0xc2, 0xe8, 0xc0, 0x83, 0xc1, 0x02, 0xbf, 0x01, 0xba, - 
0xeb, 0xc0, 0x83, 0xc4, 0x89, 0xbc, 0x10, 0xc3, 0xc8, 0xc0, 0xd1, 0xc0, 0x06, 0xb9, 0x1d, 0xc3, - 0x65, 0xc2, 0x91, 0xc1, 0xdc, 0xbe, 0x79, 0xbd, 0x29, 0xbe, 0x91, 0xc0, 0xd4, 0xbf, 0x98, 0xc1, - 0x4b, 0xc1, 0x68, 0xc4, 0x55, 0xc3, 0x9b, 0xbd, 0x2a, 0xc2, 0x66, 0xc2, 0x42, 0xb9, 0x59, 0xbe, - 0xe0, 0xc0, 0xa1, 0xbc, 0xe8, 0xc0, 0xbc, 0xbf, 0xd1, 0xc3, 0x11, 0xbe, 0xf2, 0xc1, 0xe8, 0xbb, - 0x0c, 0xb0, 0x63, 0xc3, 0x9e, 0xc0, 0xf5, 0xba, 0x8f, 0xc1, 0x1d, 0xbf, 0x05, 0xc0, 0x0e, 0xc2, - 0x50, 0xbf, 0xef, 0xbf, 0x37, 0xc0, 0x0e, 0xbc, 0x87, 0xbd, 0x72, 0xbe, 0xab, 0xb8, 0xbd, 0xc2, - 0xed, 0xbf, 0x5f, 0xbd, 0x2e, 0xc0, 0x0e, 0xbd, 0xfc, 0xbe, 0x93, 0xc1, 0x53, 0xc1, 0x7e, 0xbc, - 0x35, 0xc0, 0x38, 0xc1, 0xbb, 0xaf, 0xba, 0xbe, 0xde, 0xc1, 0xa4, 0xbc, 0x33, 0xbe, 0xcd, 0xc1, - 0x08, 0xbb, 0x0c, 0xc0, 0x31, 0xc0, 0xad, 0xbd, 0x64, 0xc0, 0x4e, 0xbf, 0x91, 0xb9, 0xd5, 0xc1, - 0x95, 0xc0, 0x7d, 0xbf, 0x1c, 0xc2, 0x83, 0xbe, 0x3f, 0xc0, 0xda, 0xbd, 0x7a, 0xbe, 0x07, 0xc2, - 0xa1, 0xbe, 0x45, 0xb9, 0x32, 0xae, 0x44, 0xc0, 0xde, 0xc1, 0xdf, 0xbd, 0x7f, 0xbe, 0xa6, 0xc3, - 0x65, 0xc3, 0x4c, 0xbc, 0xbd, 0xbd, 0xea, 0xc1, 0x80, 0xc1, 0x60, 0xc0, 0x84, 0xc0, 0x9d, 0xc1, - 0x74, 0xbd, 0x75, 0xbe, 0x87, 0xbe, 0xf7, 0xbd, 0x43, 0xbf, 0xfa, 0xc1, 0x2a, 0xc2, 0x84, 0xbb, - 0x2f, 0xbf, 0x37, 0xc1, 0xb6, 0xba, 0x91, 0xc1, 0xc5, 0xc1, 0xee, 0xc2, 0x38, 0xc0, 0xe2, 0xbe, - 0x4b, 0xbe, 0x4c, 0xbd, 0x5e, 0xbe, 0x61, 0xc2, 0x9a, 0xad, 0xbf, 0xbe, 0x51, 0xba, 0x3b, 0xc1, - 0x89, 0xc1, 0xaa, 0xbf, 0x01, 0xbd, 0x3f, 0xc2, 0x05, 0xbe, 0xcd, 0xbc, 0xc3, 0xc0, 0x3d, 0xc2, - 0xab, 0xc3, 0x1c, 0xbe, 0x49, 0xc1, 0x0e, 0xc0, 0x20, 0xc1, 0x88, 0xc2, 0xfc, 0xbf, 0x3f, 0xb9, - 0xf9, 0xb4, 0xc2, 0xb8, 0x94, 0xbe, 0xe1, 0xbf, 0x36, 0xbd, 0x24, 0xc2, 0x84, 0xc1, 0xc7, 0xc1, - 0x1f, 0x33, 0x2a, 0xbf, 0x4b, 0xc0, 0xa3, 0xbf, 0x57, 0xba, 0xbc, 0xba, 0x4f, 0xc0, 0xbe, 0x33, - 0x3d, 0xc3, 0x77, 0xc0, 0x65, 0xb4, 0x18, 0xbd, 0x51, 0xc1, 0xdc, 0xbe, 0xc8, 0xb9, 0x4c, 0xc0, - 0x16, 0x35, 0xbe, 0xbc, 0x31, 0xc1, 
0xe4, 0xbd, 0x57, 0xbc, 0x49, 0xc1, 0xd4, 0xbd, 0xeb, 0xba, - 0x02, 0xc1, 0xa8, 0xbb, 0xcd, 0xc0, 0x7b, 0xc0, 0x21, 0xb2, 0x61, 0xc0, 0x8a, 0xc1, 0xe4, 0xbe, - 0x0f, 0xc2, 0xaf, 0xc0, 0x70, 0xc3, 0xd2, 0xbc, 0x67, 0xbd, 0xd9, 0xc1, 0x4e, 0xc2, 0x6e, 0xc1, - 0x1e, 0xc4, 0x09, 0xc3, 0x42, 0xbf, 0x50, 0xc1, 0x52, 0xbd, 0x77, 0xc3, 0x1d, 0xc0, 0x31, 0xbb, - 0xd2, 0xbe, 0x66, 0xc3, 0x9b, 0xbc, 0x4d, 0xbf, 0x66, 0xb6, 0x02, 0xc2, 0xbe, 0xc3, 0xd1, 0x28, - 0xef, 0xc2, 0x11, 0xbd, 0x9d, 0xc2, 0xd9, 0xbd, 0xb0, 0xbe, 0xd9, 0xbf, 0x49, 0xc2, 0x71, 0x9e, - 0x5b, 0xb5, 0x59, 0xc2, 0xf6, 0xbd, 0x4a, 0xb5, 0x12, 0xbd, 0x19, 0xbe, 0x73, 0xc3, 0xe5, 0xbc, - 0xec, 0xbc, 0x2d, 0xbf, 0x43, 0xbe, 0xfc, 0xc0, 0x68, 0xbc, 0x24, 0xc0, 0x7f, 0xc0, 0x8c, 0xc0, - 0x92, 0xba, 0x52, 0xba, 0x42, 0xc0, 0x18, 0xb9, 0x14, 0x3c, 0x11, 0xc2, 0xa2, 0xc2, 0x10, 0xbd, - 0xaa, 0xc0, 0x0f, 0xc0, 0x38, 0xc0, 0xa3, 0xc1, 0x58, 0xbe, 0x62, 0xc2, 0xe9, 0xc0, 0x36, 0xc0, - 0xc6, 0xc1, 0x21, 0xbc, 0xf5, 0xc2, 0x42, 0xbd, 0x35, 0xbc, 0xda, 0xc1, 0xcb, 0xbb, 0x5f, 0xba, - 0x2b, 0xbd, 0xff, 0xc2, 0x5f, 0xab, 0xc7, 0x2c, 0x41, 0xc0, 0x2e, 0xbe, 0x38, 0xc0, 0xf7, 0xc3, - 0x60, 0xbd, 0x73, 0xc2, 0x01, 0xbf, 0x3b, 0xc0, 0x8c, 0xc0, 0x88, 0xae, 0x26, 0xc0, 0x2a, 0xbf, - 0xd5, 0xc0, 0x9e, 0xc2, 0x75, 0xbe, 0x67, 0xc0, 0xc8, 0xbf, 0x7d, 0xbe, 0xf9, 0xc0, 0xaf, 0xbc, - 0x40, 0xba, 0x30, 0xbf, 0x19, 0xc1, 0x16, 0xc3, 0x10, 0xc0, 0x85, 0xb0, 0x31, 0xc3, 0xae, 0xbd, - 0xb0, 0xc0, 0xd4, 0xbd, 0x06, 0xc1, 0x72, 0xbf, 0x02, 0xc0, 0x83, 0xb7, 0x02, 0xc2, 0x56, 0xc2, - 0xa9, 0xc1, 0x7b, 0xbf, 0xce, 0xc0, 0x2a, 0xbf, 0x02, 0xc0, 0x97, 0xc1, 0x91, 0xba, 0xda, 0xb9, - 0xf2, 0xbd, 0xa5, 0xc1, 0xd3, 0xbf, 0x65, 0xbb, 0x32, 0xc0, 0x33, 0xbf, 0x93, 0xbb, 0x73, 0xc0, - 0xa2, 0xbf, 0xe6, 0xc2, 0x29, 0xc2, 0xbc, 0xc1, 0xfa, 0xc0, 0x3d, 0xc1, 0x28, 0xc2, 0xa4, 0xc2, - 0x44, 0xb9, 0x1d, 0xc4, 0x0d, 0xbf, 0x05, 0xc0, 0xe0, 0xc0, 0xc3, 0xbf, 0x25, 0x2c, 0xc3, 0xc1, - 0x03, 0xbf, 0x58, 0xbf, 0x21, 0xbe, 0x3c, 0xbd, 0x6f, 0xc3, 0x89, 0xc1, 
0x14, 0xc0, 0xce, 0xc3, - 0xd3, 0xbd, 0xeb, 0xc1, 0x28, 0xc2, 0x79, 0xc1, 0x57, 0xbf, 0xe3, 0xbe, 0xa8, 0xbc, 0xca, 0xc0, - 0x5a, 0xbd, 0xaa, 0xbe, 0x40, 0xbd, 0x0d, 0xc1, 0x5b, 0xb9, 0x8f, 0xbc, 0xc5, 0xc1, 0xfd, 0xb9, - 0x1a, 0xc0, 0x6a, 0xc1, 0xac, 0xc1, 0x89, 0xbf, 0xf2, 0xbc, 0x7e, 0xc3, 0x04, 0xc2, 0xbe, 0xc0, - 0x3b, 0xc0, 0x2a, 0xc1, 0x4a, 0xc2, 0xa4, 0xc1, 0x60, 0xc2, 0x3b, 0xbd, 0x75, 0x35, 0xcc, 0xc0, - 0xbe, 0xc1, 0x74, 0xc0, 0x8e, 0xc0, 0xb6, 0xc0, 0xa1, 0xc0, 0x59, 0xc1, 0xbe, 0xc0, 0xe9, 0xbc, - 0x9f, 0xbe, 0x6e, 0xbe, 0x54, 0xc0, 0x28, 0xc2, 0x05, 0xbc, 0xf1, 0xc1, 0x26, 0xa7, 0x6b, 0xbe, - 0x4b, 0xbd, 0xc4, 0xb9, 0x48, 0xbe, 0x0b, 0xbb, 0x68, 0xbf, 0xe9, 0xbc, 0xe5, 0xbc, 0xdc, 0xc1, - 0xdc, 0xc4, 0xcd, 0xc1, 0xf7, 0xa4, 0xb1, 0x35, 0x32, 0xc0, 0x9c, 0xbe, 0x3a, 0xc0, 0x13, 0xc0, - 0x76, 0xb8, 0x47, 0xb9, 0x26, 0xc1, 0x25, 0xc2, 0x40, 0x38, 0x4c, 0xc2, 0xfb, 0x30, 0x32, 0xc0, - 0xb0, 0xb6, 0xaa, 0xbc, 0x7f, 0xc1, 0x42, 0xc0, 0xd5, 0xbf, 0x8d, 0xc1, 0xe0, 0xbe, 0x4b, 0xba, - 0x77, 0xbf, 0x16, 0xbe, 0xfc, 0xbf, 0x13, 0xc0, 0x52, 0xc0, 0x82, 0xc0, 0xf7, 0xbf, 0xe5, 0xb0, - 0x44, 0xc2, 0xe6, 0xbe, 0x8b, 0xba, 0x75, 0xbd, 0xb6, 0xc1, 0xcb, 0xbd, 0xb1, 0xc0, 0x28, 0xc3, - 0x09, 0xc3, 0xaa, 0xc0, 0xda, 0xbc, 0xde, 0xbd, 0x90, 0xb6, 0xeb, 0xc2, 0x13, 0xc0, 0x6e, 0xc2, - 0x40, 0xbd, 0x0a, 0xc0, 0xfb, 0xbc, 0x3c, 0xb8, 0xf1, 0xbf, 0x9f, 0xc0, 0xac, 0xc2, 0x8b, 0xc0, - 0x31, 0xc2, 0xbe, 0xc1, 0xc8, 0xbf, 0x19, 0xb9, 0x8f, 0xbc, 0x38, 0xbd, 0x2c, 0xc0, 0x4e, 0xc2, - 0xa9, 0xc3, 0x77, 0xc1, 0xa3, 0xbe, 0x2c, 0xc2, 0x67, 0xbe, 0x0b, 0xbe, 0xf1, 0xbc, 0xf6, 0xc0, - 0x58, 0xb7, 0x3a, 0xbf, 0xef, 0xbf, 0x6d, 0x3b, 0xe3, 0xc3, 0x04, 0xc4, 0x38, 0xc2, 0xdf, 0xbe, - 0x03, 0xbf, 0x88, 0xba, 0x13, 0xc0, 0x52, 0xbc, 0x85, 0xbe, 0x9a, 0xc4, 0x05, 0xbf, 0x96, 0xbb, - 0xab, 0xb3, 0x39, 0xb7, 0xfc, 0xc2, 0x64, 0xbf, 0x3a, 0xc2, 0xc1, 0xc1, 0xf3, 0xc1, 0x76, 0xbf, - 0x37, 0xbc, 0xd2, 0x33, 0xcb, 0xc0, 0x86, 0xc1, 0x10, 0xc1, 0x61, 0xc0, 0x60, 0xc1, 0xc8, 0xc0, - 0x36, 0xc0, 
0x3d, 0xc0, 0xba, 0xb5, 0x60, 0xbc, 0x88, 0xbe, 0xe2, 0xbe, 0x52, 0xc1, 0xff, 0xc2, - 0xb7, 0xb1, 0x8f, 0xc0, 0x8a, 0xbd, 0xf6, 0xc0, 0xb7, 0xbe, 0x4f, 0xbe, 0x19, 0xc2, 0xa0, 0xc0, - 0xae, 0xbf, 0xf8, 0xc1, 0x94, 0xc3, 0xdc, 0xbd, 0x4b, 0xbf, 0x87, 0xbe, 0x43, 0xc0, 0x02, 0xc3, - 0xa2, 0xc2, 0x35, 0xbc, 0x47, 0xc3, 0xfc, 0x38, 0x0c, 0xbb, 0x71, 0xbd, 0xde, 0xc0, 0x2d, 0xbc, - 0x78, 0xbd, 0x65, 0xc2, 0x0e, 0xbc, 0x1c, 0xbc, 0x09, 0xc2, 0x22, 0xbe, 0xe2, 0xc1, 0xdd, 0xbb, - 0x58, 0xc0, 0x0e, 0xc0, 0x16, 0xc2, 0x80, 0xc1, 0xfc, 0xbc, 0x2c, 0xc2, 0x99, 0xc3, 0x07, 0xc1, - 0xa7, 0xbc, 0x4d, 0xc1, 0x4e, 0xc2, 0xb0, 0xba, 0x04, 0xbc, 0x27, 0xc0, 0x84, 0xbc, 0x68, 0xc0, - 0x91, 0xc2, 0x75, 0xb9, 0x54, 0xc0, 0x61, 0xc1, 0xdb, 0xbe, 0x77, 0xbb, 0x44, 0xbd, 0x80, 0xc2, - 0xf0, 0x2b, 0xe4, 0xbe, 0xcd, 0xb8, 0x5b, 0xc1, 0x21, 0xc0, 0x02, 0xba, 0xf2, 0xbd, 0x67, 0xc0, - 0xe6, 0xba, 0x58, 0xc2, 0x96, 0xbb, 0xa6, 0xc2, 0x44, 0xbf, 0x63, 0xc0, 0xde, 0xc0, 0x0d, 0xc1, - 0x72, 0xc1, 0x28, 0xc3, 0xd6, 0xc1, 0x1c, 0xb9, 0x4c, 0xbf, 0x49, 0xbf, 0xb8, 0xb4, 0xd5, 0xc2, - 0x9f, 0xc1, 0x53, 0xba, 0x09, 0xc2, 0xd8, 0x30, 0xd3, 0xc0, 0xd8, 0xbe, 0x28, 0xbe, 0x5e, 0xc0, - 0x2f, 0xc3, 0xf4, 0xbd, 0x3d, 0xbd, 0x37, 0xc0, 0xeb, 0xc0, 0x21, 0xc0, 0xe2, 0xb9, 0x20, 0xb9, - 0xa5, 0xc0, 0xe6, 0xbe, 0x16, 0xc4, 0x07, 0xbc, 0x93, 0xbd, 0x95, 0xc1, 0x91, 0xb5, 0xaa, 0xc1, - 0xa1, 0xbe, 0x8a, 0xba, 0xf4, 0xbc, 0xf1, 0xc1, 0x46, 0xc1, 0x8f, 0xbd, 0xa0, 0xbd, 0x21, 0xc0, - 0xc1, 0xc0, 0x9f, 0xbc, 0x3c, 0xc1, 0x61, 0xc1, 0xc4, 0xbe, 0x76, 0xbd, 0x69, 0xc0, 0xb0, 0xbe, - 0x21, 0xbc, 0x09, 0xc0, 0x86, 0xc1, 0x51, 0xbc, 0x7d, 0xbf, 0xad, 0xbf, 0xec, 0xbb, 0x98, 0xc0, - 0x0e, 0xc1, 0x13, 0xc1, 0x06, 0xc1, 0x38, 0xbd, 0x2e, 0xbe, 0xd1, 0xc0, 0x5c, 0xb4, 0xfd, 0xbd, - 0x49, 0xb0, 0x6b, 0xc0, 0x25, 0xc1, 0x7b, 0xbf, 0x91, 0xc0, 0x4a, 0xc4, 0x07, 0xc0, 0xf0, 0xbd, - 0x5a, 0xbf, 0x40, 0xc0, 0x17, 0xbf, 0xd4, 0xbf, 0xd2, 0xbe, 0x76, 0xc2, 0x33, 0xc2, 0x2a, 0xb2, - 0x28, 0xbd, 0x75, 0xc1, 0xa0, 0xbe, 0x0d, 0xc4, 
0x57, 0xbc, 0x78, 0xc2, 0x2e, 0xc3, 0x62, 0xbe, - 0xfb, 0xbe, 0x48, 0xa9, 0x93, 0xc0, 0x9e, 0xc1, 0xaf, 0xc1, 0x76, 0xc0, 0x94, 0xc1, 0xfb, 0xbf, - 0xc8, 0xc1, 0xdc, 0xbe, 0xca, 0xbb, 0x23, 0xbe, 0xfd, 0xc4, 0x2c, 0xc0, 0x46, 0xc0, 0xd3, 0xc4, - 0xab, 0xc2, 0x84, 0xbb, 0x64, 0xc1, 0x2d, 0xb4, 0x25, 0xbd, 0x8c, 0xb8, 0xaa, 0xc1, 0x75, 0xc2, - 0x0f, 0xbf, 0x28, 0xc0, 0xde, 0xbf, 0x6e, 0xc2, 0xfc, 0xb7, 0x6d, 0xb9, 0x5c, 0xbe, 0xa4, 0xc4, - 0x27, 0xc0, 0xc4, 0xc2, 0x72, 0xb4, 0x43, 0xc2, 0xe8, 0xc2, 0xb5, 0xbd, 0x2b, 0xbe, 0xd6, 0xc3, - 0xc1, 0xb8, 0x5f, 0xc1, 0xde, 0xc0, 0x96, 0xbf, 0x99, 0xb9, 0x0e, 0xbd, 0x8b, 0xbb, 0x43, 0xbe, - 0xa3, 0xc1, 0x97, 0xbf, 0xa3, 0xbf, 0x08, 0xbf, 0x27, 0xbf, 0xae, 0xc1, 0x39, 0xbd, 0xf1, 0xbf, - 0x79, 0xc1, 0x54, 0xbf, 0xbc, 0xc2, 0xd6, 0xbe, 0x5a, 0xbc, 0x4d, 0xbe, 0x8d, 0xb9, 0xd2, 0xc2, - 0xe0, 0xc0, 0xd5, 0xc2, 0x7e, 0xbf, 0x31, 0xbf, 0x03, 0xbe, 0xa7, 0xbe, 0x22, 0xc0, 0x3a, 0xc0, - 0xf2, 0xbc, 0x39, 0xb9, 0x9c, 0x3c, 0x89, 0xbd, 0x2a, 0xc1, 0x02, 0xc0, 0x88, 0xc0, 0x07, 0xc2, - 0x92, 0xc1, 0xc3, 0xbb, 0x88, 0xbe, 0xe9, 0xba, 0x19, 0xbe, 0x70, 0xc1, 0xd4, 0xbc, 0xd5, 0xbc, - 0xb6, 0xbe, 0x1f, 0xc0, 0xdc, 0xbf, 0xa8, 0xc2, 0x88, 0xbf, 0xe5, 0xc0, 0x21, 0xc0, 0xeb, 0xbf, - 0xac, 0xbe, 0x3c, 0xc0, 0xb0, 0xc2, 0xdf, 0xc0, 0xb7, 0xc1, 0xa8, 0xc3, 0x2b, 0xb5, 0xd0, 0xb2, - 0x74, 0xbe, 0xe4, 0xb5, 0xb4, 0xbd, 0x44, 0xc1, 0x1c, 0xbb, 0x96, 0xc3, 0xfb, 0xba, 0xa2, 0xc3, - 0x84, 0xc1, 0x40, 0xbc, 0xe0, 0xbd, 0xd7, 0xbe, 0x80, 0xc1, 0x75, 0xc0, 0xb2, 0xc0, 0x7d, 0xc2, - 0xc0, 0xbc, 0x0e, 0xbc, 0xb9, 0xbe, 0x76, 0xb9, 0xc0, 0xc2, 0xcb, 0xbf, 0xef, 0xc0, 0x2f, 0xbe, - 0xb3, 0xbe, 0x22, 0xbe, 0x9b, 0xb8, 0xd4, 0xc0, 0x5b, 0xc1, 0xe8, 0xc1, 0x9a, 0xc0, 0x04, 0xbf, - 0x18, 0xbf, 0x87, 0xbc, 0x3e, 0xc0, 0x42, 0xc2, 0x24, 0xc0, 0xba, 0xbb, 0x1f, 0xc1, 0x4d, 0xbd, - 0xbe, 0xb9, 0x24, 0xc0, 0x22, 0xc0, 0x37, 0xbe, 0x61, 0xbd, 0xdd, 0xbb, 0xb8, 0xc1, 0x52, 0xbe, - 0x0e, 0xc0, 0x64, 0xb8, 0x4c, 0xbe, 0xd2, 0xba, 0xef, 0xc2, 0x82, 0xc3, 0x45, 0xb9, 
0xa1, 0xba, - 0x63, 0xc0, 0x10, 0xc2, 0x14, 0xc2, 0xd1, 0xc1, 0x5d, 0xbf, 0x02, 0xbf, 0x1a, 0xac, 0x59, 0xc1, - 0x41, 0xbe, 0x99, 0xb4, 0x75, 0xc2, 0xf2, 0x37, 0xb7, 0xc0, 0x55, 0xc1, 0xb0, 0xba, 0x8d, 0xbe, - 0x65, 0xbd, 0x45, 0xc0, 0x1f, 0xbd, 0x77, 0xbc, 0x49, 0xc2, 0x39, 0xc1, 0xcb, 0xb8, 0x2d, 0xbe, - 0x90, 0xbb, 0x0e, 0xc2, 0x35, 0xc0, 0xad, 0xc3, 0x86, 0xba, 0xb5, 0xc2, 0x07, 0xc0, 0xcd, 0xbd, - 0x2f, 0xc1, 0x1c, 0xc1, 0x0d, 0xc2, 0x13, 0xc1, 0x16, 0xc1, 0xee, 0xba, 0x13, 0xba, 0xd7, 0xc4, - 0xf8, 0xc1, 0xfe, 0xba, 0xf1, 0xbe, 0xba, 0xbb, 0x67, 0xbf, 0xa4, 0xc4, 0xd2, 0xb5, 0x9b, 0xc2, - 0xdc, 0xc0, 0xe4, 0xbf, 0x94, 0xc0, 0x45, 0xbd, 0xf2, 0xc1, 0xa0, 0xbd, 0xd4, 0x33, 0x8b, 0xc3, - 0x51, 0xbf, 0x48, 0xbd, 0xc2, 0xb5, 0xcc, 0xc2, 0x05, 0xbf, 0x59, 0xc0, 0x18, 0xbe, 0x41, 0x32, - 0xf3, 0xc0, 0x0e, 0xbf, 0xe6, 0xba, 0xd8, 0xc3, 0x19, 0xc0, 0x2f, 0xbb, 0xb9, 0xbe, 0xb4, 0xc2, - 0x1e, 0xc0, 0x4a, 0xc1, 0xa2, 0x39, 0xad, 0xc2, 0x9a, 0xc2, 0x57, 0xc3, 0x64, 0xc0, 0xc5, 0xc3, - 0x89, 0xc3, 0x8f, 0xb6, 0x7b, 0xc2, 0x27, 0xc0, 0x41, 0xc0, 0x25, 0xc0, 0x7f, 0xc0, 0x3a, 0xc0, - 0x70, 0xc1, 0x5a, 0xb9, 0x99, 0xbd, 0x8e, 0x33, 0x65, 0xc1, 0x6d, 0xc0, 0x3c, 0xbe, 0x69, 0xbf, - 0x11, 0xc3, 0x26, 0xbc, 0x60, 0xc0, 0x52, 0xbf, 0xee, 0xc1, 0x9a, 0xbf, 0x27, 0xc0, 0xf7, 0xc0, - 0x81, 0xbe, 0xef, 0xc2, 0x7b, 0xbd, 0xc1, 0xc2, 0x2f, 0xc1, 0xcd, 0xbc, 0xa5, 0xc0, 0x0c, 0xbf, - 0x77, 0xc1, 0x60, 0xb8, 0xdc, 0xc0, 0x17, 0xb8, 0x67, 0xbd, 0xb0, 0xbc, 0x4f, 0xbf, 0x96, 0xc1, - 0x6e, 0xc1, 0xc2, 0xb5, 0x48, 0xbb, 0xcb, 0xbf, 0xc0, 0xc2, 0xba, 0xbf, 0x60, 0xba, 0xba, 0xb8, - 0x0f, 0xc4, 0x93, 0xc1, 0x2f, 0xc0, 0x69, 0xc1, 0x09, 0xc1, 0xa6, 0xb8, 0xe6, 0xbe, 0x02, 0xc1, - 0xdf, 0xc0, 0xca, 0xc0, 0x8b, 0xc0, 0x22, 0xc0, 0xa3, 0xc0, 0x5b, 0xbe, 0xea, 0xc3, 0x3d, 0xc0, - 0x87, 0xc1, 0xbe, 0xc3, 0x37, 0xc2, 0x86, 0xbd, 0x82, 0xbd, 0x59, 0xc0, 0x08, 0xbc, 0x10, 0xc2, - 0x81, 0xc1, 0xd3, 0xbc, 0xe7, 0xbd, 0xe5, 0xbe, 0x6c, 0xc0, 0x25, 0xbd, 0x41, 0x21, 0x62, 0xc1, - 0x2d, 0xbf, 0xdd, 0xc0, 
0x53, 0xbf, 0x11, 0xbe, 0x33, 0xb7, 0x34, 0xb9, 0x5c, 0xc3, 0x5e, 0xc1, - 0x32, 0xc2, 0x0d, 0x34, 0xa7, 0xc0, 0xe3, 0xbc, 0xa2, 0xc2, 0x25, 0xc1, 0x1f, 0xc1, 0xa0, 0xbf, - 0xa3, 0xc0, 0x73, 0xc0, 0xe8, 0xbb, 0x4a, 0xc1, 0xbc, 0xc0, 0x47, 0xc1, 0x21, 0xc2, 0x4d, 0xc1, - 0x99, 0xbc, 0x90, 0xc1, 0x12, 0xc1, 0x98, 0xc0, 0x2e, 0xbc, 0x8c, 0xbc, 0x25, 0xbe, 0x13, 0xbc, - 0xae, 0xb9, 0x62, 0xc0, 0x41, 0xc0, 0x1b, 0xc4, 0x1a, 0xc1, 0x0d, 0xc3, 0xb5, 0xbd, 0x76, 0xc0, - 0x1e, 0xad, 0x64, 0xbf, 0xb5, 0xb9, 0xe8, 0xbf, 0x11, 0xc0, 0xf8, 0xbe, 0xc1, 0xc4, 0x16, 0xc1, - 0xa5, 0xc0, 0x23, 0xc0, 0x73, 0xbe, 0x9a, 0xbd, 0xd0, 0xc0, 0x5d, 0xbf, 0xd7, 0xbf, 0x84, 0xbf, - 0x61, 0xc3, 0x29, 0xc1, 0x32, 0xc2, 0xbb, 0xbc, 0x78, 0xc0, 0xe1, 0x31, 0xfe, 0xc0, 0xdd, 0x27, - 0x86, 0xb2, 0x59, 0xbc, 0x1f, 0x38, 0x10, 0xc2, 0xba, 0xbd, 0x78, 0xc1, 0x87, 0xc0, 0x64, 0xb5, - 0x62, 0xc1, 0x24, 0xc1, 0x41, 0xbd, 0x6f, 0xb4, 0x3b, 0xb9, 0x47, 0xc0, 0x87, 0xc0, 0x1d, 0xbe, - 0x56, 0xc2, 0x9f, 0xc0, 0x6a, 0xc0, 0xfa, 0xc0, 0x03, 0xc3, 0x39, 0xb3, 0x42, 0xc2, 0xc4, 0xc1, - 0x1a, 0xc4, 0xb6, 0xc0, 0x3d, 0xbf, 0x37, 0xba, 0x15, 0xbe, 0x0f, 0xc2, 0x5c, 0xc0, 0xb8, 0xbe, - 0x99, 0xbf, 0x66, 0xc1, 0xea, 0xbe, 0xf1, 0xc2, 0x3d, 0xc0, 0xd9, 0xbf, 0x29, 0xbf, 0x8e, 0xbe, - 0x70, 0xbb, 0x3a, 0xc1, 0xc8, 0xbf, 0x85, 0xbe, 0x1f, 0xc1, 0x50, 0xc2, 0xfa, 0xbd, 0x3f, 0xb9, - 0x36, 0xc3, 0x6f, 0xbf, 0x2e, 0xbe, 0x69, 0xc0, 0xd1, 0xc0, 0x01, 0xc0, 0xc1, 0xc1, 0x88, 0xbd, - 0x95, 0xbc, 0x91, 0xc2, 0x05, 0xc2, 0x2e, 0xc3, 0x39, 0xbf, 0xef, 0xc2, 0x78, 0xbd, 0x15, 0xc1, - 0x73, 0xbe, 0xff, 0xbe, 0x3b, 0xc0, 0xef, 0xbd, 0x22, 0xc0, 0x67, 0xbd, 0x20, 0xbb, 0xab, 0xbc, - 0xef, 0xb9, 0x80, 0xc0, 0x4d, 0xc1, 0xdb, 0xc0, 0xfe, 0xbd, 0x4f, 0xc0, 0x6a, 0xc3, 0x2c, 0xc0}; -unsigned char conv2d_winograd_fp16_ker1[] = { - 0x28, 0xbe, 0x50, 0xbf, 0x4b, 0xbe, 0x1e, 0xc4, 0x60, 0xbd, 0xd3, 0xbd, 0xb0, 0xb6, 0xab, 0xb3, - 0xd5, 0xbc, 0x5f, 0xbd, 0xaa, 0xbf, 0x66, 0xc3, 0x9e, 0xc2, 0xaa, 0xbe, 0x16, 0xbe, 0xd2, 0x33, - 0x1a, 0xc1, 
0xbb, 0xaf, 0x49, 0xc1, 0x9d, 0xc2, 0x19, 0xc1, 0xac, 0xc1, 0x8b, 0xba, 0xba, 0xb5, - 0x43, 0xc2, 0xad, 0xbd, 0xe1, 0xbf, 0x4a, 0xb5, 0x72, 0xbf, 0xa4, 0xc1, 0xde, 0xbd, 0xf6, 0xc0, - 0xc8, 0xc0, 0x3f, 0xc0, 0x57, 0xba, 0x68, 0xbc, 0x02, 0xc0, 0xa1, 0xc0, 0xf1, 0xbf, 0x4b, 0xbf, - 0x91, 0xc0, 0xdf, 0xbd, 0xdc, 0xbe, 0x11, 0xc2, 0x33, 0xbf, 0xf1, 0xc1, 0x38, 0xbd, 0x71, 0xbd, - 0x42, 0xb9, 0x84, 0xc0, 0xd4, 0xbd, 0xe9, 0xc0, 0x28, 0xc2, 0xe5, 0xbc, 0xf1, 0xbc, 0xe2, 0xc1, - 0xe8, 0xbb, 0x84, 0xbb, 0xe4, 0xbe, 0x5f, 0xba, 0xc3, 0xc1, 0x13, 0xc0, 0xdf, 0xbe, 0x07, 0xc1, - 0x2a, 0x3c, 0x16, 0x3a, 0xf0, 0x3c, 0xd9, 0x3f, 0xeb, 0x3c, 0xc3, 0x3c, 0x95, 0x3b, 0x7f, 0x3c, - 0x2e, 0x3e, 0x7d, 0x3b, 0xd0, 0x3d, 0x38, 0x3b, 0xb6, 0x3d, 0x7a, 0x39, 0xd2, 0x3a, 0x28, 0x3c, - 0xf2, 0x3c, 0xae, 0x37, 0x87, 0x3d, 0xfb, 0x3c, 0x79, 0x3c, 0xba, 0x3f, 0x24, 0x3d, 0x03, 0x38, - 0x2c, 0x40, 0x16, 0x3b, 0xcc, 0x3d, 0x32, 0x3d, 0xfc, 0x3d, 0x2e, 0x3c, 0xe8, 0x3c, 0x91, 0x3f, - 0xcf, 0x3e, 0xa6, 0x3c, 0xde, 0x31, 0xe4, 0x3c, 0x2c, 0x3c, 0x12, 0x3d, 0x84, 0x3d, 0xf8, 0x3f, - 0xa1, 0x3d, 0x38, 0x3f, 0x1a, 0x39, 0x45, 0x3f, 0xd8, 0x3d, 0x99, 0x3c, 0x4e, 0x3f, 0xac, 0x3a, - 0x16, 0x3d, 0x0e, 0x3d, 0xa1, 0x38, 0x09, 0x3c, 0x47, 0x40, 0x88, 0x3d, 0x35, 0x3e, 0x86, 0x3d, - 0x82, 0x3c, 0xa9, 0x3c, 0x6f, 0x3f, 0x44, 0x38, 0x62, 0x3e, 0xe6, 0x3e, 0x6d, 0x3f, 0xe1, 0x3e, - 0xd5, 0x38, 0xf8, 0x34, 0xdf, 0xb1, 0x40, 0x3a, 0xa2, 0x34, 0xa0, 0xa6, 0x00, 0x17, 0xdb, 0x34, - 0x7a, 0x33, 0x1e, 0x31, 0x46, 0x3a, 0xcc, 0x39, 0x81, 0x38, 0x34, 0x36, 0xe7, 0xae, 0x78, 0xad, - 0x1e, 0x36, 0x90, 0xa8, 0x75, 0xac, 0xfa, 0x35, 0x39, 0x3c, 0x49, 0x34, 0x21, 0x39, 0x36, 0xb4, - 0x3c, 0x3d, 0x9d, 0x38, 0x20, 0x33, 0xb2, 0xb5, 0x2c, 0x31, 0xca, 0x3c, 0x27, 0x35, 0x4c, 0x38, - 0xd4, 0x2f, 0xa4, 0xb1, 0xa7, 0x34, 0xce, 0x32, 0xbd, 0x39, 0xc7, 0x39, 0xe5, 0x35, 0xf7, 0x36, - 0x62, 0x33, 0x2c, 0x31, 0x3b, 0x3a, 0x41, 0x3a, 0xe8, 0x38, 0x7e, 0x38, 0xf0, 0x2f, 0x42, 0x33, - 0x0e, 0x3a, 0x5e, 0x38, 0xea, 0x30, 0x66, 0x38, 
0xfc, 0x34, 0xfc, 0x2d, 0xfe, 0x39, 0xad, 0x37, - 0x88, 0x2e, 0x57, 0x3a, 0x98, 0x32, 0x0f, 0x38, 0x51, 0x3b, 0xa5, 0x38, 0x9c, 0x3b, 0x1d, 0x35, - 0x52, 0xb0, 0x67, 0xac, 0xe6, 0xaf, 0x46, 0xb2, 0xee, 0xb0, 0x1e, 0xb0, 0x1b, 0xb0, 0xa1, 0xb1, - 0x80, 0xb2, 0xa2, 0xae, 0x30, 0xb2, 0x2f, 0xaa, 0x39, 0xb0, 0x44, 0xac, 0x97, 0xac, 0x1c, 0xb1, - 0xa6, 0xaf, 0x3c, 0xac, 0x68, 0xaf, 0x18, 0xae, 0x57, 0xb0, 0xae, 0xb2, 0x52, 0xb2, 0x6b, 0xaa, - 0x63, 0xb4, 0x52, 0xaf, 0x35, 0xb1, 0x51, 0xb1, 0x74, 0xb1, 0xda, 0xaf, 0xd7, 0xb0, 0x4b, 0xb3, - 0xd1, 0xb1, 0x12, 0xae, 0x01, 0xa4, 0x09, 0xb1, 0x04, 0xb0, 0xc6, 0xb0, 0x16, 0xb1, 0x28, 0xb4, - 0xb0, 0xb0, 0x5a, 0xb3, 0xf4, 0xac, 0xbe, 0xb2, 0x13, 0xb2, 0x7f, 0xae, 0x93, 0xb3, 0xd6, 0xad, - 0x9e, 0xb2, 0x88, 0xb0, 0xe2, 0xa9, 0x34, 0xae, 0x7b, 0xb3, 0x7b, 0xb1, 0x54, 0xb3, 0x42, 0xb0, - 0x86, 0xb0, 0xdb, 0xb1, 0x6a, 0xb3, 0x0b, 0xad, 0x0c, 0xb2, 0x08, 0xb3, 0x4d, 0xb4, 0x16, 0xb2, - 0xd8, 0xad, 0x12, 0xa6, 0xb0, 0x24, 0x00, 0xad, 0xb1, 0xab, 0x48, 0x9f, 0x50, 0xa8, 0x01, 0xae, - 0x9d, 0xac, 0xaa, 0xa6, 0x0b, 0xb0, 0xd2, 0xa7, 0xd5, 0xa9, 0xb9, 0xa8, 0x38, 0x26, 0x0c, 0xaa, - 0x5e, 0xa8, 0x7e, 0xa3, 0x87, 0x27, 0x1d, 0xa0, 0x23, 0xb0, 0x68, 0xa9, 0x43, 0xb0, 0xbe, 0x26, - 0x48, 0xb2, 0x58, 0xad, 0x25, 0xa9, 0x00, 0x91, 0xbe, 0xa8, 0x69, 0xb0, 0xc7, 0xab, 0xea, 0xad, - 0x10, 0xa6, 0x00, 0x29, 0xc1, 0xa6, 0x36, 0xab, 0xf2, 0xad, 0x0e, 0xae, 0x6c, 0xab, 0xa9, 0xae, - 0x60, 0xa7, 0x31, 0xac, 0xdc, 0xad, 0xdb, 0xae, 0xb9, 0xae, 0x78, 0xa9, 0x42, 0xac, 0xc8, 0xa7, - 0xf8, 0xb0, 0x7a, 0xac, 0x0c, 0x9b, 0x89, 0xaa, 0x8a, 0xaa, 0x6c, 0xa9, 0xc3, 0xb0, 0x81, 0xa9, - 0xf5, 0xa8, 0xaa, 0xb0, 0x40, 0xac, 0xe1, 0xac, 0xbe, 0xaf, 0xbe, 0xae, 0xb5, 0xb1, 0x6b, 0xaa, - 0x50, 0xab, 0x0e, 0xab, 0xc9, 0xac, 0x3d, 0xb0, 0x27, 0xac, 0x6e, 0xac, 0x70, 0xa9, 0x6c, 0xa9, - 0xcf, 0xac, 0x19, 0xab, 0xe7, 0xac, 0x89, 0xad, 0x81, 0xae, 0x39, 0xaa, 0x82, 0xab, 0x9a, 0xa8, - 0x61, 0xad, 0x3c, 0xa5, 0x30, 0xae, 0x37, 0xae, 0x8f, 0xac, 0x72, 0xaf, 0xe2, 0xaa, 
0x2f, 0xa7, - 0x4e, 0xaf, 0x5b, 0xaa, 0x66, 0xad, 0x84, 0xab, 0x72, 0xad, 0x90, 0xac, 0x41, 0xac, 0xc2, 0xae, - 0x8a, 0xae, 0x33, 0xad, 0x36, 0xa4, 0xe2, 0xab, 0x10, 0xac, 0xef, 0xac, 0x21, 0xad, 0x60, 0xae, - 0xa0, 0xad, 0xc5, 0xad, 0x78, 0xa9, 0xf8, 0xae, 0xef, 0xac, 0x7a, 0xad, 0xad, 0xad, 0x8b, 0xaa, - 0x4c, 0xaa, 0x01, 0xad, 0xa4, 0xa9, 0x99, 0xac, 0x15, 0xb0, 0x8c, 0xac, 0x71, 0xac, 0x11, 0xae, - 0x5c, 0xab, 0x54, 0xaa, 0x22, 0xae, 0xe4, 0xa6, 0x2c, 0xae, 0xd8, 0xad, 0x87, 0xad, 0x8d, 0xae, - 0x84, 0xa8, 0x2c, 0xa8, 0xfc, 0x9b, 0xb3, 0xac, 0x93, 0xa4, 0x50, 0xa0, 0xf0, 0x1c, 0x70, 0x95, - 0xe9, 0xa0, 0x45, 0xa4, 0x86, 0xa9, 0xf7, 0xac, 0x79, 0xab, 0x52, 0xa8, 0x75, 0xa1, 0x30, 0x25, - 0x4c, 0xa9, 0x72, 0x1d, 0x2f, 0xa6, 0xdb, 0xaa, 0x5c, 0xac, 0x3d, 0xa8, 0x89, 0xa5, 0x36, 0x21, - 0xd0, 0xac, 0x61, 0xa8, 0xe8, 0xa5, 0x29, 0x26, 0xb4, 0xa4, 0x0c, 0xad, 0x6c, 0xa5, 0xd7, 0xa8, - 0xea, 0xa5, 0x4a, 0xa3, 0x96, 0xa5, 0xa8, 0xa1, 0x0d, 0xaa, 0x60, 0xaa, 0x98, 0xa7, 0x94, 0xa5, - 0x73, 0xa7, 0x14, 0xa0, 0x60, 0xaa, 0x50, 0xab, 0x72, 0xa8, 0x30, 0xab, 0x58, 0x9b, 0x50, 0xa5, - 0x02, 0xa6, 0x6a, 0xa9, 0xd8, 0xa5, 0x42, 0xaa, 0xa2, 0xa8, 0xc6, 0x9e, 0x7f, 0xa7, 0x5f, 0xaa, - 0x56, 0x9e, 0xe2, 0xa7, 0xc0, 0xa2, 0x90, 0xa6, 0xfc, 0xab, 0x5f, 0xa8, 0x43, 0xa9, 0x25, 0xa8, - 0x53, 0xc0, 0xab, 0xb8, 0x51, 0xba, 0x1d, 0xc0, 0x26, 0xc0, 0xa8, 0xbc, 0xe0, 0xbe, 0xf3, 0xc1, - 0x97, 0xc1, 0x7e, 0xbc, 0x3d, 0xc2, 0xd1, 0x28, 0xaf, 0xbc, 0xfd, 0xb9, 0xe5, 0xb0, 0xc8, 0xc0, - 0x5d, 0xbc, 0x08, 0xbb, 0xf9, 0xb4, 0x5b, 0xb5, 0xb0, 0xc0, 0x3b, 0xc0, 0x09, 0xc3, 0xb7, 0xb1, - 0x83, 0xc4, 0x7d, 0xbf, 0x2a, 0xbf, 0x2d, 0xbf, 0x7b, 0xbf, 0x74, 0xc0, 0x0a, 0xc0, 0xf8, 0xc1, - 0xdc, 0xbe, 0x32, 0xae, 0x65, 0xb4, 0x42, 0xc0, 0xd3, 0xbf, 0x54, 0xc0, 0xc8, 0xbf, 0x47, 0xc3, - 0x9b, 0xbd, 0xea, 0xc1, 0xe4, 0xbd, 0xa3, 0xc1, 0xbc, 0xc1, 0x0b, 0xbb, 0x2c, 0xc2, 0x1c, 0xbc, - 0xd1, 0xc3, 0x43, 0xbf, 0x21, 0xb2, 0x35, 0xbc, 0xe0, 0xc0, 0x32, 0xc0, 0xe3, 0xc3, 0xfc, 0xbc, - 0x1d, 0xbf, 0xee, 0xc2, 
0xd9, 0xc1, 0x2e, 0xbe, 0x89, 0xc1, 0x4c, 0xc2, 0x9a, 0xc4, 0x27, 0xc0, - 0x94, 0x3c, 0x42, 0x3d, 0xfa, 0x3b, 0x32, 0x40, 0x9d, 0x3d, 0xa8, 0x3e, 0xb2, 0x3b, 0x70, 0x3b, - 0xc6, 0x3a, 0x2c, 0x3c, 0x97, 0x3d, 0xef, 0x3d, 0x55, 0x3e, 0xe4, 0x3c, 0xf0, 0x3c, 0x5e, 0x3c, - 0x2f, 0x3f, 0x36, 0x3c, 0x6d, 0x3e, 0xb9, 0x3d, 0x38, 0x3f, 0x4b, 0x3d, 0x7a, 0x3c, 0x7c, 0x39, - 0x69, 0x3f, 0xd6, 0x3c, 0xa2, 0x3d, 0x8c, 0x39, 0xb5, 0x3b, 0x80, 0x3e, 0xbe, 0x3c, 0x19, 0x3d, - 0xd3, 0x3c, 0xa0, 0x3c, 0xbc, 0x3a, 0xd1, 0x3c, 0xff, 0x3c, 0x8a, 0x3e, 0xc8, 0x3e, 0xf7, 0x3c, - 0x42, 0x3e, 0x26, 0x3e, 0x13, 0x3c, 0xc4, 0x3e, 0x6b, 0x3c, 0x18, 0x3c, 0xd0, 0x3d, 0x4c, 0x3c, - 0x29, 0x3c, 0xb6, 0x3d, 0x4a, 0x3c, 0x9e, 0x3e, 0x46, 0x3e, 0x02, 0x40, 0x6c, 0x3b, 0x6a, 0x3d, - 0x46, 0x3c, 0xbf, 0x3c, 0x4e, 0x3e, 0xf7, 0x3c, 0xc0, 0x3d, 0xc9, 0x39, 0x9e, 0x3b, 0xa0, 0x3d, - 0x89, 0xba, 0x43, 0xba, 0x2c, 0xba, 0x4f, 0xbc, 0xbf, 0xba, 0x61, 0xbb, 0x26, 0xba, 0x14, 0xbb, - 0x42, 0xbb, 0x00, 0xbb, 0xd8, 0xbb, 0x5c, 0xbb, 0xaf, 0xba, 0x34, 0xba, 0xb4, 0xba, 0x7a, 0xbb, - 0x8e, 0xba, 0x0f, 0xba, 0x33, 0xba, 0x89, 0xba, 0xbc, 0xbb, 0x86, 0xbc, 0xb0, 0xbb, 0xd8, 0xb9, - 0x70, 0xbc, 0x10, 0xbb, 0xf3, 0xba, 0xfc, 0xb9, 0xa9, 0xbb, 0x8b, 0xbb, 0x34, 0xba, 0x32, 0xbc, - 0xbc, 0xbb, 0x32, 0xba, 0x5f, 0xb9, 0x5d, 0xb8, 0x2d, 0xba, 0x26, 0xbb, 0xbc, 0xbb, 0xdb, 0xba, - 0x06, 0xbd, 0x26, 0xbc, 0x3c, 0xb9, 0x48, 0xbc, 0x38, 0xbc, 0xcf, 0xb8, 0x23, 0xbc, 0x51, 0xba, - 0x5a, 0xbb, 0x85, 0xbb, 0x27, 0xba, 0x32, 0xbb, 0x9a, 0xbb, 0xe4, 0xba, 0x26, 0xbb, 0x5a, 0xbc, - 0xf0, 0xba, 0x90, 0xbb, 0x60, 0xbc, 0x0e, 0xba, 0x4b, 0xbc, 0x50, 0xb9, 0x74, 0xba, 0x9a, 0xba, - 0x67, 0xb4, 0x32, 0xb6, 0x80, 0xb4, 0x0a, 0xb5, 0x68, 0xb6, 0xcf, 0xb4, 0xce, 0xad, 0x14, 0xaf, - 0x1e, 0xad, 0x46, 0xb1, 0xa8, 0xb7, 0x78, 0xb2, 0x9e, 0xb3, 0xfe, 0xb4, 0x90, 0xb2, 0x81, 0xb2, - 0xe4, 0xb5, 0x85, 0xb2, 0x1b, 0xb2, 0x00, 0xb5, 0x54, 0xb7, 0x60, 0xb3, 0x77, 0xb3, 0xfc, 0x29, - 0xf6, 0xb8, 0xd0, 0xb4, 0x57, 0xb5, 0x6a, 0xb0, 0x6a, 0xac, 
0x4d, 0xb7, 0x0d, 0xb0, 0x48, 0xb5, - 0xa0, 0xa6, 0xf6, 0xb3, 0x8a, 0xaf, 0x2e, 0xb1, 0x64, 0xb4, 0x34, 0xb7, 0xeb, 0xb0, 0x18, 0xad, - 0x56, 0xb2, 0xcd, 0xb6, 0xfe, 0xb4, 0xe7, 0xb6, 0x22, 0xb3, 0xd3, 0xb3, 0x22, 0xb3, 0xa3, 0xb3, - 0xf2, 0xb5, 0x8f, 0xb7, 0xec, 0xb2, 0x32, 0xb5, 0x82, 0xb1, 0xde, 0xb8, 0xe4, 0xb8, 0x0e, 0xb5, - 0x78, 0xb4, 0xd8, 0xb4, 0x97, 0xb7, 0x64, 0xb8, 0xcf, 0xb6, 0x1a, 0xb1, 0x68, 0xb5, 0x54, 0xb5, - 0x48, 0x2e, 0xf3, 0x2d, 0x2d, 0x2e, 0xe0, 0x2e, 0x62, 0x2e, 0x44, 0x2e, 0x9d, 0x2d, 0xdc, 0x2e, - 0x28, 0x2f, 0xb4, 0x2e, 0xf6, 0x2f, 0x52, 0x2e, 0x68, 0x2d, 0xd5, 0x2d, 0x12, 0x2e, 0x4c, 0x2f, - 0x36, 0x2d, 0xae, 0x2d, 0x9f, 0x2c, 0xca, 0x2d, 0xe6, 0x2e, 0x64, 0x30, 0x96, 0x2f, 0x68, 0x2d, - 0x57, 0x30, 0xde, 0x2e, 0x68, 0x2e, 0x24, 0x2e, 0x5c, 0x2f, 0x0b, 0x2f, 0x51, 0x2d, 0x34, 0x30, - 0xca, 0x2e, 0xc4, 0x2d, 0x08, 0x2d, 0x60, 0x2a, 0xa0, 0x2d, 0x88, 0x2e, 0x29, 0x2e, 0xd4, 0x2d, - 0xad, 0x30, 0x05, 0x30, 0x1a, 0x2d, 0x06, 0x30, 0x3e, 0x30, 0x5f, 0x2c, 0x8c, 0x2f, 0x0c, 0x2e, - 0xcc, 0x2f, 0x7e, 0x2f, 0xc9, 0x2d, 0x25, 0x2e, 0x55, 0x2e, 0xf0, 0x2d, 0x47, 0x30, 0x49, 0x30, - 0xf2, 0x2e, 0x82, 0x2f, 0x54, 0x30, 0x60, 0x2e, 0x4c, 0x30, 0x58, 0x2d, 0xcb, 0x2e, 0xfe, 0x2d, - 0xa0, 0x29, 0xd6, 0x2a, 0x14, 0x2a, 0xfd, 0x27, 0x19, 0x2b, 0x94, 0x28, 0x1c, 0x25, 0xa3, 0x27, - 0x76, 0x27, 0x51, 0x28, 0xbf, 0x2c, 0xe4, 0x26, 0x54, 0x26, 0xbc, 0x29, 0x09, 0x28, 0x3a, 0x29, - 0xa8, 0x28, 0x45, 0x28, 0x10, 0x23, 0x20, 0x29, 0x49, 0x2b, 0x06, 0x2a, 0xdb, 0x29, 0xd8, 0x1e, - 0x8c, 0x2d, 0x45, 0x2a, 0xf7, 0x29, 0x7a, 0x28, 0xb2, 0x26, 0xdc, 0x2b, 0xab, 0x24, 0x9e, 0x2b, - 0x28, 0x22, 0xce, 0x28, 0xf1, 0x25, 0xd9, 0x21, 0xe2, 0x28, 0x62, 0x2b, 0xa0, 0x23, 0xdc, 0x22, - 0x2a, 0x29, 0x1f, 0x2c, 0xd5, 0x29, 0xea, 0x2b, 0x52, 0x2a, 0x2d, 0x28, 0xb5, 0x28, 0x0d, 0x29, - 0x4b, 0x2c, 0x80, 0x2c, 0x7f, 0x28, 0xee, 0x28, 0x68, 0x25, 0x52, 0x2c, 0xc0, 0x2e, 0x42, 0x2b, - 0x5d, 0x2a, 0xcc, 0x2a, 0xb2, 0x2c, 0x0b, 0x2d, 0x74, 0x2c, 0x3b, 0x28, 0x96, 0x2b, 0xae, 0x29, - 
0xeb, 0x29, 0xf1, 0x29, 0x60, 0x29, 0x92, 0x2c, 0x66, 0x2a, 0x7e, 0x2b, 0x99, 0x29, 0x0d, 0x2a, - 0x08, 0x2a, 0x29, 0x2a, 0xeb, 0x2a, 0x42, 0x2b, 0x02, 0x2b, 0xd5, 0x29, 0x54, 0x2a, 0x7e, 0x2a, - 0x2c, 0x2b, 0x8e, 0x29, 0xd7, 0x2a, 0x79, 0x2a, 0xc2, 0x2b, 0xe3, 0x2b, 0xa0, 0x2a, 0x0b, 0x29, - 0x24, 0x2c, 0x57, 0x2a, 0xa3, 0x2a, 0xd9, 0x28, 0x8b, 0x2a, 0x43, 0x2b, 0x0c, 0x2a, 0x3a, 0x2b, - 0x2b, 0x2b, 0xca, 0x29, 0xd4, 0x28, 0xee, 0x28, 0xee, 0x29, 0x0e, 0x2b, 0x01, 0x2c, 0xa2, 0x2a, - 0x86, 0x2c, 0x93, 0x2b, 0xd0, 0x28, 0x06, 0x2c, 0x10, 0x2b, 0xad, 0x28, 0xb5, 0x2b, 0xb2, 0x29, - 0x08, 0x2a, 0xcd, 0x2a, 0xa1, 0x29, 0x53, 0x2b, 0xa2, 0x2b, 0x6f, 0x2b, 0x4a, 0x29, 0x9b, 0x2b, - 0x00, 0x2a, 0x95, 0x2a, 0xda, 0x2b, 0x67, 0x29, 0x88, 0x2b, 0x7a, 0x28, 0x5c, 0x29, 0x6f, 0x2a, - 0xe9, 0x24, 0xd2, 0x26, 0x7c, 0x24, 0x43, 0x28, 0x21, 0x27, 0x08, 0x27, 0x09, 0x21, 0x66, 0x20, - 0xea, 0x1d, 0x78, 0x22, 0x6b, 0x27, 0x53, 0x25, 0x5c, 0x26, 0xba, 0x25, 0x8d, 0x24, 0x3c, 0x23, - 0x4a, 0x28, 0x06, 0x24, 0x33, 0x26, 0x7e, 0x26, 0x80, 0x28, 0x3d, 0x24, 0xe0, 0x23, 0x58, 0x10, - 0x27, 0x29, 0x37, 0x25, 0x6f, 0x26, 0x8d, 0x1f, 0xd6, 0x1e, 0x26, 0x28, 0x96, 0x23, 0x40, 0x25, - 0x9a, 0x20, 0xd8, 0x24, 0x26, 0x21, 0xf1, 0x24, 0x7f, 0x25, 0x38, 0x28, 0x9c, 0x25, 0xa2, 0x22, - 0x8e, 0x24, 0x52, 0x27, 0x40, 0x25, 0xee, 0x27, 0xf2, 0x22, 0xca, 0x24, 0x08, 0x25, 0x59, 0x24, - 0x10, 0x25, 0x9e, 0x27, 0x30, 0x24, 0x4a, 0x27, 0x4e, 0x25, 0xd1, 0x29, 0xf7, 0x26, 0x54, 0x25, - 0x77, 0x24, 0xf2, 0x24, 0xc7, 0x27, 0x12, 0x28, 0xc8, 0x26, 0xfc, 0x20, 0xb9, 0x24, 0x8e, 0x26, - 0x40, 0x3d, 0x46, 0x3d, 0x7c, 0x3d, 0x24, 0x3c, 0x95, 0x3d, 0x5a, 0x3c, 0xc5, 0x3b, 0x3d, 0x3d, - 0x80, 0x3d, 0x30, 0x3d, 0x8a, 0x3f, 0x2c, 0x3c, 0xaa, 0x3a, 0xe5, 0x3c, 0x74, 0x3c, 0xe1, 0x3d, - 0x04, 0x3b, 0x71, 0x3c, 0x88, 0x38, 0x71, 0x3c, 0x9c, 0x3d, 0xf8, 0x3e, 0x46, 0x3e, 0xd4, 0x3a, - 0x14, 0x40, 0xd3, 0x3d, 0x32, 0x3d, 0x33, 0x3d, 0x64, 0x3d, 0x18, 0x3e, 0xbf, 0x3a, 0x52, 0x3f, - 0x1c, 0x3c, 0x97, 0x3c, 0x7a, 0x3b, 
0x34, 0x36, 0x6c, 0x3c, 0x8e, 0x3d, 0x9e, 0x3a, 0xed, 0x3a, - 0xd4, 0x3e, 0x04, 0x3f, 0x9f, 0x3c, 0xc0, 0x3e, 0x16, 0x3f, 0x0a, 0x3b, 0x82, 0x3d, 0xf5, 0x3c, - 0x76, 0x3f, 0x02, 0x3f, 0x94, 0x3c, 0x67, 0x3c, 0xab, 0x3b, 0x36, 0x3d, 0xeb, 0x40, 0x3a, 0x3f, - 0x0e, 0x3e, 0x7c, 0x3e, 0xd0, 0x3f, 0xca, 0x3e, 0xbe, 0x3f, 0x86, 0x3c, 0x7e, 0x3e, 0xce, 0x3c, - 0x64, 0x33, 0xf1, 0x36, 0x8c, 0x36, 0x4a, 0x38, 0x60, 0xa7, 0x9b, 0x35, 0x1b, 0x37, 0xd5, 0x39, - 0xe0, 0x37, 0x58, 0x2f, 0xbc, 0x3a, 0xc6, 0x3b, 0xec, 0x3a, 0x1e, 0x39, 0x8f, 0x35, 0x00, 0x27, - 0x21, 0x3a, 0xe2, 0x34, 0xa6, 0x39, 0x40, 0x3a, 0x60, 0x33, 0xc7, 0x37, 0x1b, 0x38, 0x60, 0x32, - 0x1b, 0x3a, 0x76, 0x33, 0xa4, 0x3a, 0x2e, 0x30, 0xa5, 0x2c, 0xb0, 0x32, 0x04, 0x3c, 0x3a, 0x38, - 0x57, 0x30, 0x0d, 0x38, 0x7b, 0x37, 0x8c, 0x34, 0xc0, 0x1e, 0x26, 0x37, 0x5a, 0x39, 0x20, 0x38, - 0x8e, 0x39, 0x85, 0x3a, 0x95, 0x39, 0xfc, 0x32, 0x78, 0x39, 0x0a, 0x3c, 0x36, 0x38, 0x80, 0x9e, - 0x5c, 0x35, 0xca, 0x31, 0x80, 0x39, 0xc0, 0x39, 0xec, 0x2d, 0x9c, 0x39, 0x98, 0xb1, 0x57, 0x3b, - 0x0c, 0x3c, 0x39, 0x36, 0x60, 0x33, 0x56, 0x39, 0x45, 0x39, 0x9a, 0x37, 0x8e, 0x31, 0x1d, 0x3b, - 0xc0, 0xb4, 0x8c, 0xaf, 0xfa, 0xb5, 0x15, 0xb8, 0xf1, 0xaf, 0xcd, 0xb2, 0x1d, 0xb6, 0x92, 0xb5, - 0x22, 0xb9, 0xf3, 0xb1, 0xc1, 0xb5, 0x60, 0xb1, 0x06, 0xb7, 0x4a, 0xb5, 0xfa, 0xae, 0x64, 0xb4, - 0x2a, 0xb4, 0xa5, 0xb3, 0x1b, 0xb5, 0x46, 0xaa, 0x95, 0xaf, 0x4c, 0xb6, 0xd6, 0xb5, 0x54, 0xb0, - 0x74, 0xb9, 0xf0, 0xac, 0xce, 0xb3, 0x90, 0xb5, 0xb8, 0xb2, 0x56, 0xb1, 0xb4, 0xb4, 0x80, 0xb4, - 0x74, 0xb4, 0x1a, 0xb4, 0xbe, 0xae, 0x4e, 0xb2, 0x20, 0xb4, 0x2e, 0xb1, 0xed, 0xb5, 0xe0, 0xb6, - 0x2c, 0xb5, 0xfe, 0xb7, 0xbc, 0xb5, 0x2c, 0xb6, 0x04, 0xb6, 0x82, 0xb5, 0x6a, 0xb6, 0x1d, 0x2c, - 0xee, 0xb5, 0xa0, 0xb2, 0x5e, 0xb3, 0x99, 0xab, 0x1d, 0xb4, 0x81, 0xb6, 0x3c, 0xab, 0x2d, 0xb6, - 0x91, 0xb8, 0x8e, 0xb4, 0xd6, 0xb5, 0xdb, 0xb6, 0x8e, 0xb8, 0x24, 0xb5, 0xa9, 0xb5, 0x22, 0xb8, - 0x4c, 0xb0, 0xe8, 0x1c, 0x58, 0x2e, 0x80, 0xa1, 0x25, 0xb0, 0xf3, 0x29, 
0xd8, 0xad, 0x0e, 0xb2, - 0x84, 0xa9, 0xa0, 0xa6, 0x0e, 0xae, 0x80, 0xa9, 0x2b, 0xb1, 0xe8, 0xad, 0x03, 0x2d, 0x58, 0x26, - 0x10, 0xb4, 0xbc, 0x20, 0x21, 0xb0, 0x48, 0xb1, 0x1c, 0xb5, 0x8b, 0xad, 0x67, 0xae, 0x84, 0x2f, - 0x70, 0xb5, 0x80, 0xac, 0x75, 0xb4, 0x58, 0x25, 0xd7, 0x2a, 0xeb, 0xb0, 0x7e, 0xb4, 0xd4, 0xa4, - 0x10, 0x28, 0x56, 0xab, 0x42, 0xb4, 0x2f, 0x26, 0xe6, 0xaa, 0xd0, 0xa9, 0x64, 0xb1, 0xeb, 0xb4, - 0x54, 0xb0, 0x57, 0xae, 0x02, 0xb4, 0xb9, 0xb0, 0x2b, 0xb3, 0x27, 0xb0, 0x1e, 0xb0, 0x2f, 0xa4, - 0xec, 0xb4, 0xe3, 0xab, 0xd8, 0xb0, 0x7a, 0xb1, 0x8c, 0x31, 0x09, 0xb1, 0x4c, 0xb1, 0xe2, 0xb2, - 0xf2, 0xb0, 0x23, 0xb2, 0x48, 0x24, 0x92, 0xb2, 0xc8, 0xb1, 0xc4, 0xb6, 0x4c, 0xae, 0x0d, 0xb2, - 0x94, 0x29, 0x28, 0x1a, 0xcb, 0x28, 0x94, 0x2b, 0xab, 0x26, 0x94, 0x24, 0x07, 0x2a, 0xca, 0x28, - 0x40, 0x2d, 0x7e, 0x26, 0xd8, 0x27, 0x28, 0x9d, 0xc4, 0x29, 0x36, 0x28, 0xf0, 0x12, 0xfe, 0x28, - 0xb6, 0x26, 0x72, 0x26, 0xe2, 0x27, 0x58, 0xa1, 0xab, 0x26, 0x0a, 0x2a, 0x74, 0x29, 0xf6, 0x1f, - 0x00, 0x2e, 0x80, 0x1f, 0xd8, 0x25, 0xfb, 0x29, 0xc2, 0x26, 0x97, 0x26, 0x52, 0x26, 0x87, 0x26, - 0x90, 0x28, 0x42, 0x26, 0xec, 0x22, 0xd6, 0x24, 0x45, 0x29, 0xfa, 0x21, 0x56, 0x29, 0x02, 0x2c, - 0x1c, 0x28, 0xc0, 0x2a, 0x82, 0x29, 0x6c, 0x2b, 0xbc, 0x29, 0x44, 0x26, 0x46, 0x2a, 0x69, 0xa0, - 0xa8, 0x2b, 0x32, 0x27, 0xea, 0x24, 0xa5, 0x9e, 0xdc, 0x26, 0xd4, 0x29, 0xc8, 0x25, 0xe0, 0x28, - 0xa2, 0x2b, 0x0a, 0x29, 0xe9, 0x29, 0xb2, 0x2a, 0xa4, 0x2c, 0xd9, 0x2a, 0xae, 0x2a, 0x58, 0x2b, - 0xfd, 0x26, 0xc0, 0xa0, 0x03, 0xa1, 0xbc, 0x20, 0xcf, 0x26, 0x1c, 0xa0, 0x9d, 0x24, 0x80, 0x25, - 0xaa, 0x25, 0xca, 0x20, 0x38, 0x1c, 0x7d, 0xa4, 0x70, 0x24, 0xa4, 0x20, 0x94, 0xa4, 0xa7, 0x20, - 0x8a, 0x26, 0x40, 0x0e, 0x08, 0x22, 0xc0, 0x14, 0xe8, 0x29, 0x38, 0x24, 0x21, 0x24, 0x7a, 0xa4, - 0xa2, 0x2b, 0xce, 0x1e, 0xbc, 0x26, 0x10, 0x21, 0x80, 0x04, 0x42, 0x26, 0xfc, 0x25, 0xe8, 0x91, - 0xc0, 0x1c, 0xa6, 0x1c, 0x94, 0x27, 0xa2, 0x9a, 0x52, 0x25, 0x70, 0x01, 0x81, 0x25, 0x74, 0x2a, - 0xd0, 0x22, 
0x7f, 0x22, 0x21, 0x28, 0x5f, 0x28, 0x95, 0x27, 0x2d, 0x1a, 0x95, 0x25, 0x80, 0x90, - 0xda, 0x2a, 0xf2, 0x22, 0xcb, 0x21, 0xcc, 0x1c, 0x28, 0xa4, 0x44, 0x25, 0x4a, 0x28, 0x5e, 0x25, - 0x9e, 0x24, 0xb1, 0x27, 0x1c, 0x20, 0xae, 0x27, 0x4e, 0x28, 0x13, 0x2c, 0xd2, 0x26, 0x4c, 0x26, - 0xd0, 0x22, 0x00, 0x22, 0xa6, 0x25, 0x27, 0x27, 0x3b, 0x19, 0x5e, 0x23, 0x45, 0x25, 0xd9, 0x25, - 0x10, 0x28, 0x62, 0x20, 0xbe, 0x26, 0x9f, 0x25, 0x63, 0x27, 0x9c, 0x25, 0x8e, 0x21, 0xa9, 0x21, - 0xf4, 0x24, 0x3f, 0x23, 0xaa, 0x25, 0x8e, 0x22, 0x64, 0x1c, 0x92, 0x25, 0x5f, 0x25, 0x0c, 0x21, - 0x5a, 0x28, 0x0b, 0x1e, 0xfe, 0x24, 0xeb, 0x23, 0xfe, 0x20, 0x00, 0x20, 0x36, 0x26, 0xe3, 0x24, - 0xb4, 0x22, 0x70, 0x24, 0x4e, 0x20, 0x5c, 0x22, 0xaa, 0x20, 0xa1, 0x22, 0xdf, 0x25, 0x57, 0x25, - 0xa2, 0x25, 0xf5, 0x27, 0x9e, 0x25, 0x2a, 0x24, 0xcf, 0x25, 0x33, 0x27, 0xae, 0x25, 0x93, 0x99, - 0x01, 0x24, 0x20, 0x21, 0xa8, 0x24, 0x2c, 0x22, 0xd6, 0x22, 0x65, 0x26, 0xdd, 0x99, 0xee, 0x26, - 0xa7, 0x28, 0x90, 0x23, 0x8a, 0x24, 0x4c, 0x26, 0xb3, 0x27, 0x84, 0x23, 0xa4, 0x23, 0x13, 0x28, - 0x4c, 0x1d, 0xb4, 0x1d, 0x5f, 0x96, 0x0d, 0x1a, 0xca, 0x19, 0xe0, 0x15, 0x32, 0x1e, 0x9f, 0x23, - 0x54, 0x17, 0x94, 0x10, 0xb2, 0x22, 0x05, 0x24, 0xae, 0x23, 0x1d, 0x21, 0x1a, 0x16, 0x00, 0x9c, - 0xde, 0x24, 0x2f, 0x17, 0x75, 0x22, 0xc3, 0x24, 0x58, 0x23, 0x95, 0x1e, 0xec, 0x1f, 0xb0, 0x99, - 0x56, 0x24, 0xb1, 0x1d, 0x70, 0x25, 0x0c, 0x9a, 0x00, 0x9b, 0x56, 0x1f, 0x0a, 0x26, 0x24, 0x1e, - 0xef, 0x98, 0x8e, 0x1f, 0x36, 0x24, 0x98, 0x15, 0x7a, 0x94, 0x4b, 0x1f, 0x92, 0x22, 0x8e, 0x23, - 0x74, 0x22, 0xaa, 0x21, 0x3c, 0x24, 0x74, 0x1c, 0xba, 0x23, 0x73, 0x24, 0x66, 0x20, 0x5e, 0x15, - 0x64, 0x22, 0x9b, 0x19, 0x4d, 0x23, 0x84, 0x24, 0x2e, 0xa0, 0x67, 0x22, 0x4f, 0x19, 0xbe, 0x24, - 0xf2, 0x23, 0x2e, 0x21, 0x88, 0x95, 0xe4, 0x22, 0x72, 0x21, 0x0f, 0x25, 0x6a, 0x18, 0xea, 0x23, - 0xeb, 0x39, 0xba, 0xb1, 0xb0, 0x32, 0x17, 0x39, 0xcc, 0x38, 0x00, 0x26, 0xf9, 0x38, 0xb0, 0x37, - 0x32, 0x3c, 0xca, 0x35, 0xc8, 0x31, 0xf3, 0xb7, 
0xc6, 0x37, 0xff, 0x34, 0x75, 0xb4, 0x46, 0x38, - 0x42, 0x36, 0xc4, 0x32, 0xd7, 0x34, 0xcc, 0xb4, 0x0a, 0x3a, 0xc2, 0x38, 0x43, 0x38, 0x18, 0xb1, - 0x24, 0x3e, 0x54, 0x2e, 0x86, 0x35, 0xd6, 0x38, 0x7a, 0x34, 0x19, 0x38, 0xa3, 0x34, 0xb2, 0x30, - 0xb5, 0x36, 0x62, 0x32, 0xe5, 0x35, 0xb8, 0x2e, 0x90, 0x39, 0x18, 0x25, 0x5a, 0x38, 0x92, 0x3c, - 0x67, 0x35, 0x42, 0x38, 0x62, 0x39, 0xdc, 0x3b, 0x55, 0x39, 0x48, 0x2b, 0x53, 0x39, 0x80, 0xae, - 0xc4, 0x3c, 0xc8, 0x36, 0x7c, 0x31, 0x65, 0xb2, 0x24, 0x2e, 0x8c, 0x38, 0x66, 0x39, 0xf6, 0x36, - 0xf8, 0x38, 0x78, 0x39, 0x6b, 0x38, 0x13, 0x3a, 0x1e, 0x3c, 0xcd, 0x3c, 0xc0, 0x3a, 0x9a, 0x39, - 0x38, 0xb0, 0x0e, 0xb1, 0x87, 0xaf, 0x8c, 0xb2, 0x42, 0xb1, 0xff, 0xb2, 0xe3, 0xb0, 0x5f, 0xb1, - 0x22, 0xaf, 0x85, 0xaf, 0x03, 0xb2, 0xba, 0xb0, 0x74, 0xb1, 0x1e, 0xb1, 0xdb, 0xb0, 0x8c, 0xb1, - 0x1e, 0xb3, 0x69, 0xb1, 0x06, 0xb2, 0x98, 0xb0, 0x50, 0xb2, 0x1b, 0xb0, 0x52, 0xb1, 0x74, 0xae, - 0xc6, 0xb2, 0xa9, 0xb0, 0xfe, 0xb1, 0x60, 0xae, 0x82, 0xad, 0x21, 0xb1, 0xbb, 0xb1, 0x51, 0xb0, - 0xe9, 0xae, 0x19, 0xb0, 0xe3, 0xaf, 0x10, 0xb1, 0xbc, 0xaf, 0x18, 0xb2, 0x17, 0xb3, 0xd6, 0xb0, - 0x2e, 0xb2, 0x1d, 0xb3, 0x2e, 0xb0, 0x3c, 0xb1, 0x7a, 0xb0, 0xb6, 0xae, 0x6e, 0xb2, 0x66, 0xaf, - 0xd4, 0xb0, 0xbc, 0xb0, 0xb2, 0xb0, 0x7a, 0xb2, 0x64, 0xb0, 0xb7, 0xb4, 0xb6, 0xad, 0xd5, 0xb0, - 0xb1, 0xb1, 0x51, 0xb1, 0x18, 0xb2, 0x2c, 0xb2, 0xe2, 0xb0, 0x1e, 0xac, 0x0a, 0xae, 0x7d, 0xb1, - 0x91, 0x2e, 0x44, 0x2e, 0xf8, 0x2d, 0xcc, 0x2f, 0xe0, 0x2d, 0xff, 0x2e, 0x94, 0x2e, 0x34, 0x2f, - 0x78, 0x2f, 0x00, 0x2f, 0x76, 0x2f, 0x72, 0x2f, 0x61, 0x2e, 0x0a, 0x2f, 0x92, 0x2e, 0xa2, 0x2f, - 0x0f, 0x2e, 0x03, 0x2f, 0x82, 0x2d, 0x58, 0x2d, 0x47, 0x2f, 0x01, 0x30, 0xa7, 0x2f, 0x62, 0x2e, - 0x30, 0x30, 0xd6, 0x2e, 0x0a, 0x2e, 0x85, 0x2d, 0xad, 0x2e, 0x5b, 0x2f, 0xc2, 0x2d, 0xf0, 0x2e, - 0x8b, 0x2e, 0xc8, 0x2d, 0x64, 0x2e, 0x85, 0x2a, 0x03, 0x2e, 0x63, 0x2e, 0x88, 0x2f, 0x59, 0x2d, - 0x0c, 0x31, 0xb5, 0x2f, 0x12, 0x2e, 0xa2, 0x2f, 0x1c, 0x30, 0x6c, 0x2c, 0x52, 0x2f, 
0x98, 0x2d, - 0x4c, 0x2f, 0xfe, 0x2e, 0xe2, 0x2e, 0xac, 0x2e, 0x62, 0x2d, 0x9e, 0x2e, 0x74, 0x2d, 0x5e, 0x30, - 0xc0, 0x2f, 0x7f, 0x2f, 0xb4, 0x2f, 0x7f, 0x2f, 0x5c, 0x30, 0x75, 0x2b, 0xea, 0x2c, 0xea, 0x2d, - 0x22, 0x27, 0x14, 0x2a, 0x8e, 0x29, 0x94, 0x25, 0x48, 0x2b, 0x96, 0x29, 0xa8, 0x24, 0x12, 0x24, - 0x8c, 0x1e, 0x4b, 0x25, 0x71, 0x2a, 0x40, 0x20, 0x67, 0x26, 0xed, 0x28, 0xcc, 0x27, 0x1d, 0x28, - 0x10, 0x2b, 0x00, 0x28, 0xc8, 0x28, 0x76, 0x29, 0x56, 0x2a, 0xb0, 0x27, 0xc4, 0x24, 0x40, 0x9c, - 0x3e, 0x2c, 0x86, 0x27, 0x3f, 0x2b, 0x08, 0x28, 0xc0, 0x18, 0xa4, 0x28, 0xb6, 0x25, 0x2c, 0x28, - 0xa0, 0x97, 0xe5, 0x29, 0x72, 0x25, 0x1a, 0x24, 0x1d, 0x25, 0xd9, 0x29, 0xea, 0x24, 0x8a, 0x22, - 0x5c, 0x27, 0x0d, 0x2c, 0x47, 0x28, 0xe4, 0x29, 0x31, 0x26, 0x40, 0x26, 0x93, 0x28, 0x6f, 0x27, - 0xe1, 0x29, 0x23, 0x2b, 0x76, 0x28, 0xfa, 0x28, 0xf0, 0x1e, 0x24, 0x2e, 0xcd, 0x2c, 0x66, 0x29, - 0xee, 0x29, 0x50, 0x27, 0x0e, 0x2c, 0xd7, 0x2c, 0x6e, 0x29, 0x96, 0x26, 0x1d, 0x26, 0x3d, 0x2a, - 0x58, 0xa2, 0x0a, 0xa2, 0x44, 0xa2, 0x6c, 0xa2, 0xb3, 0xa1, 0xc7, 0xa1, 0xc1, 0xa1, 0x30, 0xa2, - 0x39, 0xa3, 0xec, 0xa2, 0x09, 0xa3, 0xa0, 0xa2, 0x64, 0xa1, 0xb4, 0xa2, 0x14, 0xa2, 0x07, 0xa3, - 0xe2, 0xa0, 0x54, 0xa2, 0x5c, 0xa0, 0x0e, 0xa1, 0xa2, 0xa2, 0x18, 0xa4, 0xd2, 0xa2, 0xd4, 0xa1, - 0x12, 0xa4, 0x79, 0xa2, 0x80, 0xa1, 0xbc, 0xa1, 0x9a, 0xa2, 0x05, 0xa3, 0x6d, 0xa0, 0xdc, 0xa2, - 0xf6, 0xa1, 0xee, 0xa1, 0x1c, 0xa2, 0x7e, 0x9a, 0xaa, 0xa1, 0x96, 0xa1, 0xbf, 0xa1, 0x2b, 0xa0, - 0xc6, 0xa4, 0x14, 0xa3, 0xe8, 0xa1, 0x8c, 0xa3, 0x0c, 0xa4, 0x23, 0xa0, 0x46, 0xa2, 0x7c, 0xa1, - 0x5d, 0xa3, 0x4c, 0xa3, 0xa6, 0xa2, 0x8e, 0xa1, 0x4b, 0xa0, 0x5e, 0xa1, 0xf1, 0xa2, 0x7f, 0xa4, - 0x74, 0xa3, 0xe5, 0xa2, 0x9e, 0xa3, 0xa4, 0xa3, 0x78, 0xa4, 0x09, 0xa0, 0xe6, 0xa0, 0x6a, 0xa1, - 0x08, 0x9d, 0xeb, 0x9e, 0x6c, 0x9f, 0x36, 0x99, 0xd3, 0x9f, 0xd6, 0x9c, 0xb0, 0x99, 0xf6, 0x98, - 0x50, 0x9a, 0xc4, 0x9c, 0x48, 0x9f, 0x62, 0x98, 0x42, 0x9a, 0x16, 0x9e, 0xc2, 0x9c, 0x22, 0x9d, - 0xbc, 0x9d, 0xa3, 0x9c, 
0x74, 0x9b, 0xee, 0x9d, 0xb2, 0x9e, 0xa2, 0x9e, 0x9e, 0x9a, 0x14, 0x94, - 0xd5, 0xa0, 0x04, 0x9d, 0x23, 0x9f, 0xdb, 0x9d, 0x84, 0x99, 0xfc, 0x9d, 0xb0, 0x97, 0xf3, 0x9d, - 0x0c, 0x95, 0x55, 0x9f, 0x2d, 0x9c, 0x47, 0x0d, 0x81, 0x9b, 0xa2, 0x9d, 0xa6, 0x95, 0x56, 0x94, - 0xe6, 0x9d, 0x26, 0xa0, 0x87, 0x9d, 0x98, 0x9f, 0x85, 0x9d, 0x68, 0x9b, 0x7d, 0x9c, 0xfe, 0x9c, - 0xc0, 0x9f, 0x8f, 0xa0, 0xdc, 0x9d, 0x69, 0x9c, 0x4c, 0x91, 0xfc, 0xa0, 0x6b, 0xa2, 0x28, 0xa0, - 0x3f, 0x9f, 0xb8, 0x9c, 0xb4, 0xa0, 0x84, 0xa1, 0x23, 0xa0, 0xbe, 0x9c, 0x61, 0x9c, 0x5c, 0x9e, - 0xd6, 0x9d, 0xd8, 0x9d, 0x08, 0x9d, 0xe4, 0x9f, 0x9a, 0x9d, 0x5e, 0x9f, 0x52, 0x9e, 0xfc, 0x9e, - 0x4d, 0x9e, 0xf1, 0x9d, 0x09, 0x9f, 0xe1, 0x9e, 0x6a, 0x9e, 0x76, 0x9e, 0x24, 0x9e, 0x20, 0x9f, - 0xb9, 0x9e, 0xb1, 0x9e, 0x0b, 0x9e, 0x1c, 0x9d, 0x1a, 0x9f, 0x9b, 0x9e, 0x31, 0x9f, 0x9a, 0x9d, - 0xca, 0x9f, 0x36, 0x9e, 0x18, 0x9e, 0x9c, 0x9c, 0x72, 0x9d, 0xaf, 0x9e, 0x39, 0x9e, 0x0c, 0x9e, - 0xcb, 0x9d, 0x10, 0x9d, 0xaa, 0x9d, 0x73, 0x9c, 0x6a, 0x9d, 0x80, 0x9e, 0x06, 0xa0, 0xa0, 0x9d, - 0x7e, 0xa0, 0x9c, 0x9f, 0x71, 0x9d, 0xc8, 0x9e, 0x02, 0x9f, 0x20, 0x9c, 0x5d, 0x9f, 0xf5, 0x9c, - 0x5e, 0x9e, 0x02, 0x9e, 0x30, 0x9e, 0xf7, 0x9e, 0x7c, 0x9d, 0xc5, 0x9f, 0xaa, 0x9b, 0x48, 0x9f, - 0x18, 0x9f, 0xf8, 0x9e, 0x0f, 0x9f, 0xd2, 0x9e, 0x4b, 0x9f, 0x03, 0x9a, 0x40, 0x9c, 0xda, 0x9d, - 0x03, 0x98, 0x88, 0x9a, 0xf5, 0x98, 0x8b, 0x99, 0xa0, 0x9b, 0x02, 0x9c, 0xf4, 0x97, 0x11, 0x98, - 0x3c, 0x91, 0x9a, 0x95, 0x4a, 0x9b, 0x66, 0x95, 0x2f, 0x99, 0x99, 0x99, 0xe3, 0x98, 0x55, 0x99, - 0xb3, 0x9c, 0x5c, 0x99, 0x16, 0x9b, 0x09, 0x9a, 0x9f, 0x9b, 0xf5, 0x96, 0x0c, 0x98, 0x8a, 0x8c, - 0x7c, 0x9c, 0x7e, 0x98, 0x2a, 0x9c, 0x38, 0x97, 0x18, 0x8b, 0x50, 0x99, 0x85, 0x99, 0x54, 0x98, - 0x6a, 0x8f, 0x99, 0x99, 0x90, 0x96, 0x53, 0x99, 0x82, 0x96, 0x73, 0x9b, 0x06, 0x9a, 0xff, 0x97, - 0xc4, 0x98, 0xae, 0x9c, 0x98, 0x98, 0x10, 0x9a, 0xb0, 0x96, 0x5e, 0x97, 0xac, 0x9a, 0xe1, 0x97, - 0xc2, 0x99, 0x88, 0x9a, 0xef, 0x98, 0x39, 0x9b, 0x18, 0x96, 
0x63, 0x9f, 0xa0, 0x9a, 0xe6, 0x98, - 0x86, 0x9a, 0xdd, 0x98, 0x25, 0x9c, 0xb2, 0x9c, 0xf9, 0x98, 0x5d, 0x95, 0x2d, 0x96, 0x3b, 0x9b, - 0x2e, 0xb1, 0x6b, 0xb1, 0x14, 0xb2, 0x96, 0xaf, 0x72, 0xb1, 0x04, 0xb0, 0xa6, 0xaf, 0xc3, 0xaf, - 0x4c, 0xb1, 0x9b, 0xb1, 0x02, 0xb2, 0x40, 0xb0, 0x0e, 0xaf, 0x93, 0xb1, 0xbc, 0xb0, 0x5c, 0xb1, - 0x59, 0xaf, 0xbb, 0xb0, 0xbe, 0xad, 0x7e, 0xb0, 0x71, 0xb1, 0x10, 0xb3, 0xa2, 0xb0, 0x4e, 0xaf, - 0x6e, 0xb3, 0x27, 0xb1, 0xe1, 0xb0, 0x4b, 0xb1, 0xec, 0xb0, 0xc2, 0xb1, 0xb6, 0xac, 0xd6, 0xb1, - 0x8b, 0xaf, 0xb9, 0xb1, 0xc8, 0xb0, 0xb0, 0x1f, 0x56, 0xb0, 0x63, 0xb0, 0x88, 0xad, 0x35, 0xac, - 0x3e, 0xb3, 0x1e, 0xb2, 0x0e, 0xb1, 0xab, 0xb2, 0x84, 0xb2, 0xc0, 0xae, 0x62, 0xb0, 0xb0, 0xb0, - 0xb2, 0xb2, 0x27, 0xb3, 0x90, 0xb1, 0xad, 0xaf, 0x4f, 0xac, 0xec, 0xb0, 0x57, 0xb4, 0x05, 0xb4, - 0x5e, 0xb2, 0x2e, 0xb1, 0x24, 0xb3, 0xb6, 0xb3, 0xfb, 0xb3, 0xfc, 0xaf, 0x3b, 0xb0, 0xac, 0xb0, - 0x30, 0xa9, 0x79, 0xac, 0x86, 0xab, 0x2d, 0xaa, 0x66, 0xa5, 0x94, 0xad, 0x74, 0xae, 0xbb, 0xb0, - 0xdc, 0xac, 0x50, 0xa6, 0x3c, 0xb0, 0x30, 0xae, 0x50, 0xae, 0x80, 0xae, 0x0a, 0xac, 0x46, 0xac, - 0xa7, 0xaf, 0x0a, 0xae, 0x49, 0xae, 0x07, 0xad, 0x1a, 0xa9, 0xb5, 0xa9, 0xc2, 0xae, 0xb0, 0xaa, - 0x93, 0xae, 0x28, 0xaa, 0x24, 0xb0, 0x8a, 0xa9, 0x20, 0x1a, 0xc6, 0xa4, 0x26, 0xb1, 0xb9, 0xab, - 0xe0, 0x93, 0x09, 0xac, 0x7e, 0xad, 0x3c, 0xac, 0xf0, 0x1a, 0x96, 0xac, 0x9c, 0xaf, 0x09, 0xad, - 0xcb, 0xae, 0xce, 0xb0, 0x4d, 0xae, 0x2a, 0xa4, 0x43, 0xae, 0xa2, 0xae, 0xec, 0xae, 0x98, 0x9f, - 0x24, 0xad, 0xf8, 0xa5, 0xee, 0xae, 0xf6, 0xae, 0x4e, 0x21, 0x44, 0xb1, 0x5f, 0x25, 0xdf, 0xae, - 0x83, 0xb1, 0x88, 0xad, 0x27, 0xab, 0x56, 0xb0, 0xc8, 0xac, 0x10, 0xa9, 0x35, 0xa4, 0xab, 0xaf, - 0x7c, 0x2a, 0x08, 0x28, 0xda, 0x2a, 0x2a, 0x2c, 0x13, 0x25, 0xe8, 0x28, 0x1c, 0x2c, 0x8c, 0x2b, - 0xc5, 0x2d, 0x2e, 0x29, 0xfb, 0x2a, 0x45, 0x29, 0xb0, 0x2b, 0x31, 0x2c, 0xa4, 0x27, 0xd0, 0x2a, - 0xf8, 0x28, 0x62, 0x2b, 0x31, 0x29, 0xe2, 0x1e, 0x64, 0x27, 0x8f, 0x2a, 0xac, 0x2b, 0x22, 0x29, - 
0xa4, 0x2d, 0xb8, 0x26, 0x11, 0x28, 0xea, 0x29, 0x8c, 0x27, 0xc6, 0x28, 0x5e, 0x29, 0x0b, 0x28, - 0x46, 0x28, 0xec, 0x28, 0x8a, 0x29, 0x1f, 0x24, 0x72, 0x29, 0xd7, 0x26, 0x6c, 0x2b, 0x3c, 0x29, - 0x1f, 0x2c, 0x44, 0x2c, 0x2d, 0x2c, 0x82, 0x2a, 0xbe, 0x2b, 0x8e, 0x29, 0x88, 0x2a, 0x27, 0x18, - 0x94, 0x2b, 0x9c, 0x28, 0xdf, 0x2a, 0x32, 0x25, 0x90, 0x23, 0x6c, 0x2b, 0xf0, 0x15, 0x3e, 0x2c, - 0xda, 0x2d, 0x8c, 0x2a, 0x12, 0x2a, 0x66, 0x2d, 0x7f, 0x2d, 0x24, 0x26, 0x08, 0x28, 0xde, 0x2b, - 0x0e, 0x23, 0x40, 0x20, 0x7c, 0x1d, 0x72, 0xa0, 0xa1, 0x27, 0x94, 0x20, 0xaa, 0x24, 0x16, 0x26, - 0x80, 0x19, 0xf8, 0x1e, 0x20, 0x22, 0xc9, 0x9f, 0x70, 0x24, 0x3a, 0x24, 0x50, 0x18, 0xc2, 0x20, - 0x99, 0x29, 0x1e, 0x21, 0xfc, 0x27, 0x28, 0x27, 0x90, 0x28, 0x00, 0x24, 0x68, 0x1d, 0xb8, 0xa1, - 0xe7, 0x28, 0x82, 0x1e, 0x89, 0x2a, 0xfd, 0x23, 0x17, 0xa1, 0x22, 0x1e, 0xd2, 0x28, 0xf6, 0x15, - 0xc4, 0x9f, 0xe4, 0x26, 0xa0, 0x28, 0x54, 0x99, 0x50, 0x98, 0x64, 0x1d, 0x4a, 0x25, 0x8e, 0x28, - 0xbf, 0x25, 0xdc, 0x27, 0xf9, 0x26, 0x87, 0x24, 0x3a, 0x26, 0xa2, 0x22, 0xbd, 0x26, 0xfe, 0x1e, - 0x0f, 0x29, 0x84, 0x23, 0x47, 0x27, 0x0a, 0x26, 0x1e, 0xa7, 0x84, 0x2a, 0xcc, 0x27, 0x1c, 0x28, - 0x54, 0x28, 0x7c, 0x24, 0x53, 0x23, 0x4b, 0x29, 0x94, 0x24, 0xaa, 0x2a, 0x00, 0x85, 0x66, 0x28, - 0x1c, 0x9f, 0xc0, 0x99, 0x7c, 0x9e, 0xe6, 0x9f, 0xb3, 0x9b, 0xbd, 0x9a, 0x5a, 0x9f, 0x3b, 0x9d, - 0xb2, 0xa1, 0xb9, 0x9d, 0x78, 0x9c, 0x44, 0x99, 0xbe, 0x9e, 0x72, 0x9f, 0x14, 0x99, 0x60, 0x9e, - 0x1e, 0x9c, 0x27, 0x9e, 0x7a, 0x9c, 0xfd, 0x0a, 0xe6, 0x9c, 0x26, 0x9f, 0xdf, 0x9d, 0x9c, 0x9b, - 0xff, 0xa1, 0x6e, 0x99, 0x29, 0x9a, 0x6b, 0x9e, 0x20, 0x9c, 0x67, 0x9d, 0xc2, 0x99, 0xbd, 0x99, - 0xb4, 0x9c, 0x22, 0x9d, 0x92, 0x9d, 0x60, 0x85, 0x8a, 0x9e, 0xb8, 0x97, 0xe8, 0x9d, 0x6b, 0x9d, - 0x6a, 0x9f, 0xaf, 0x9e, 0x08, 0xa0, 0x14, 0xa0, 0x32, 0x9f, 0x94, 0x9b, 0x7e, 0x9d, 0x87, 0x8e, - 0x30, 0xa0, 0x7c, 0x9d, 0x02, 0x9e, 0xe6, 0x90, 0xce, 0x94, 0xe6, 0x9d, 0x00, 0x99, 0x16, 0xa0, - 0xf3, 0xa0, 0xce, 0x9d, 0x1c, 0x9e, 
0x39, 0xa1, 0xc8, 0xa1, 0x00, 0x9d, 0x46, 0x9c, 0x1e, 0x9f, - 0xa9, 0x9a, 0x05, 0x90, 0x36, 0x96, 0xa0, 0x8d, 0xdf, 0x9c, 0xc6, 0x8d, 0x49, 0x99, 0x84, 0x96, - 0xde, 0x98, 0x8d, 0x98, 0x30, 0x87, 0xb8, 0x18, 0xbe, 0x98, 0xda, 0x98, 0xda, 0x0c, 0x80, 0x97, - 0xa0, 0x9c, 0x11, 0x95, 0x08, 0x9b, 0xd2, 0x97, 0x96, 0x9d, 0x00, 0x9b, 0x6c, 0x8c, 0xc8, 0x15, - 0xe5, 0x9e, 0x6c, 0x91, 0x0e, 0x9d, 0x92, 0x9a, 0x80, 0x85, 0x64, 0x98, 0xe4, 0x98, 0xf8, 0x0a, - 0x14, 0x90, 0xe7, 0x9b, 0xd6, 0x9c, 0x54, 0x17, 0xe0, 0x97, 0x4c, 0x0c, 0x2c, 0x98, 0xe0, 0x9c, - 0x3f, 0x9a, 0x01, 0x9a, 0x2b, 0x9c, 0xc4, 0x9c, 0xfc, 0x9a, 0x66, 0x91, 0xec, 0x99, 0x32, 0x93, - 0x5c, 0x9e, 0x99, 0x9a, 0xd8, 0x9a, 0x96, 0x93, 0x51, 0x1a, 0x2b, 0x9d, 0x78, 0x9d, 0xa9, 0x9c, - 0xfe, 0x9b, 0xb9, 0x98, 0x70, 0x99, 0x0a, 0x9e, 0x98, 0x9c, 0xe0, 0x9f, 0x28, 0x94, 0x56, 0x9c, - 0xec, 0x98, 0x97, 0x98, 0xfa, 0x99, 0xdd, 0x9a, 0xea, 0x91, 0xb0, 0x99, 0xe6, 0x9b, 0x7c, 0x9c, - 0xb8, 0x9c, 0x8d, 0x97, 0x33, 0x9c, 0xa7, 0x9a, 0x88, 0x9b, 0x0a, 0x9c, 0x65, 0x98, 0x21, 0x9a, - 0xf1, 0x99, 0x65, 0x9b, 0xab, 0x99, 0xf7, 0x94, 0x54, 0x95, 0x0e, 0x99, 0x09, 0x9c, 0x26, 0x99, - 0xa6, 0x9c, 0xfd, 0x96, 0x8c, 0x99, 0xa0, 0x98, 0xd6, 0x94, 0xb3, 0x96, 0xbf, 0x9b, 0x73, 0x98, - 0x94, 0x95, 0x6a, 0x98, 0x54, 0x99, 0x20, 0x97, 0x68, 0x96, 0x5d, 0x98, 0xfb, 0x9b, 0xea, 0x98, - 0xfe, 0x9b, 0xba, 0x9c, 0xac, 0x9b, 0x02, 0x98, 0x5d, 0x9b, 0x89, 0x9a, 0xf0, 0x9a, 0x7d, 0x87, - 0x47, 0x9a, 0x5e, 0x96, 0x17, 0x9b, 0xab, 0x98, 0xf8, 0x91, 0x62, 0x9c, 0x89, 0x11, 0xf0, 0x9b, - 0x05, 0x9e, 0x6d, 0x9a, 0x2f, 0x99, 0x06, 0x9d, 0x58, 0x9c, 0x6f, 0x93, 0xf6, 0x95, 0xef, 0x9b, - 0x94, 0x90, 0x6f, 0x94, 0x2e, 0x90, 0x79, 0x0d, 0x00, 0x95, 0x5c, 0x95, 0x64, 0x96, 0x5d, 0x99, - 0xd8, 0x8b, 0x40, 0x89, 0x09, 0x98, 0xa5, 0x92, 0x53, 0x96, 0x1d, 0x96, 0x33, 0x92, 0x86, 0x92, - 0x73, 0x9a, 0x04, 0x95, 0xb5, 0x98, 0xa7, 0x98, 0xb7, 0x96, 0xa4, 0x91, 0xf4, 0x94, 0x40, 0x03, - 0x32, 0x98, 0xd3, 0x91, 0x86, 0x9b, 0xf8, 0x91, 0x7c, 0x12, 0x88, 0x84, 
0x4f, 0x9b, 0x74, 0x91, - 0xa0, 0x11, 0x78, 0x96, 0x98, 0x98, 0x94, 0x92, 0x77, 0x11, 0x54, 0x94, 0x29, 0x98, 0x5f, 0x98, - 0x65, 0x97, 0xc7, 0x99, 0x91, 0x97, 0xe6, 0x8c, 0x52, 0x97, 0xd7, 0x96, 0x68, 0x98, 0xce, 0x8d, - 0x42, 0x98, 0x00, 0x90, 0x81, 0x98, 0x01, 0x99, 0xbb, 0x15, 0x05, 0x9c, 0x58, 0x92, 0x6e, 0x98, - 0x21, 0x9a, 0xf9, 0x95, 0x2c, 0x93, 0xa3, 0x99, 0xe4, 0x92, 0xbc, 0x98, 0x06, 0x08, 0x27, 0x99, - 0xb1, 0xae, 0x08, 0xa5, 0xb7, 0xac, 0x08, 0xad, 0xcc, 0xad, 0xea, 0xa4, 0x6a, 0xad, 0xeb, 0xa8, - 0x41, 0xb0, 0x3b, 0xad, 0x6e, 0xa4, 0x6c, 0x25, 0xd5, 0xac, 0x51, 0xad, 0x80, 0xa0, 0xbc, 0xac, - 0x3c, 0xac, 0x6b, 0xab, 0x0d, 0xac, 0x80, 0x10, 0xbd, 0xae, 0xc7, 0xae, 0x44, 0xa9, 0x98, 0xa2, - 0xd1, 0xb1, 0x78, 0xa6, 0x4d, 0xab, 0x22, 0xae, 0x04, 0xaa, 0x0b, 0xad, 0xae, 0xa4, 0x30, 0xa3, - 0x4e, 0xab, 0x61, 0xad, 0xf2, 0xad, 0x94, 0x28, 0x0a, 0xae, 0x80, 0x8d, 0x6f, 0xab, 0xfa, 0xad, - 0xb7, 0xad, 0x69, 0xac, 0x05, 0xaf, 0x6a, 0xb0, 0xf4, 0xad, 0xee, 0xa5, 0x37, 0xac, 0x68, 0xa2, - 0x8a, 0xb0, 0xce, 0xad, 0xe6, 0xac, 0x24, 0x21, 0xd9, 0x24, 0x26, 0xad, 0xf0, 0xad, 0x4e, 0xaf, - 0x04, 0xaf, 0x5c, 0xac, 0x51, 0xad, 0xb7, 0xb0, 0x15, 0xb1, 0x36, 0xb0, 0xf2, 0xaa, 0x23, 0xae, - 0x34, 0xac, 0xc2, 0xac, 0x7d, 0xab, 0x81, 0xb0, 0xd3, 0xac, 0x44, 0xad, 0xcd, 0xa8, 0x9d, 0xa7, - 0xdb, 0xa9, 0xb0, 0xab, 0xb9, 0xac, 0xba, 0xae, 0xa2, 0xae, 0x33, 0xac, 0x4e, 0xac, 0xad, 0xa8, - 0x6a, 0xae, 0xd2, 0xa8, 0x1c, 0xae, 0x56, 0xae, 0xde, 0xae, 0xd0, 0xad, 0x4c, 0xaa, 0x78, 0xa7, - 0x2d, 0xaf, 0x3a, 0xac, 0xd2, 0xac, 0xa7, 0xa7, 0x3d, 0xac, 0xbe, 0xae, 0x36, 0xab, 0x48, 0xad, - 0x55, 0xad, 0xa2, 0xac, 0xfc, 0xa8, 0xad, 0xab, 0x21, 0xad, 0x09, 0xae, 0xa6, 0xad, 0x82, 0xac, - 0xa2, 0xad, 0x9a, 0xac, 0x6e, 0xab, 0x1f, 0xaf, 0x06, 0xac, 0xd0, 0xac, 0x76, 0xac, 0x09, 0xac, - 0xdc, 0xa9, 0xb1, 0xad, 0x28, 0xab, 0x04, 0xae, 0xf8, 0xae, 0x8d, 0xad, 0x5e, 0xab, 0xa8, 0xad, - 0x9e, 0xa9, 0x14, 0xab, 0x6f, 0xad, 0x96, 0xaa, 0xff, 0xad, 0x0e, 0xab, 0xf6, 0xab, 0x58, 0xad, - 0xaa, 0x29, 
0x31, 0x29, 0xbc, 0x29, 0x46, 0x2c, 0x79, 0x2a, 0xa0, 0x2a, 0x19, 0x29, 0x16, 0x2a, - 0x9f, 0x2a, 0xe6, 0x29, 0x3a, 0x2b, 0x13, 0x2a, 0x59, 0x2a, 0xa8, 0x28, 0xaa, 0x29, 0x44, 0x2a, - 0x26, 0x2a, 0x46, 0x28, 0x29, 0x2a, 0x7f, 0x2a, 0xd6, 0x2a, 0x76, 0x2c, 0xc5, 0x2a, 0x5d, 0x28, - 0x58, 0x2c, 0x02, 0x2a, 0xe1, 0x2a, 0xce, 0x29, 0x73, 0x2b, 0x7e, 0x2a, 0xdc, 0x29, 0x4f, 0x2c, - 0xd3, 0x2b, 0xca, 0x29, 0xe4, 0x26, 0xe9, 0x28, 0x84, 0x29, 0xbc, 0x2a, 0xfc, 0x2a, 0xac, 0x2b, - 0x42, 0x2c, 0x03, 0x2c, 0xda, 0x27, 0x33, 0x2c, 0x95, 0x2b, 0xc9, 0x28, 0x1c, 0x2c, 0xac, 0x29, - 0x88, 0x2a, 0xe2, 0x2a, 0x8e, 0x28, 0x60, 0x2a, 0x5b, 0x2c, 0x6e, 0x2a, 0x82, 0x2b, 0x8c, 0x2b, - 0xa8, 0x29, 0x90, 0x2a, 0x51, 0x2c, 0x15, 0x28, 0xa4, 0x2b, 0x5a, 0x2a, 0x43, 0x2b, 0xce, 0x2a, - 0xca, 0x24, 0x0e, 0x25, 0x79, 0x20, 0x55, 0x26, 0xaa, 0x24, 0xda, 0x21, 0xb0, 0x18, 0x00, 0x1f, - 0x88, 0x1e, 0x5c, 0x20, 0xa0, 0x27, 0xe6, 0x24, 0x2c, 0x24, 0x5a, 0x24, 0x08, 0x1f, 0xac, 0x1e, - 0x5a, 0x24, 0x56, 0x1f, 0x7b, 0x1c, 0x0f, 0x24, 0xe1, 0x27, 0x26, 0x22, 0xa6, 0x24, 0xca, 0x9c, - 0x38, 0x29, 0x20, 0x25, 0x8a, 0x22, 0x20, 0x90, 0x14, 0x1e, 0x80, 0x28, 0xc4, 0x1e, 0x65, 0x25, - 0xe0, 0x1a, 0xce, 0x1d, 0x15, 0x1e, 0x14, 0x21, 0x86, 0x25, 0x42, 0x27, 0x04, 0x21, 0x8c, 0x1e, - 0xe2, 0x20, 0x72, 0x24, 0x84, 0x25, 0x02, 0x27, 0x0a, 0x24, 0x52, 0x24, 0x86, 0x20, 0x68, 0x22, - 0xd2, 0x25, 0xae, 0x26, 0x7e, 0x20, 0xea, 0x24, 0x10, 0x23, 0xba, 0x25, 0x44, 0x28, 0x66, 0x24, - 0x3e, 0x21, 0xac, 0x25, 0x7c, 0x25, 0xc6, 0x26, 0x68, 0x27, 0x4d, 0x21, 0xfc, 0x26, 0xec, 0x23, - 0x84, 0x9d, 0xb4, 0x9c, 0x55, 0x9d, 0xb6, 0x9e, 0x20, 0x9e, 0xba, 0x9d, 0xf4, 0x9c, 0xa3, 0x9e, - 0xb6, 0x9e, 0x76, 0x9d, 0xae, 0x9f, 0xae, 0x9c, 0xeb, 0x9c, 0x37, 0x9c, 0xda, 0x9c, 0x9a, 0x9e, - 0xd5, 0x9c, 0x37, 0x9c, 0x5b, 0x9c, 0x3c, 0x9d, 0x1e, 0x9e, 0x1d, 0xa0, 0x60, 0x9f, 0x00, 0x9c, - 0x58, 0xa0, 0xfa, 0x9d, 0x56, 0x9e, 0xec, 0x9d, 0xfd, 0x9e, 0xfe, 0x9d, 0x73, 0x9d, 0x4e, 0xa0, - 0xb8, 0x9e, 0xc4, 0x9c, 0x38, 0x9a, 0x5b, 0x9c, 
0xfd, 0x9c, 0x4f, 0x9e, 0xe0, 0x9d, 0x20, 0x9f, - 0xb4, 0x9f, 0x09, 0xa0, 0xaa, 0x9b, 0xaf, 0x9f, 0xc2, 0x9f, 0x0a, 0x9c, 0xf0, 0x9f, 0x4b, 0x9d, - 0x71, 0x9f, 0x8e, 0x9e, 0x02, 0x9c, 0x6c, 0x9d, 0x72, 0x9f, 0xe7, 0x9d, 0x60, 0xa0, 0xec, 0x9e, - 0xd0, 0x9d, 0x20, 0x9f, 0x47, 0xa0, 0x80, 0x9c, 0x74, 0x9f, 0x46, 0x9e, 0xfe, 0x9f, 0x1b, 0x9e, - 0xcd, 0x99, 0x26, 0x99, 0x8e, 0x96, 0xd4, 0x98, 0xa1, 0x99, 0x62, 0x96, 0x71, 0x94, 0xf0, 0x98, - 0x22, 0x98, 0x78, 0x96, 0xf3, 0x9c, 0x20, 0x97, 0x06, 0x96, 0x78, 0x98, 0x1a, 0x94, 0x85, 0x98, - 0x9b, 0x96, 0x1b, 0x96, 0xd0, 0x07, 0xa1, 0x96, 0x9a, 0x9b, 0xa2, 0x98, 0xa4, 0x9b, 0xf0, 0x81, - 0xeb, 0x9d, 0x72, 0x9a, 0x47, 0x98, 0xe8, 0x94, 0x88, 0x96, 0x66, 0x9c, 0xc0, 0x95, 0xbe, 0x9b, - 0x6c, 0x92, 0xba, 0x91, 0xb1, 0x93, 0xc4, 0x95, 0x91, 0x99, 0xa8, 0x9b, 0x26, 0x95, 0x96, 0x96, - 0x79, 0x97, 0xda, 0x9a, 0xcc, 0x99, 0x8a, 0x9b, 0x9d, 0x9a, 0xc3, 0x97, 0x88, 0x98, 0x14, 0x98, - 0x8b, 0x9c, 0x71, 0x9b, 0x0d, 0x95, 0x94, 0x98, 0x8c, 0x97, 0x18, 0x9a, 0x28, 0x9e, 0x4a, 0x99, - 0xac, 0x98, 0x24, 0x9c, 0xb0, 0x9b, 0xe2, 0x9b, 0x4e, 0x9c, 0x81, 0x98, 0xfa, 0x9c, 0x7e, 0x98, - 0x2a, 0x99, 0x26, 0x99, 0x36, 0x99, 0xa6, 0x9c, 0xfd, 0x99, 0x7d, 0x9a, 0x48, 0x98, 0x7e, 0x98, - 0x60, 0x99, 0x5f, 0x99, 0x18, 0x9a, 0xbd, 0x9a, 0xee, 0x9a, 0x99, 0x98, 0x84, 0x99, 0xd6, 0x98, - 0xac, 0x9a, 0x56, 0x97, 0xd7, 0x9a, 0xde, 0x9a, 0xfa, 0x9a, 0x13, 0x9c, 0x46, 0x99, 0x77, 0x97, - 0xff, 0x9b, 0x58, 0x99, 0x5e, 0x9a, 0x84, 0x98, 0x95, 0x9a, 0x97, 0x9a, 0x45, 0x99, 0x73, 0x9b, - 0x74, 0x9b, 0xcd, 0x99, 0x7d, 0x96, 0xbe, 0x98, 0x7e, 0x99, 0x88, 0x9a, 0xf4, 0x9a, 0xd8, 0x9a, - 0xde, 0x9b, 0xd4, 0x9a, 0x91, 0x97, 0x11, 0x9c, 0x4a, 0x9a, 0x02, 0x99, 0x16, 0x9b, 0x3c, 0x99, - 0xe6, 0x98, 0x7e, 0x9a, 0x6a, 0x98, 0x8a, 0x9a, 0x4c, 0x9c, 0x3f, 0x9a, 0x9e, 0x99, 0x1b, 0x9b, - 0xa1, 0x98, 0x42, 0x99, 0x87, 0x9b, 0x0f, 0x97, 0x0a, 0x9b, 0x6d, 0x99, 0xc8, 0x99, 0x88, 0x9a, - 0x18, 0x95, 0x08, 0x96, 0x15, 0x92, 0xfe, 0x98, 0x5a, 0x95, 0xb9, 0x94, 0xf8, 0x8a, 
0x2c, 0x89, - 0xd1, 0x8d, 0x18, 0x92, 0xb9, 0x96, 0xae, 0x97, 0x08, 0x97, 0x3b, 0x95, 0xc4, 0x92, 0xe8, 0x8b, - 0xfd, 0x96, 0xd8, 0x8f, 0xba, 0x94, 0x98, 0x96, 0xa8, 0x98, 0x7a, 0x94, 0xe2, 0x92, 0x92, 0x07, - 0x39, 0x99, 0x2d, 0x95, 0x5a, 0x94, 0xb8, 0x05, 0xe6, 0x90, 0xff, 0x98, 0x08, 0x91, 0x6c, 0x95, - 0x26, 0x92, 0xfd, 0x92, 0x62, 0x90, 0x4e, 0x93, 0x93, 0x96, 0x0c, 0x98, 0xbc, 0x94, 0x72, 0x91, - 0x11, 0x94, 0x5d, 0x94, 0xb3, 0x95, 0x38, 0x98, 0x5d, 0x93, 0xcf, 0x95, 0xa7, 0x91, 0xeb, 0x93, - 0x00, 0x94, 0x53, 0x97, 0xae, 0x92, 0xd1, 0x96, 0x10, 0x96, 0xc8, 0x96, 0x0b, 0x96, 0xa6, 0x95, - 0x9c, 0x90, 0x8d, 0x94, 0xa2, 0x95, 0xf1, 0x95, 0xc8, 0x97, 0xa0, 0x91, 0x88, 0x95, 0x5a, 0x95, - 0xde, 0xac, 0xdc, 0xab, 0xef, 0xab, 0x2b, 0xac, 0x15, 0xad, 0xaa, 0xab, 0x55, 0xab, 0xee, 0xad, - 0x71, 0xad, 0x05, 0xac, 0xb4, 0xaf, 0xba, 0xa9, 0xb8, 0xa9, 0xf9, 0xaa, 0x1d, 0xaa, 0xa6, 0xad, - 0xe9, 0xa9, 0xde, 0xaa, 0xcc, 0xa5, 0x86, 0xaa, 0x43, 0xad, 0xf6, 0xad, 0x16, 0xaf, 0xbc, 0xa8, - 0x3d, 0xb0, 0x6a, 0xad, 0xc1, 0xac, 0x8d, 0xac, 0x01, 0xad, 0x9f, 0xad, 0xe3, 0xab, 0x75, 0xaf, - 0xea, 0xab, 0x65, 0xa9, 0x7b, 0xa8, 0x3c, 0xaa, 0x2f, 0xac, 0x9f, 0xad, 0x3f, 0xab, 0x0b, 0xad, - 0x52, 0xad, 0xef, 0xae, 0x9c, 0xab, 0x46, 0xae, 0xc7, 0xae, 0x02, 0xaa, 0x10, 0xae, 0x26, 0xac, - 0xb2, 0xaf, 0xce, 0xad, 0xa0, 0xa9, 0xd7, 0xab, 0xd2, 0xac, 0xe8, 0xac, 0xc8, 0xb0, 0x51, 0xad, - 0x08, 0xad, 0x0a, 0xaf, 0x5d, 0xaf, 0x1d, 0xad, 0xc5, 0xae, 0x1d, 0xad, 0x10, 0xb0, 0x8a, 0xac, - 0x1e, 0xa5, 0xc1, 0xa7, 0x72, 0xa7, 0x00, 0xac, 0xac, 0x9b, 0x3e, 0xa4, 0xf0, 0xa1, 0x0b, 0xa5, - 0x2c, 0xa7, 0xfc, 0xa2, 0xe4, 0xa9, 0x18, 0xad, 0x56, 0xac, 0xaa, 0xa8, 0xe0, 0xa5, 0x46, 0x24, - 0x3e, 0xaa, 0x00, 0x8d, 0x74, 0xaa, 0x42, 0xac, 0x50, 0xa7, 0x43, 0xaa, 0xd5, 0xa4, 0xfe, 0x9c, - 0x60, 0xab, 0x6c, 0xa4, 0xef, 0xa9, 0x14, 0x97, 0x8e, 0xa5, 0x6c, 0xa8, 0x0c, 0xaa, 0xca, 0xa9, - 0xa4, 0xa7, 0x1c, 0xa9, 0x2d, 0xa5, 0x27, 0xa3, 0x9e, 0xa4, 0x77, 0xa8, 0xbb, 0xa8, 0x61, 0xa8, - 0xb2, 0xa9, 0x96, 0xa8, 
0x60, 0xa9, 0xcf, 0xa8, 0x5a, 0xa9, 0xc6, 0xac, 0xf8, 0xa5, 0x9a, 0xa0, - 0x62, 0xa1, 0xcd, 0xa6, 0xb2, 0xa8, 0x0a, 0xaa, 0x7f, 0xa8, 0x8b, 0xa5, 0x01, 0x99, 0x42, 0xac, - 0x42, 0xa9, 0xe4, 0xa3, 0x82, 0xa4, 0xd4, 0xa5, 0x02, 0xab, 0x78, 0xa9, 0xf3, 0xa5, 0x61, 0xab, - 0xac, 0x24, 0x7f, 0x1f, 0x60, 0x26, 0xe9, 0x28, 0x78, 0x23, 0x2b, 0x24, 0x42, 0x25, 0x50, 0x25, - 0xed, 0x28, 0xf4, 0x21, 0x83, 0x26, 0x0c, 0x21, 0xa0, 0x27, 0x3b, 0x23, 0x00, 0x20, 0x13, 0x24, - 0x26, 0x25, 0xaa, 0x1f, 0x7a, 0x26, 0x55, 0x22, 0xa1, 0x21, 0x34, 0x28, 0xe8, 0x25, 0x66, 0x1c, - 0xf2, 0x29, 0xc6, 0x1e, 0xbc, 0x25, 0x7c, 0x26, 0x55, 0x25, 0x17, 0x22, 0x94, 0x25, 0x69, 0x27, - 0xd7, 0x26, 0xf2, 0x24, 0xae, 0x95, 0x5d, 0x25, 0x74, 0x24, 0x1a, 0x24, 0x56, 0x26, 0xf0, 0x28, - 0x16, 0x25, 0x96, 0x28, 0x12, 0x24, 0x05, 0x28, 0x76, 0x26, 0x49, 0x26, 0x2a, 0x28, 0xa7, 0x16, - 0x0b, 0x26, 0x5f, 0x24, 0x71, 0x20, 0xf9, 0x1f, 0x47, 0x28, 0x0b, 0x27, 0x4d, 0x24, 0x1d, 0x26, - 0x84, 0x27, 0xae, 0x24, 0xee, 0x27, 0xee, 0x23, 0x6c, 0x28, 0x28, 0x28, 0x5e, 0x28, 0xd4, 0x28, - 0xf7, 0x21, 0x50, 0x09, 0x42, 0xa1, 0xda, 0x20, 0xc9, 0x1c, 0x13, 0x9e, 0xc4, 0x18, 0x84, 0x21, - 0x9e, 0x1c, 0x78, 0x16, 0x72, 0x21, 0xb8, 0x21, 0x60, 0x22, 0x2a, 0x1e, 0x7c, 0x9f, 0x04, 0x9d, - 0xc5, 0x21, 0x21, 0x9b, 0x44, 0x15, 0x5b, 0x20, 0x1a, 0x26, 0xf2, 0x1c, 0xe6, 0x21, 0x32, 0xa0, - 0xdc, 0x26, 0x57, 0x20, 0x1c, 0x21, 0x50, 0xa0, 0x80, 0x89, 0x20, 0x25, 0x52, 0x23, 0xa6, 0x1d, - 0x38, 0x10, 0x00, 0x9c, 0xd6, 0x22, 0x00, 0x11, 0x66, 0x21, 0x28, 0x20, 0x63, 0x21, 0x8c, 0x24, - 0x96, 0x1e, 0x80, 0x14, 0xba, 0x24, 0xbf, 0x22, 0x0d, 0x24, 0xb8, 0x21, 0x36, 0x1c, 0xc3, 0x15, - 0xff, 0x24, 0xa5, 0x1d, 0xad, 0x1d, 0xe6, 0x21, 0x1a, 0x9b, 0x80, 0x88, 0xa8, 0x21, 0x34, 0x22, - 0x18, 0x1c, 0x41, 0x24, 0x70, 0x9a, 0xc3, 0x20, 0x46, 0x24, 0x06, 0x26, 0xd6, 0x23, 0x3b, 0x20, - 0x71, 0x99, 0x58, 0x86, 0x9e, 0x98, 0x3e, 0x9c, 0xb4, 0x98, 0x85, 0x96, 0xc0, 0x99, 0x22, 0x9a, - 0x2f, 0x9d, 0x9c, 0x95, 0xc3, 0x99, 0x0d, 0x11, 0xce, 0x99, 
0xef, 0x94, 0xb8, 0x82, 0x86, 0x99, - 0x02, 0x98, 0xb0, 0x93, 0x77, 0x98, 0x40, 0x84, 0xbd, 0x97, 0x3c, 0x9b, 0xfa, 0x9a, 0xb8, 0x84, - 0x8a, 0x9e, 0xe3, 0x92, 0xbe, 0x98, 0xcd, 0x9a, 0xe6, 0x98, 0xb1, 0x96, 0xef, 0x98, 0x5f, 0x9a, - 0x3a, 0x9a, 0x9c, 0x95, 0x02, 0x0b, 0x8c, 0x99, 0x32, 0x99, 0xc0, 0x96, 0x2a, 0x9a, 0xaa, 0x9d, - 0x71, 0x97, 0x56, 0x9c, 0xd0, 0x97, 0x37, 0x9c, 0x85, 0x9a, 0x31, 0x97, 0x6a, 0x9c, 0xf2, 0x80, - 0x44, 0x9c, 0xfe, 0x97, 0xd3, 0x8d, 0xa1, 0x89, 0x82, 0x9b, 0xf6, 0x9a, 0x20, 0x9a, 0x0a, 0x98, - 0xa0, 0x9a, 0x43, 0x9a, 0xfe, 0x9b, 0x28, 0x98, 0x4a, 0x9c, 0xbc, 0x9c, 0x3b, 0x9d, 0x40, 0x9c, - 0x16, 0x98, 0x12, 0x11, 0x2f, 0x15, 0x26, 0x95, 0x5a, 0x95, 0x26, 0x11, 0x0d, 0x94, 0x17, 0x98, - 0xd4, 0x96, 0xf6, 0x8d, 0x76, 0x95, 0xee, 0x0f, 0x82, 0x94, 0x77, 0x8e, 0x06, 0x16, 0x0c, 0x91, - 0x64, 0x94, 0x32, 0x09, 0x46, 0x0b, 0x48, 0x0d, 0x80, 0x9a, 0x02, 0x92, 0x9e, 0x98, 0xd2, 0x14, - 0x87, 0x9c, 0x35, 0x94, 0x66, 0x94, 0xa0, 0x03, 0xcc, 0x8c, 0x02, 0x99, 0xcd, 0x96, 0x4c, 0x92, - 0xdc, 0x8e, 0xc7, 0x13, 0x2a, 0x95, 0x86, 0x91, 0x95, 0x97, 0x05, 0x93, 0x7e, 0x96, 0x12, 0x9b, - 0xc6, 0x8f, 0x3e, 0x92, 0x70, 0x98, 0xb3, 0x98, 0x98, 0x98, 0xd9, 0x8e, 0x04, 0x96, 0x7a, 0x00, - 0xb2, 0x9b, 0x83, 0x92, 0xd0, 0x82, 0x76, 0x90, 0x50, 0x01, 0x32, 0x91, 0xf2, 0x98, 0x83, 0x92, - 0xbb, 0x91, 0xec, 0x99, 0x04, 0x90, 0xc7, 0x95, 0xed, 0x98, 0xa4, 0x9b, 0x9c, 0x9a, 0xee, 0x94, - 0x26, 0x93, 0x5d, 0x92, 0x56, 0x96, 0xd6, 0x98, 0x84, 0x90, 0xde, 0x93, 0xa6, 0x93, 0xf6, 0x93, - 0x96, 0x97, 0x45, 0x91, 0x81, 0x96, 0x5f, 0x96, 0x48, 0x98, 0x59, 0x94, 0x48, 0x92, 0x38, 0x8e, - 0xd7, 0x95, 0x64, 0x8d, 0x43, 0x97, 0x01, 0x96, 0xd6, 0x90, 0x11, 0x98, 0x3e, 0x94, 0xa6, 0x8d, - 0xd9, 0x98, 0x0b, 0x8f, 0x19, 0x96, 0x70, 0x94, 0xb0, 0x94, 0x52, 0x92, 0xd7, 0x95, 0x40, 0x97, - 0x1d, 0x96, 0xe3, 0x95, 0xba, 0x87, 0x24, 0x94, 0xba, 0x92, 0x6d, 0x94, 0xc9, 0x95, 0x59, 0x97, - 0xc2, 0x95, 0xe3, 0x97, 0x5c, 0x94, 0xb6, 0x96, 0xeb, 0x95, 0x25, 0x98, 0x61, 0x96, 0x53, 0x8a, - 
0xb0, 0x92, 0x27, 0x94, 0x1a, 0x93, 0xa6, 0x93, 0xb4, 0x97, 0xb7, 0x95, 0x5d, 0x90, 0x98, 0x97, - 0x1c, 0x97, 0x04, 0x92, 0x1f, 0x96, 0xea, 0x92, 0x18, 0x98, 0xb3, 0x96, 0x0a, 0x96, 0xa7, 0x98, - 0x70, 0x90, 0xdf, 0x8e, 0xf8, 0x0a, 0x84, 0x93, 0xa4, 0x83, 0xe8, 0x08, 0x94, 0x81, 0xc0, 0x8f, - 0x16, 0x89, 0xa0, 0x88, 0xf2, 0x92, 0xa5, 0x96, 0x03, 0x95, 0x60, 0x91, 0xb3, 0x80, 0x6a, 0x11, - 0x07, 0x94, 0x2f, 0x0b, 0xaf, 0x90, 0x37, 0x95, 0x0c, 0x95, 0xce, 0x90, 0x83, 0x8f, 0x96, 0x0c, - 0xab, 0x95, 0x4a, 0x90, 0x19, 0x93, 0xf3, 0x10, 0x36, 0x86, 0xd3, 0x94, 0x4f, 0x94, 0xe9, 0x90, - 0xfd, 0x89, 0x8e, 0x8c, 0x01, 0x93, 0x64, 0x00, 0xd8, 0x8f, 0x9d, 0x91, 0xbe, 0x91, 0x2d, 0x92, - 0x05, 0x92, 0x3e, 0x8a, 0xe4, 0x94, 0xf6, 0x91, 0xfc, 0x93, 0x8d, 0x95, 0x76, 0x88, 0xf9, 0x89, - 0x04, 0x91, 0x25, 0x8f, 0xd2, 0x91, 0x97, 0x94, 0x84, 0x82, 0xc6, 0x80, 0x04, 0x8c, 0x2e, 0x95, - 0xaa, 0x8f, 0x56, 0x91, 0xf0, 0x0a, 0x62, 0x90, 0x88, 0x94, 0xde, 0x94, 0x62, 0x90, 0x8c, 0x92, - 0x0f, 0xaa, 0xe2, 0x22, 0xec, 0x9b, 0xc8, 0xa9, 0x12, 0xa9, 0xdc, 0x9e, 0x19, 0xa9, 0x8f, 0xaa, - 0x66, 0xac, 0x1e, 0xa4, 0x80, 0xa8, 0x22, 0x28, 0xfe, 0xa6, 0x8d, 0xa0, 0x96, 0x25, 0x4f, 0xa9, - 0x99, 0xa5, 0x74, 0xa0, 0xd4, 0xa0, 0x00, 0x25, 0x61, 0xaa, 0x94, 0xa8, 0x8c, 0xab, 0xfe, 0x22, - 0xdd, 0xae, 0x46, 0xa4, 0xcc, 0xa6, 0xa6, 0xa8, 0x59, 0xa6, 0xc5, 0xa8, 0x59, 0xa8, 0x24, 0xa8, - 0x06, 0xa8, 0x60, 0x1d, 0xe2, 0x9e, 0x6c, 0xa8, 0xbe, 0xa9, 0xfc, 0xa4, 0x6f, 0xa9, 0xed, 0xad, - 0x82, 0xa3, 0x0d, 0xaa, 0x86, 0xa8, 0x04, 0xac, 0x7b, 0xaa, 0xf3, 0x9d, 0x93, 0xab, 0x70, 0x18, - 0x8a, 0xad, 0x36, 0xa6, 0xcc, 0x1c, 0xa4, 0x1b, 0x20, 0xa8, 0x1e, 0xa9, 0xb6, 0xab, 0x6e, 0xa3, - 0x5a, 0xa8, 0x0a, 0xac, 0xc3, 0xa9, 0x3a, 0xa8, 0xbf, 0xab, 0x55, 0xad, 0xa2, 0xad, 0xdd, 0xa9, - 0x38, 0xbe, 0x37, 0xc0, 0x5e, 0xbe, 0x42, 0xbf, 0x01, 0xbf, 0x28, 0xc2, 0x7f, 0xc1, 0xfc, 0xc2, - 0x39, 0xbf, 0x0e, 0xbd, 0x3f, 0xc2, 0x4d, 0xbf, 0x67, 0xc0, 0x0d, 0xc1, 0x13, 0xc0, 0x86, 0xc1, - 0x6d, 0xc2, 0xde, 0xc1, 0x20, 0xc1, 
0xb0, 0xbe, 0x10, 0xc0, 0xf2, 0xbc, 0xb6, 0xc1, 0x88, 0xbe, - 0x83, 0xc1, 0x4e, 0xbf, 0x24, 0xc2, 0x19, 0xbe, 0x83, 0xb7, 0x3b, 0xbd, 0xeb, 0xc2, 0x4f, 0xbe, - 0x06, 0xb9, 0x7a, 0xbe, 0x4f, 0xc0, 0x7f, 0xc0, 0x91, 0xba, 0xbe, 0xc0, 0xac, 0xc2, 0x43, 0xc0, - 0x98, 0xc1, 0xa6, 0xc3, 0x4c, 0xc0, 0x10, 0xbd, 0x73, 0xc0, 0x6b, 0xbe, 0x4e, 0xc2, 0x2d, 0xbc, - 0xe0, 0xc0, 0x74, 0xbd, 0x02, 0xc1, 0xc6, 0xc1, 0x44, 0xb9, 0xdc, 0xc4, 0x58, 0xb7, 0x58, 0xc0, - 0x63, 0xc3, 0x37, 0xc1, 0xaf, 0xc0, 0xff, 0xc2, 0x58, 0xbf, 0x47, 0xb9, 0x88, 0xba, 0x4d, 0xc1, - 0xdc, 0x3d, 0x11, 0x3d, 0x60, 0x3d, 0x72, 0x3e, 0x96, 0x3b, 0x74, 0x3d, 0x90, 0x3e, 0x99, 0x3e, - 0xc3, 0x3f, 0xce, 0x3d, 0x40, 0x3e, 0x30, 0x3e, 0xc3, 0x3d, 0x37, 0x3f, 0x1d, 0x3d, 0xba, 0x3e, - 0xc6, 0x3c, 0x0d, 0x3f, 0x52, 0x3c, 0xa2, 0x39, 0x4c, 0x3d, 0x00, 0x3e, 0xcc, 0x3e, 0xcb, 0x3d, - 0xc6, 0x3f, 0x1b, 0x3d, 0x3a, 0x3c, 0xaa, 0x3c, 0x86, 0x3c, 0xce, 0x3d, 0xb6, 0x3c, 0x69, 0x3c, - 0x66, 0x3c, 0xa2, 0x3c, 0x42, 0x3e, 0x2c, 0x37, 0x1a, 0x3d, 0x6e, 0x3c, 0x87, 0x3e, 0x39, 0x3b, - 0x4b, 0x40, 0x9e, 0x3e, 0x90, 0x3e, 0xd6, 0x3d, 0x0a, 0x3f, 0xd2, 0x3b, 0x9e, 0x3d, 0x62, 0x3a, - 0x87, 0x3e, 0x46, 0x3d, 0xad, 0x3e, 0x99, 0x3c, 0xa1, 0x38, 0xe2, 0x3d, 0xb2, 0x38, 0xc0, 0x3f, - 0x38, 0x40, 0x6a, 0x3e, 0xb2, 0x3d, 0x45, 0x40, 0x34, 0x40, 0xea, 0x37, 0x08, 0x3a, 0x35, 0x3d, - 0x15, 0x35, 0x68, 0x38, 0xc0, 0x38, 0x08, 0xaa, 0x39, 0x3b, 0xca, 0x38, 0x88, 0x36, 0xef, 0x35, - 0x80, 0x29, 0x4e, 0x34, 0x09, 0x38, 0x88, 0xb1, 0x60, 0x35, 0x2c, 0x38, 0x4b, 0x36, 0xcc, 0x37, - 0x13, 0x3c, 0x91, 0x37, 0x50, 0x3a, 0x9c, 0x39, 0x8d, 0x39, 0x25, 0x37, 0xf8, 0x2e, 0xd8, 0xae, - 0xce, 0x3a, 0x23, 0x34, 0x9f, 0x3c, 0xf4, 0x38, 0x0c, 0xb0, 0xa4, 0x31, 0xc2, 0x38, 0xfa, 0x33, - 0x5e, 0xb0, 0xec, 0x3a, 0x95, 0x38, 0x3c, 0x2e, 0x80, 0x25, 0x4a, 0x36, 0xb5, 0x35, 0xf8, 0x36, - 0x25, 0x38, 0x12, 0x3c, 0xcd, 0x37, 0x2c, 0x38, 0x46, 0x36, 0x7c, 0x34, 0x56, 0x39, 0x8e, 0x35, - 0x2e, 0x3a, 0x40, 0x39, 0x64, 0x39, 0x86, 0x38, 0x09, 0xb5, 0x84, 0x3e, 
0xfe, 0x3b, 0xb9, 0x39, - 0x00, 0x3b, 0x20, 0x35, 0x96, 0x3a, 0xc2, 0x3c, 0x31, 0x37, 0xd8, 0x39, 0xb0, 0x27, 0xd8, 0x3a, - 0xe2, 0xb1, 0xb6, 0xb0, 0xb4, 0xb1, 0xb9, 0xb1, 0x22, 0xb0, 0x38, 0xb0, 0x9a, 0xb1, 0xc8, 0xb0, - 0x76, 0xb3, 0x0e, 0xb2, 0xfa, 0xb0, 0x34, 0xb1, 0x1b, 0xb1, 0xcc, 0xb2, 0x98, 0xb0, 0xe8, 0xb1, - 0xdc, 0xaf, 0x16, 0xb2, 0x6e, 0xaf, 0x70, 0xad, 0x4e, 0xb1, 0x99, 0xb2, 0x48, 0xb1, 0x09, 0xb1, - 0xbf, 0xb3, 0x98, 0xb0, 0x4c, 0xaf, 0xf6, 0xb0, 0xba, 0xb0, 0xdb, 0xb1, 0xd4, 0xad, 0x02, 0xb0, - 0x64, 0xb0, 0x37, 0xb1, 0x22, 0xb2, 0xa0, 0x19, 0x48, 0xb1, 0xd4, 0xae, 0xd9, 0xb0, 0x84, 0xad, - 0x13, 0xb4, 0x7d, 0xb1, 0x5e, 0xb2, 0x7d, 0xb2, 0xb4, 0xb2, 0xdd, 0xae, 0x73, 0xb0, 0x8b, 0xae, - 0x78, 0xb2, 0xee, 0xb1, 0x6c, 0xb2, 0xb1, 0xae, 0x49, 0xaa, 0x7a, 0xb0, 0xda, 0xaf, 0x10, 0xb4, - 0x98, 0xb3, 0x6c, 0xb1, 0xa7, 0xb1, 0x36, 0xb4, 0x6c, 0xb4, 0xae, 0xad, 0x98, 0xad, 0xca, 0xb0, - 0x6c, 0xac, 0xd7, 0xac, 0x9b, 0xae, 0xa0, 0xa2, 0xf8, 0xaf, 0x17, 0xab, 0xa2, 0xaa, 0x8e, 0xa5, - 0x90, 0xa9, 0x89, 0xac, 0x5c, 0xaa, 0xd8, 0x20, 0x47, 0xaa, 0x64, 0xad, 0x0c, 0xab, 0x2c, 0xac, - 0xbd, 0xae, 0xd8, 0xab, 0x91, 0xad, 0x6b, 0xad, 0x92, 0xae, 0x74, 0xae, 0x48, 0x9e, 0x00, 0x98, - 0x40, 0xb0, 0x56, 0xa9, 0xee, 0xaf, 0x6c, 0xae, 0x9e, 0xa5, 0x18, 0xab, 0x32, 0xa8, 0x1c, 0xa9, - 0xf4, 0xa2, 0x48, 0xb0, 0xea, 0xad, 0xd8, 0x28, 0x13, 0xa9, 0x84, 0xa8, 0x1e, 0xa6, 0x0d, 0xa9, - 0xe1, 0xad, 0xe6, 0xae, 0x5a, 0xad, 0xfa, 0xae, 0xa7, 0xac, 0xcc, 0xa8, 0x2e, 0xac, 0x52, 0xab, - 0x70, 0xaf, 0x9d, 0xaf, 0x78, 0xae, 0xe8, 0xa9, 0x25, 0x29, 0xfc, 0xb0, 0x3f, 0xb1, 0x38, 0xb0, - 0x48, 0xaf, 0x4b, 0xa9, 0x77, 0xaf, 0x65, 0xb1, 0x1d, 0xaf, 0x30, 0xaf, 0xac, 0xa3, 0xb9, 0xae, - 0xec, 0xac, 0xd8, 0xac, 0x72, 0xac, 0xd7, 0xad, 0xd8, 0xaa, 0x13, 0xae, 0x8c, 0xae, 0x62, 0xaf, - 0x8c, 0xae, 0x9e, 0xac, 0xad, 0xae, 0xcc, 0xad, 0x8a, 0xad, 0x9a, 0xae, 0xed, 0xac, 0x97, 0xae, - 0x7a, 0xad, 0xfb, 0xae, 0xb2, 0xac, 0xd2, 0xa9, 0xcb, 0xac, 0x90, 0xac, 0x08, 0xaf, 0x4d, 0xad, - 0xeb, 0xae, 
0xce, 0xac, 0xe4, 0xac, 0xdc, 0xab, 0xa2, 0xaa, 0xbd, 0xac, 0x0a, 0xae, 0x21, 0xac, - 0xd6, 0xaa, 0xa1, 0xab, 0x98, 0xad, 0xca, 0xaa, 0xd6, 0xab, 0xdf, 0xac, 0x30, 0xaf, 0x17, 0xac, - 0xb8, 0xaf, 0x3c, 0xaf, 0xd6, 0xad, 0x70, 0xac, 0x48, 0xae, 0xab, 0xab, 0x32, 0xae, 0x80, 0xa9, - 0xe6, 0xad, 0x18, 0xac, 0x23, 0xae, 0x63, 0xad, 0xaf, 0xa8, 0x68, 0xaf, 0x65, 0xa4, 0x65, 0xae, - 0x27, 0xb0, 0x5f, 0xae, 0x41, 0xad, 0xe6, 0xaf, 0xa4, 0xae, 0x3a, 0xa5, 0x51, 0xa9, 0x44, 0xad, - 0xfe, 0xa4, 0x1c, 0xa9, 0x19, 0xa8, 0x41, 0xa0, 0x8e, 0xaa, 0x42, 0xab, 0x33, 0xa9, 0x97, 0xaa, - 0xaa, 0x9f, 0xa8, 0xa2, 0x96, 0xaa, 0xb4, 0x9c, 0x00, 0xa8, 0x01, 0xa9, 0x30, 0xa8, 0x8b, 0xa9, - 0xea, 0xac, 0xb6, 0xa9, 0x86, 0xab, 0xce, 0xa9, 0x8b, 0xa9, 0xc3, 0xa4, 0xca, 0xa7, 0x40, 0x9d, - 0xda, 0xaa, 0x2e, 0xa6, 0x34, 0xad, 0x5c, 0xa8, 0xa8, 0x21, 0x3f, 0xa1, 0x2c, 0xac, 0x97, 0xa5, - 0x4f, 0x20, 0xc1, 0xa9, 0xd7, 0xa8, 0x8c, 0xa8, 0xf8, 0x15, 0x2c, 0xa9, 0x36, 0xaa, 0x30, 0xa9, - 0x04, 0xa9, 0x32, 0xad, 0x4c, 0xa8, 0xbd, 0xa5, 0xa0, 0xa7, 0x68, 0xa6, 0xad, 0xab, 0x3c, 0xa5, - 0x4b, 0xaa, 0xe5, 0xa7, 0xe8, 0xa9, 0x09, 0xab, 0x20, 0x20, 0xea, 0xaf, 0xd6, 0xa8, 0xf8, 0xa8, - 0x5a, 0xac, 0x89, 0xa8, 0xac, 0xaa, 0xe0, 0xac, 0x6c, 0xa5, 0x1f, 0xa8, 0x6d, 0x9d, 0xad, 0xab, - 0xea, 0xc0, 0xed, 0xbf, 0x89, 0xc1, 0xd2, 0xbe, 0xd5, 0xc0, 0x5a, 0xbd, 0x77, 0xbf, 0x37, 0xbc, - 0x52, 0xc1, 0x38, 0xc1, 0x1c, 0xbe, 0x11, 0xbd, 0x30, 0xbf, 0x6a, 0xc1, 0xe6, 0xbe, 0x3d, 0xc0, - 0x0e, 0xbf, 0x31, 0xc0, 0x94, 0xbe, 0xf6, 0xbd, 0x06, 0xc1, 0x4a, 0xc2, 0xda, 0xbc, 0x8a, 0xbd, - 0x10, 0xc3, 0x83, 0xbe, 0xa3, 0xbf, 0xfc, 0xc0, 0x2a, 0xbf, 0xb6, 0xc0, 0x3c, 0xb8, 0xdc, 0xbd, - 0x29, 0xbe, 0xde, 0xc1, 0x51, 0xc1, 0x14, 0x3c, 0x32, 0xc0, 0x05, 0xbc, 0x8f, 0xbc, 0x0c, 0xbb, - 0x66, 0xc2, 0x60, 0xc0, 0x49, 0xc1, 0x62, 0xc2, 0x3d, 0xc1, 0xe9, 0xbc, 0x0b, 0xbe, 0x22, 0xbe, - 0xf2, 0xc1, 0x2a, 0xc2, 0x8a, 0xc1, 0xcb, 0xbb, 0x25, 0x2c, 0x3a, 0xc0, 0x38, 0xc2, 0x99, 0xc3, - 0x0e, 0xc2, 0xe2, 0xbe, 0x6e, 0xc1, 0xf7, 0xc3, 
0xce, 0xc3, 0x32, 0xc0, 0x96, 0xbb, 0x68, 0xc0, + 0x01, 0xdc, 0x79, 0xc0, 0xf6, 0x49, 0x06, 0xc0, 0x11, 0xc1, 0x72, 0xbf, 0x81, 0x05, 0x0d, 0xc0, + 0x43, 0x26, 0x52, 0xc0, 0x24, 0x39, 0x84, 0xbf, 0xe1, 0xe6, 0x04, 0xc0, 0xd1, 0x83, 0xce, 0xbf, + 0xd9, 0xb7, 0xac, 0xbf, 0x8e, 0x01, 0x2e, 0xc0, 0x6e, 0xb2, 0xe5, 0xbf, 0xe6, 0x56, 0x2c, 0xc0, + 0x4a, 0x9d, 0xdc, 0xbf, 0x79, 0x7b, 0x22, 0xc0, 0xa3, 0xf6, 0x2b, 0xc0, 0x54, 0x01, 0x88, 0xbf, + 0x42, 0xd4, 0x41, 0xc0, 0x0f, 0xc5, 0x84, 0xbf, 0x31, 0xa1, 0x81, 0x3e, 0xae, 0xea, 0x13, 0xc0, + 0xc2, 0xd2, 0x72, 0xbf, 0xc5, 0xb6, 0x24, 0xc0, 0x05, 0x6d, 0xf4, 0xbf, 0x11, 0x24, 0xd7, 0xbf, + 0x41, 0xbf, 0x41, 0xc0, 0xee, 0x61, 0xaf, 0xbf, 0xf0, 0x02, 0x7d, 0xbf, 0xb5, 0xa3, 0xe7, 0xbf, + 0xcb, 0x9e, 0x23, 0xbf, 0x94, 0x87, 0xfa, 0xbf, 0xe3, 0xd4, 0xda, 0xbf, 0x59, 0x8c, 0x1a, 0xc0, + 0x76, 0x5d, 0x77, 0xbf, 0xc5, 0xef, 0x02, 0xbf, 0xd4, 0x13, 0x13, 0xc0, 0xc0, 0x36, 0x5e, 0xc0, + 0xfb, 0x68, 0x1a, 0xc0, 0x40, 0xe7, 0x8a, 0xbf, 0x9e, 0x64, 0xc0, 0xbf, 0x97, 0x9b, 0x04, 0xc0, + 0x6e, 0x53, 0x3e, 0xc0, 0xa6, 0x18, 0x58, 0xc0, 0x62, 0x52, 0x23, 0xc0, 0xfc, 0xe9, 0x23, 0xc0, + 0xae, 0x3d, 0x04, 0xc0, 0x2e, 0xd6, 0x0e, 0xc0, 0xa3, 0x58, 0x00, 0xc0, 0x8e, 0xa2, 0x7b, 0xbf, + 0xfe, 0x2a, 0x0b, 0xc0, 0x41, 0xc1, 0x14, 0xbf, 0xac, 0x1f, 0xdf, 0xbf, 0xd3, 0x3d, 0x00, 0xc0, + 0x97, 0x21, 0xb2, 0xbe, 0xf8, 0xc6, 0x08, 0xc0, 0x65, 0x84, 0x9a, 0xbf, 0x78, 0xb5, 0x28, 0xbf, + 0x08, 0x2f, 0xd7, 0xbf, 0x5f, 0x58, 0x7d, 0xc0, 0x38, 0xf5, 0xfa, 0xbf, 0xcb, 0x1f, 0xaf, 0xbf, + 0x6a, 0x3e, 0x04, 0xc0, 0xee, 0xa6, 0x4e, 0xc0, 0xbf, 0x65, 0xfd, 0xbf, 0x3b, 0x3b, 0x2b, 0xc0, + 0x10, 0xa2, 0x78, 0xc0, 0x8b, 0x1b, 0x42, 0xc0, 0x79, 0xb9, 0xfb, 0x3c, 0x74, 0x7d, 0x95, 0xbf, + 0x82, 0x5a, 0xcf, 0x3f, 0x8b, 0x8b, 0x30, 0x3f, 0x9c, 0x02, 0xc0, 0x3f, 0xa0, 0xa1, 0x91, 0x3f, + 0x12, 0x67, 0x2e, 0x3f, 0xda, 0xf2, 0x65, 0x3f, 0xf8, 0xe2, 0xc6, 0x3f, 0xaa, 0xe8, 0x94, 0x3f, + 0x0e, 0x62, 0xaa, 0x3f, 0xe9, 0xc0, 0x9e, 0x3f, 0x23, 0xe3, 0x8c, 0x3f, 0x60, 0xbc, 
0xc8, 0x3f, + 0x3d, 0x9f, 0x96, 0x3f, 0x48, 0x84, 0xb8, 0x3f, 0xfa, 0x5c, 0x8d, 0x3f, 0x02, 0x84, 0xf7, 0x3f, + 0x9e, 0xd7, 0xa6, 0x3f, 0xd6, 0x9a, 0xf7, 0x3f, 0xc1, 0xfb, 0xd4, 0x3f, 0x36, 0xe4, 0x96, 0x3f, + 0x68, 0xa8, 0xc3, 0x3f, 0x90, 0xda, 0x96, 0x3f, 0xf0, 0xe9, 0x87, 0x3f, 0x37, 0xb3, 0xbf, 0x3f, + 0xb2, 0x44, 0xc4, 0x3e, 0xb4, 0x7a, 0xdd, 0x3f, 0x7f, 0x8a, 0xa5, 0x3f, 0x06, 0x5c, 0xa6, 0x3f, + 0x27, 0xee, 0x3b, 0x3f, 0xf6, 0x92, 0x19, 0x3f, 0xfc, 0x71, 0xab, 0x3f, 0xd8, 0x08, 0xe0, 0x3f, + 0xe6, 0x06, 0xa8, 0x3f, 0xd8, 0x5d, 0x8d, 0x3f, 0x4e, 0x44, 0x0c, 0x3f, 0xe1, 0x6f, 0x9b, 0x3f, + 0x6e, 0x24, 0xca, 0x3f, 0x7e, 0x5e, 0xac, 0x3f, 0xcc, 0x58, 0x9c, 0x3f, 0x2c, 0x79, 0x87, 0x3f, + 0x34, 0x87, 0x99, 0x3f, 0x71, 0x67, 0xbd, 0x3f, 0x38, 0xdd, 0xa0, 0x3f, 0xfe, 0xd8, 0x5b, 0x3f, + 0x74, 0xd9, 0xf3, 0x3f, 0xd9, 0x9f, 0x90, 0x3f, 0x53, 0x8a, 0x9b, 0x3f, 0x40, 0xb7, 0xbf, 0x3f, + 0xad, 0x53, 0x58, 0x3f, 0xa4, 0x3b, 0x78, 0x3f, 0xa0, 0x8f, 0x92, 0x3f, 0xe2, 0x5f, 0x9d, 0x3f, + 0x2a, 0xf1, 0xd8, 0x3f, 0xaa, 0xf1, 0x02, 0x40, 0x9b, 0xc0, 0xc0, 0x3f, 0x80, 0x76, 0x93, 0x3f, + 0x59, 0x10, 0xac, 0x3f, 0x83, 0x18, 0xa7, 0x3f, 0x10, 0x2f, 0x02, 0x40, 0x50, 0xde, 0xf3, 0x3f, + 0xa8, 0xd4, 0x02, 0x40, 0x66, 0xda, 0xa4, 0x3f, 0x9a, 0x10, 0xf9, 0x3e, 0x54, 0xef, 0xa9, 0x3f, + 0x8a, 0x5a, 0xd9, 0x3e, 0x31, 0xae, 0x56, 0x3e, 0x2c, 0xff, 0xdd, 0x3d, 0x74, 0x6c, 0x06, 0xbd, + 0x80, 0x59, 0x3d, 0x3e, 0xac, 0x68, 0x80, 0x3f, 0x72, 0x26, 0x61, 0x3f, 0xc8, 0x22, 0x85, 0x3f, + 0x59, 0x9e, 0xd3, 0x3e, 0x1f, 0xb9, 0x0d, 0x3f, 0x0a, 0x25, 0x4a, 0x3f, 0xca, 0x15, 0xb1, 0x3e, + 0x4e, 0x73, 0x50, 0x3f, 0xf4, 0xfc, 0xfc, 0x3d, 0x02, 0x46, 0xbd, 0xbe, 0xc8, 0x41, 0xc0, 0x3e, + 0xce, 0x18, 0xa8, 0x3e, 0x30, 0xfa, 0xd8, 0x3c, 0x22, 0x4e, 0xcd, 0x3e, 0x2c, 0x14, 0xa0, 0x3e, + 0xb0, 0x5b, 0x92, 0x3f, 0xcb, 0x1e, 0x25, 0x3f, 0xd4, 0xc3, 0xe4, 0xbd, 0x42, 0x98, 0x1a, 0x3e, + 0x45, 0x26, 0x81, 0x3e, 0x7d, 0x81, 0xb4, 0x3e, 0xf2, 0x1a, 0x7c, 0x3f, 0xe3, 0xb1, 0x8d, 0x3e, + 0xc0, 0x86, 0x12, 0x3f, 
0x24, 0xa7, 0x59, 0xbe, 0x84, 0x0d, 0x28, 0x3f, 0x1e, 0x51, 0x74, 0x3f, + 0x6c, 0xed, 0xae, 0x3e, 0x4a, 0x3a, 0xca, 0x3e, 0x45, 0xca, 0xc8, 0xbe, 0x65, 0x13, 0x20, 0x3f, + 0x72, 0xcc, 0xc7, 0xbd, 0x0a, 0xbf, 0x8a, 0x3f, 0x0a, 0x72, 0x6b, 0x3e, 0xd8, 0x54, 0x44, 0x3f, + 0x74, 0xc8, 0x2a, 0x3e, 0x54, 0x0b, 0x47, 0x3e, 0xb0, 0xc8, 0x97, 0x3c, 0x28, 0xba, 0x2f, 0xbd, + 0xfd, 0x18, 0x4e, 0x3f, 0x68, 0xc8, 0x41, 0x3d, 0x94, 0xb2, 0x3a, 0x3e, 0xd6, 0x16, 0x7d, 0x3f, + 0xe2, 0x95, 0xf6, 0x3e, 0xc6, 0x55, 0x0d, 0x3f, 0xa2, 0x07, 0x6a, 0x3e, 0x00, 0xba, 0xf1, 0xbd, + 0x17, 0xfe, 0x04, 0x3f, 0x6d, 0xa7, 0xa6, 0x3e, 0x6f, 0x97, 0xb7, 0x3e, 0xbf, 0xcd, 0xbf, 0x3e, + 0x4a, 0x4b, 0x39, 0x3e, 0x12, 0xce, 0xdc, 0x3e, 0x26, 0x37, 0x0c, 0x3f, 0xc6, 0x5f, 0xf0, 0x3e, + 0xcc, 0xa1, 0x1a, 0x3f, 0xe2, 0xc1, 0x21, 0x3f, 0xda, 0x26, 0x37, 0x3f, 0xbe, 0xe5, 0x18, 0x3e, + 0xdd, 0x7a, 0x1e, 0xbe, 0x99, 0xcd, 0xa2, 0xbd, 0xc6, 0x14, 0x25, 0xbe, 0x6a, 0x03, 0xec, 0xbd, + 0x0c, 0x1a, 0x88, 0xbd, 0x19, 0x0f, 0xc1, 0xbd, 0x01, 0x71, 0x5e, 0xbe, 0xd3, 0x07, 0x0c, 0xbe, + 0x7d, 0xca, 0x26, 0xbe, 0x10, 0x31, 0x04, 0xbe, 0x42, 0x8f, 0xeb, 0xbd, 0xd0, 0x52, 0x5e, 0xbe, + 0x38, 0x1f, 0xee, 0xbd, 0xea, 0x56, 0x40, 0xbe, 0xb8, 0x59, 0x1f, 0xbe, 0x07, 0x5b, 0x6d, 0xbe, + 0xda, 0xa4, 0x38, 0xbe, 0x6d, 0xa2, 0x52, 0xbe, 0x2e, 0xee, 0x52, 0xbe, 0xeb, 0xb4, 0x0b, 0xbe, + 0x3b, 0x00, 0x3e, 0xbe, 0xa7, 0x47, 0x27, 0xbe, 0x02, 0x49, 0xf7, 0xbd, 0xd2, 0xff, 0x2f, 0xbe, + 0xeb, 0xea, 0x47, 0xbd, 0x93, 0x04, 0x59, 0xbe, 0x14, 0xb7, 0x3e, 0xbe, 0x08, 0x60, 0x03, 0xbe, + 0xb1, 0x29, 0xda, 0xbd, 0xe0, 0x36, 0x7f, 0xbd, 0x4f, 0xe7, 0x20, 0xbe, 0xd6, 0x30, 0x47, 0xbe, + 0xe6, 0xc5, 0x08, 0xbe, 0x60, 0xd2, 0x18, 0xbe, 0x90, 0x6b, 0x4c, 0xbc, 0xc1, 0xd0, 0x13, 0xbe, + 0x93, 0x11, 0x08, 0xbe, 0x0f, 0x17, 0x13, 0xbe, 0x2e, 0x91, 0xe0, 0xbd, 0xb2, 0xd3, 0xe4, 0xbd, + 0x22, 0x7d, 0xf4, 0xbd, 0xbc, 0x0e, 0x21, 0xbe, 0x8e, 0x11, 0xfb, 0xbd, 0x41, 0x52, 0xc0, 0xbd, + 0xd7, 0xce, 0x81, 0xbe, 0x3a, 0x23, 0x1c, 0xbe, 0xce, 0x7b, 
0x08, 0xbe, 0x07, 0x48, 0x54, 0xbe, + 0x4a, 0x94, 0x0e, 0xbe, 0x94, 0x84, 0xd0, 0xbd, 0xc2, 0x5e, 0x12, 0xbe, 0xd0, 0x4f, 0x20, 0xbe, + 0xf0, 0x8b, 0x65, 0xbe, 0x46, 0xd0, 0x43, 0xbe, 0x6a, 0xbd, 0x36, 0xbe, 0xa2, 0x6f, 0x15, 0xbe, + 0x17, 0x36, 0x11, 0xbe, 0x0c, 0xc3, 0xe0, 0xbd, 0xb0, 0x74, 0x88, 0xbe, 0x29, 0x5b, 0x61, 0xbe, + 0xd5, 0x50, 0x54, 0xbe, 0x66, 0x4d, 0xfd, 0xbd, 0x1e, 0xcc, 0xe0, 0xbd, 0xe6, 0xb3, 0x2b, 0xbe, + 0x72, 0x45, 0x10, 0xbd, 0x9a, 0x01, 0x01, 0xbd, 0x2c, 0x95, 0xb4, 0xbc, 0x80, 0xf3, 0xed, 0x3a, + 0x5e, 0xcc, 0x89, 0xbc, 0xe4, 0x8d, 0xd6, 0xbd, 0x69, 0x64, 0x19, 0xbe, 0x0a, 0xd3, 0xfe, 0xbd, + 0xd5, 0x9d, 0x91, 0xbd, 0x24, 0x83, 0x77, 0xbd, 0x12, 0xdb, 0xab, 0xbd, 0xbe, 0x4d, 0xc2, 0xbd, + 0x14, 0xe3, 0xa3, 0xbd, 0x8a, 0x80, 0x65, 0xbd, 0xc0, 0x7b, 0x06, 0xbc, 0xc0, 0x96, 0x9b, 0xbd, + 0x3d, 0x94, 0xa9, 0xbd, 0x80, 0x34, 0x74, 0xbc, 0x9c, 0x35, 0xa3, 0xbd, 0x89, 0x09, 0x4c, 0xbd, + 0xa8, 0x8f, 0x16, 0xbe, 0x3d, 0x4a, 0xe1, 0xbd, 0x70, 0x4a, 0x5e, 0xbb, 0x15, 0x65, 0x18, 0xbd, + 0xfa, 0x38, 0x12, 0xbd, 0x04, 0xdf, 0x97, 0xbd, 0xdc, 0x1f, 0x1f, 0xbe, 0x1e, 0x03, 0xd5, 0xbc, + 0x2b, 0x0a, 0xb9, 0xbd, 0x74, 0x21, 0x9b, 0x3c, 0xdb, 0xef, 0xb5, 0xbd, 0x38, 0x61, 0xeb, 0xbd, + 0x0e, 0x59, 0x15, 0xbd, 0x1e, 0x59, 0xa0, 0xbd, 0x1b, 0xfa, 0xa6, 0x3d, 0x04, 0xfe, 0xae, 0xbd, + 0x5c, 0x8a, 0x0e, 0x3d, 0x31, 0x62, 0xf0, 0xbd, 0x74, 0x02, 0x2c, 0xbc, 0x80, 0x22, 0xa9, 0xbd, + 0x6d, 0x00, 0x84, 0xbc, 0x84, 0x82, 0xeb, 0xbc, 0xc0, 0x3e, 0xa1, 0x3a, 0xe0, 0x04, 0x75, 0xbb, + 0xb6, 0xf9, 0x12, 0xbe, 0x6a, 0x21, 0x35, 0xbd, 0xd2, 0xc4, 0xf5, 0xbc, 0xf7, 0x24, 0x20, 0xbe, + 0x30, 0xbb, 0xd2, 0xbd, 0xc6, 0xff, 0x75, 0xbd, 0xe4, 0x18, 0x52, 0xbd, 0x42, 0x9b, 0xb2, 0xbc, + 0x48, 0x61, 0xda, 0xbd, 0x58, 0x12, 0x9d, 0xbc, 0x3a, 0x83, 0x82, 0xbd, 0x28, 0xa7, 0x8b, 0xbd, + 0x8a, 0xb7, 0xd0, 0xbc, 0x3c, 0xec, 0xb6, 0xbc, 0x45, 0x68, 0xf1, 0xbd, 0x22, 0xac, 0x9c, 0xbd, + 0xce, 0xad, 0x81, 0xbd, 0xef, 0xe2, 0x6d, 0xbd, 0xae, 0xd8, 0x07, 0xbe, 0x9f, 0x2d, 0x4c, 0xbd, + 
0xc9, 0x9f, 0xe1, 0xbd, 0x4b, 0xb6, 0x25, 0xbd, 0xe9, 0x4f, 0xb7, 0xbd, 0x23, 0x4a, 0x8f, 0xbd, + 0x5a, 0x4a, 0x3b, 0xbd, 0x27, 0x1b, 0x88, 0xbd, 0xc9, 0x11, 0x9e, 0xbd, 0xec, 0xa1, 0x9a, 0xbd, + 0xba, 0x7a, 0x96, 0xbd, 0xb0, 0xbc, 0xa7, 0xbd, 0x1e, 0x08, 0x9c, 0xbd, 0xf6, 0xda, 0x93, 0xbd, + 0x5a, 0xe7, 0xac, 0xbd, 0xf4, 0xf3, 0x8e, 0xbd, 0x7c, 0x47, 0x2b, 0xbd, 0x14, 0xc9, 0xd9, 0xbd, + 0x65, 0x27, 0x78, 0xbd, 0x78, 0xbb, 0xeb, 0xbd, 0x7c, 0xb4, 0xb6, 0xbd, 0xde, 0xe6, 0x8b, 0xbd, + 0x71, 0x74, 0xbf, 0xbd, 0xaf, 0x89, 0x71, 0xbd, 0x12, 0x49, 0x6c, 0xbd, 0xfa, 0xec, 0xac, 0xbd, + 0xfc, 0x31, 0xb3, 0xbc, 0x92, 0xab, 0xbe, 0xbd, 0x1b, 0x78, 0x84, 0xbd, 0x1b, 0x28, 0xaf, 0xbd, + 0xbb, 0x6a, 0x15, 0xbd, 0x8a, 0xf1, 0x09, 0xbd, 0xd0, 0x80, 0xa4, 0xbd, 0xcc, 0x80, 0xe4, 0xbd, + 0xfe, 0x6b, 0xae, 0xbd, 0xb8, 0x7f, 0x5f, 0xbd, 0x46, 0xd9, 0x4e, 0xbd, 0x92, 0xfa, 0x93, 0xbd, + 0x3e, 0x4a, 0xe1, 0xbd, 0x6f, 0xec, 0xbe, 0xbd, 0xe7, 0x06, 0xaf, 0xbd, 0x58, 0x12, 0x95, 0xbd, + 0x34, 0x60, 0x9e, 0xbd, 0x8c, 0xd8, 0xb8, 0xbd, 0xd9, 0x7e, 0xa4, 0xbd, 0x34, 0x35, 0x49, 0xbd, + 0x3c, 0xec, 0xc7, 0xbd, 0x85, 0xcb, 0x52, 0xbd, 0x0b, 0x28, 0x94, 0xbd, 0xb5, 0x1d, 0x9e, 0xbd, + 0x6e, 0xe3, 0x00, 0xbd, 0xc4, 0x48, 0x85, 0xbd, 0xb7, 0x94, 0x76, 0xbd, 0xb4, 0xe1, 0x6f, 0xbd, + 0x90, 0x3f, 0xae, 0xbd, 0x72, 0xf5, 0x0d, 0xbe, 0xda, 0x64, 0xad, 0xbd, 0xb7, 0x2c, 0x7b, 0xbd, + 0xd0, 0x22, 0xa9, 0xbd, 0x80, 0x6e, 0xc7, 0xbd, 0x8e, 0xc5, 0xd1, 0xbd, 0x30, 0xa0, 0xe1, 0xbd, + 0xa8, 0x11, 0x09, 0xbe, 0x9c, 0x42, 0xba, 0xbd, 0x32, 0xf7, 0x0f, 0xbc, 0xa5, 0x00, 0x8a, 0xbd, + 0x93, 0x6c, 0x4e, 0xbd, 0xfe, 0xd2, 0x86, 0xbc, 0x21, 0x37, 0xa3, 0xbc, 0x35, 0x64, 0x3b, 0xbc, + 0xcc, 0x46, 0xab, 0xbc, 0x8c, 0xda, 0x92, 0xbd, 0x61, 0x0a, 0x32, 0xbd, 0x9e, 0x03, 0x8e, 0xbd, + 0x9c, 0xfc, 0xe1, 0xbc, 0xd0, 0xd8, 0x42, 0xbd, 0x0c, 0x5c, 0x78, 0xbd, 0x1e, 0x78, 0x35, 0xbc, + 0x48, 0xc9, 0x87, 0xbd, 0xb8, 0xa5, 0x7f, 0xbb, 0x7c, 0x9c, 0x02, 0x3d, 0xb7, 0xe5, 0xe8, 0xbc, + 0xb7, 0xc7, 0x41, 0xbc, 0x4a, 0x71, 
0xa1, 0xbc, 0x70, 0x84, 0xd5, 0xbc, 0x57, 0xd1, 0xcc, 0xbc, + 0xde, 0x03, 0x98, 0xbd, 0x9d, 0x44, 0x04, 0xbd, 0x84, 0xb3, 0x81, 0x3a, 0xba, 0x19, 0x8c, 0xbc, + 0x18, 0x00, 0x7b, 0xbc, 0x04, 0x5a, 0xc6, 0xbc, 0xc6, 0xc1, 0x49, 0xbd, 0xee, 0xd0, 0x0c, 0xbd, + 0x6c, 0x96, 0xe8, 0xbc, 0x13, 0x92, 0xe3, 0x3b, 0xdf, 0x22, 0x3d, 0xbd, 0x92, 0x00, 0x93, 0xbd, + 0xb6, 0xf8, 0x15, 0xbd, 0x2b, 0x68, 0x9c, 0xbc, 0x90, 0x1b, 0x65, 0xbb, 0xa6, 0x51, 0x2f, 0xbd, + 0x51, 0x58, 0xd6, 0xbc, 0x79, 0x06, 0xa4, 0xbd, 0xed, 0xfc, 0x12, 0xbd, 0x7e, 0x73, 0x6e, 0xbd, + 0xc4, 0x86, 0xd2, 0xbc, 0x7f, 0xe8, 0xd4, 0xbc, 0x50, 0xce, 0x96, 0xbc, 0xd0, 0x58, 0x84, 0xbb, + 0x88, 0x90, 0x2e, 0xbd, 0x98, 0xa3, 0x2e, 0x3b, 0x5d, 0x93, 0xa9, 0xbc, 0x48, 0xf5, 0x53, 0xbd, + 0x2a, 0xd8, 0x59, 0xbc, 0xd6, 0x0c, 0x35, 0xbd, 0x8e, 0x8e, 0x6b, 0xbc, 0x72, 0x5c, 0x10, 0x3c, + 0x13, 0x91, 0xd7, 0xbc, 0xd2, 0x79, 0x57, 0xbd, 0x34, 0xa4, 0xdf, 0xbc, 0x3a, 0xe9, 0xb7, 0xbc, + 0xa8, 0x0e, 0xc8, 0xbc, 0x04, 0xd5, 0x5b, 0xbd, 0x46, 0x41, 0xe6, 0xbc, 0x30, 0x36, 0x1d, 0xbd, + 0xc2, 0x52, 0x7d, 0xbd, 0x49, 0x29, 0x6e, 0xbd, 0x14, 0xc6, 0xa5, 0xbc, 0x36, 0xc1, 0x0a, 0xbc, + 0x0e, 0x82, 0xa8, 0xbf, 0xad, 0x89, 0x7d, 0xbf, 0xff, 0x6f, 0xc5, 0xbf, 0x6b, 0x68, 0x5f, 0xbf, + 0x8c, 0x7a, 0x19, 0xbf, 0x9b, 0x99, 0xc7, 0xbf, 0xc0, 0x8b, 0x6b, 0xc0, 0xf8, 0xf5, 0x10, 0xc0, + 0x33, 0xe4, 0x0c, 0xc0, 0x5d, 0xab, 0xbf, 0xbf, 0x72, 0x7b, 0xc8, 0xbf, 0x64, 0xa1, 0x4f, 0xc0, + 0x7d, 0xbf, 0xb9, 0xbf, 0x12, 0xfc, 0x1e, 0xc0, 0x1e, 0x02, 0xf4, 0xbf, 0x55, 0x9c, 0x38, 0xc0, + 0xdb, 0x44, 0x2e, 0xc0, 0xa4, 0x4f, 0xeb, 0xbf, 0xb0, 0x22, 0x2f, 0xc0, 0xc0, 0x68, 0xd6, 0xbf, + 0x5b, 0x05, 0x3f, 0xc0, 0x3d, 0xe6, 0x2e, 0xc0, 0x59, 0x21, 0x93, 0xbf, 0x5a, 0x2a, 0xf3, 0xbf, + 0x8a, 0x79, 0x4a, 0xbf, 0xac, 0xaa, 0x2e, 0xc0, 0x85, 0xa0, 0x5a, 0xc0, 0x6b, 0xea, 0x90, 0xbf, + 0xc8, 0x94, 0xfc, 0xbf, 0xc3, 0x59, 0xb8, 0xbe, 0x01, 0x54, 0x0c, 0xc0, 0xd4, 0x40, 0x27, 0xc0, + 0x1d, 0xa3, 0xa7, 0xbf, 0x58, 0x76, 0x12, 0xc0, 0x8e, 0x89, 0x93, 0x3f, 
0x2f, 0x4d, 0x04, 0xc0, + 0x6e, 0x49, 0xb8, 0xbe, 0x92, 0xe8, 0x05, 0xc0, 0x36, 0xb6, 0x36, 0xbf, 0x1d, 0xdc, 0xc5, 0xbf, + 0x38, 0xfd, 0x80, 0xbf, 0x93, 0x77, 0xc4, 0xbf, 0x0a, 0x2d, 0x5d, 0xbf, 0x7f, 0x4f, 0x5a, 0xbf, + 0xb3, 0x17, 0x7b, 0xc0, 0x90, 0x53, 0x04, 0xc0, 0xc7, 0x59, 0xb3, 0xbf, 0x2d, 0xcd, 0x65, 0xc0, + 0x06, 0xc5, 0x28, 0xc0, 0xa7, 0xb5, 0xa4, 0xbf, 0xe8, 0x10, 0xf1, 0xbf, 0xd8, 0xbe, 0xeb, 0xbf, + 0xb7, 0x46, 0x53, 0xc0, 0xec, 0x4c, 0xb0, 0xbf, 0xa9, 0x00, 0x0f, 0xc0, 0x48, 0x47, 0x04, 0xc0, + 0x1f, 0xc5, 0xae, 0xbf, 0x67, 0xba, 0x2d, 0xbf, 0x08, 0x16, 0x75, 0xc0, 0xce, 0xb9, 0x2a, 0xc0, + 0x6c, 0xb7, 0x04, 0xc0, 0x6e, 0xae, 0xa4, 0xbf, 0xf9, 0x09, 0x2f, 0xc0, 0xd4, 0xee, 0x09, 0xc0, + 0x2c, 0x68, 0xae, 0x3f, 0xb7, 0x64, 0xa5, 0x3f, 0x2d, 0x39, 0xa3, 0x3f, 0x14, 0xdf, 0x41, 0x3f, + 0x88, 0x88, 0x87, 0x3f, 0xb8, 0x34, 0x88, 0x3f, 0x66, 0x50, 0xac, 0x3f, 0x4e, 0x13, 0xbb, 0x3f, + 0xc8, 0x4a, 0x8e, 0x3f, 0xb5, 0xcc, 0xb4, 0x3f, 0x10, 0xaa, 0xc7, 0x3f, 0x78, 0x4b, 0x71, 0x3f, + 0x88, 0x31, 0xff, 0x3f, 0xbd, 0xd1, 0x8f, 0x3f, 0x96, 0x85, 0x3b, 0x3f, 0x00, 0x62, 0xc7, 0x3f, + 0xf2, 0x3a, 0xad, 0x3f, 0x94, 0x60, 0xbf, 0x3f, 0xdd, 0x36, 0x9b, 0x3f, 0x12, 0x42, 0xa3, 0x3f, + 0x64, 0xa5, 0xe6, 0x3f, 0x04, 0xd3, 0xd0, 0x3f, 0x64, 0xb3, 0xaa, 0x3f, 0x7e, 0xaf, 0x69, 0x3f, + 0x97, 0xca, 0x73, 0x3f, 0x56, 0x1b, 0xc4, 0x3f, 0x48, 0xc8, 0x78, 0x3f, 0x3d, 0xe7, 0xe4, 0x3f, + 0x80, 0x32, 0xd4, 0x3f, 0x9a, 0x9e, 0x2c, 0x3f, 0x21, 0xf1, 0x7e, 0x3f, 0xce, 0xe2, 0xd6, 0x3f, + 0xe3, 0x15, 0xa1, 0x3f, 0x56, 0x0a, 0xe1, 0x3f, 0xc6, 0xdd, 0x94, 0x3f, 0xf5, 0x42, 0x9f, 0x3f, + 0xa2, 0x44, 0x6b, 0x3f, 0x42, 0xa9, 0xad, 0x3f, 0x8f, 0x7f, 0xd5, 0x3f, 0xd4, 0x2d, 0xcd, 0x3f, + 0x8a, 0xd7, 0x43, 0x3f, 0x43, 0x9a, 0xc7, 0x3f, 0x84, 0x3b, 0xcf, 0x3f, 0xd3, 0x4d, 0x9a, 0x3f, + 0x48, 0x46, 0x47, 0x3f, 0xba, 0xd6, 0x88, 0x3f, 0x53, 0xa4, 0x00, 0x40, 0x48, 0x36, 0xb2, 0x3f, + 0x2a, 0x9b, 0x89, 0x3f, 0xd0, 0x78, 0x12, 0x40, 0x32, 0xea, 0x68, 0x3f, 0xfa, 0x7c, 0x51, 0x3f, + 0x8d, 0xbb, 
0xc9, 0x3f, 0x8e, 0x8d, 0xfb, 0x3f, 0x3e, 0x36, 0xd6, 0x3f, 0xe0, 0x1c, 0x9d, 0x3f, + 0x87, 0x4f, 0x9f, 0x3f, 0xfc, 0x37, 0xc9, 0x3f, 0xd9, 0x05, 0xa4, 0x3f, 0xf3, 0xc5, 0x80, 0x3f, + 0xcc, 0x5b, 0xe5, 0x3f, 0xdb, 0xe6, 0xc6, 0x3f, 0xb4, 0x72, 0x8f, 0x3e, 0xcc, 0xc1, 0x6c, 0x3f, + 0x53, 0xf3, 0x49, 0xbf, 0x32, 0xa5, 0x50, 0xbf, 0x08, 0x65, 0x7a, 0xbf, 0x80, 0xc3, 0x65, 0xbf, + 0x10, 0x2f, 0x3a, 0xbf, 0xbd, 0x77, 0x4f, 0xbf, 0x94, 0x51, 0x65, 0xbf, 0xf8, 0xfd, 0x51, 0xbf, + 0x80, 0x7c, 0x44, 0xbf, 0x06, 0x58, 0x36, 0xbf, 0x78, 0xec, 0x4c, 0xbf, 0x56, 0x1c, 0x57, 0xbf, + 0xb3, 0x85, 0x89, 0xbf, 0x74, 0x82, 0x77, 0xbf, 0x8a, 0x13, 0x66, 0xbf, 0x0c, 0xbe, 0x92, 0xbf, + 0xf0, 0x02, 0x8e, 0xbf, 0x6b, 0x0f, 0x80, 0xbf, 0x62, 0xd8, 0x76, 0xbf, 0x25, 0xcb, 0x43, 0xbf, + 0x8c, 0x02, 0x7f, 0xbf, 0x6e, 0xd2, 0x67, 0xbf, 0x1a, 0xa3, 0x83, 0xbf, 0x7c, 0x0f, 0x5b, 0xbf, + 0xe5, 0xe3, 0x36, 0xbf, 0x68, 0xc3, 0x7b, 0xbf, 0x93, 0x9b, 0x63, 0xbf, 0x4a, 0x14, 0x80, 0xbf, + 0xaa, 0xf8, 0x71, 0xbf, 0x28, 0xd6, 0x2b, 0xbf, 0x1c, 0x7a, 0x4c, 0xbf, 0x30, 0xdf, 0x64, 0xbf, + 0x19, 0xc3, 0x6d, 0xbf, 0x20, 0x30, 0x73, 0xbf, 0x9c, 0x62, 0x18, 0xbf, 0x9d, 0x99, 0x39, 0xbf, + 0x40, 0x84, 0x51, 0xbf, 0x2f, 0xe2, 0x83, 0xbf, 0xd4, 0x34, 0x4f, 0xbf, 0xf0, 0x73, 0x76, 0xbf, + 0x33, 0x05, 0x54, 0xbf, 0xa5, 0x12, 0x86, 0xbf, 0xd4, 0xd0, 0x60, 0xbf, 0xdb, 0x27, 0x5f, 0xbf, + 0x70, 0xa5, 0x43, 0xbf, 0x82, 0x29, 0x62, 0xbf, 0x1d, 0xdb, 0x82, 0xbf, 0x32, 0xab, 0x8e, 0xbf, + 0x56, 0x68, 0x41, 0xbf, 0xa0, 0xc0, 0x6e, 0xbf, 0xbc, 0xa5, 0x61, 0xbf, 0x20, 0x59, 0x4e, 0xbf, + 0x8c, 0x27, 0x88, 0xbf, 0x2a, 0x93, 0x89, 0xbf, 0x5d, 0x75, 0x6f, 0xbf, 0x43, 0xed, 0x3f, 0xbf, + 0x02, 0x96, 0x61, 0xbf, 0x54, 0xa2, 0x78, 0xbf, 0x54, 0x4f, 0x6d, 0xbf, 0xd9, 0x9c, 0x3a, 0xbf, + 0x29, 0xb9, 0x8a, 0xbf, 0xd1, 0xff, 0x4e, 0xbf, 0xb0, 0x4f, 0x05, 0xbf, 0x28, 0x4a, 0x6c, 0xbf, + 0x6c, 0x01, 0x86, 0xbe, 0xdf, 0xea, 0xaa, 0xbe, 0xe2, 0x2a, 0x3f, 0xbe, 0x3a, 0x0e, 0x33, 0xbe, + 0x28, 0xa3, 0x46, 0xbd, 0x95, 0x74, 0xb2, 0xbe, 
0x68, 0x20, 0x23, 0xbf, 0x6b, 0x54, 0x04, 0xbf, + 0x02, 0x5d, 0xaf, 0xbe, 0x10, 0xf7, 0x96, 0xbe, 0x94, 0x6b, 0x03, 0xbf, 0x82, 0x9e, 0xee, 0xbd, + 0x30, 0x43, 0xed, 0xbe, 0xa1, 0xe5, 0x8f, 0xbe, 0x28, 0x1e, 0x83, 0xbd, 0x3d, 0xff, 0xef, 0xbe, + 0x5e, 0x36, 0xce, 0xbe, 0x8c, 0x56, 0x6c, 0xbe, 0xc9, 0x8b, 0xb7, 0xbe, 0x9b, 0x0e, 0x34, 0xbe, + 0xa8, 0xed, 0xb6, 0xbe, 0x3b, 0x64, 0xc4, 0xbe, 0x56, 0xf0, 0x4f, 0xbe, 0x28, 0x4b, 0x33, 0xbe, + 0x10, 0x9a, 0x8f, 0xbd, 0xe8, 0xea, 0x07, 0xbf, 0x40, 0x9f, 0x78, 0xbe, 0x59, 0x7e, 0xd9, 0xbe, + 0xa9, 0x8c, 0xaf, 0xbe, 0xe0, 0xa2, 0x5b, 0x3d, 0x82, 0xcb, 0xf3, 0xbe, 0x5c, 0x3e, 0xed, 0xbe, + 0x3a, 0x1c, 0x58, 0xbe, 0x3a, 0xa8, 0x1d, 0xbf, 0x72, 0x71, 0x4c, 0xbe, 0x93, 0x14, 0xa3, 0xbe, + 0xdc, 0xdc, 0x09, 0xbe, 0xce, 0x3f, 0x97, 0xbe, 0x0b, 0x17, 0xcc, 0xbe, 0x30, 0x5e, 0xb7, 0xbe, + 0xa8, 0x34, 0xca, 0xbc, 0xc1, 0x0d, 0xfe, 0xbe, 0xae, 0x17, 0xb1, 0xbe, 0x7e, 0x53, 0xec, 0xbe, + 0x80, 0x4d, 0x85, 0xbe, 0xa5, 0x56, 0x3a, 0xbe, 0x67, 0xa9, 0xdb, 0xbe, 0xab, 0x5f, 0xc2, 0xbe, + 0x40, 0x98, 0x84, 0xbe, 0x5a, 0x35, 0x0c, 0xbf, 0xbe, 0x79, 0x1e, 0xbe, 0xd6, 0x08, 0xa6, 0x3d, + 0x92, 0xd9, 0x0a, 0xbf, 0x59, 0x0d, 0x7f, 0xbe, 0x3c, 0xe5, 0x83, 0xbe, 0x9b, 0xac, 0x62, 0xbe, + 0x74, 0x67, 0x42, 0xbe, 0x4e, 0xec, 0xd1, 0xbe, 0x18, 0xe9, 0x96, 0xbd, 0x08, 0x73, 0x67, 0xbe, + 0xd9, 0x63, 0xb2, 0xbe, 0x8e, 0x46, 0x8c, 0xbe, 0xbc, 0xec, 0x05, 0xbe, 0xa3, 0xea, 0x8b, 0xbe, + 0xea, 0x1f, 0xb0, 0x3d, 0x18, 0x59, 0xc4, 0x3d, 0xb4, 0x89, 0xe8, 0x3d, 0x46, 0xec, 0xf1, 0x3d, + 0x23, 0x1a, 0x9c, 0x3d, 0x5a, 0x02, 0xd4, 0x3d, 0x3f, 0x88, 0xf8, 0x3d, 0x18, 0x26, 0xcd, 0x3d, + 0x96, 0xf7, 0xc2, 0x3d, 0x44, 0x94, 0x98, 0x3d, 0x29, 0xfd, 0xbf, 0x3d, 0xf0, 0xb7, 0xcd, 0x3d, + 0x18, 0x67, 0xf1, 0x3d, 0x58, 0x16, 0xf9, 0x3d, 0x44, 0xa4, 0xe8, 0x3d, 0xfb, 0xeb, 0x12, 0x3e, + 0x1d, 0xd7, 0x10, 0x3e, 0x18, 0xe9, 0xe4, 0x3d, 0x7d, 0x24, 0xfa, 0x3d, 0x10, 0xe3, 0xa5, 0x3d, + 0x1d, 0x96, 0xdb, 0x3d, 0x7c, 0x17, 0xce, 0x3d, 0xbe, 0xae, 0xf5, 0x3d, 0x5e, 0x7c, 
0xda, 0x3d, + 0x27, 0xa3, 0xa1, 0x3d, 0x8a, 0xe4, 0xfb, 0x3d, 0x45, 0xa6, 0xe7, 0x3d, 0xa4, 0xce, 0xe4, 0x3d, + 0xba, 0x4d, 0xd4, 0x3d, 0x5a, 0xae, 0x9a, 0x3d, 0x99, 0x25, 0xe2, 0x3d, 0x33, 0x7d, 0xcf, 0x3d, + 0x68, 0x03, 0xdd, 0x3d, 0xef, 0xdd, 0xea, 0x3d, 0x86, 0xd2, 0x77, 0x3d, 0x8a, 0x65, 0xaa, 0x3d, + 0xd2, 0x77, 0xca, 0x3d, 0xab, 0x26, 0xfe, 0x3d, 0x92, 0x99, 0xaf, 0x3d, 0xfc, 0xfc, 0xde, 0x3d, + 0x38, 0xba, 0xcc, 0x3d, 0xbe, 0x10, 0x05, 0x3e, 0x64, 0xac, 0xc2, 0x3d, 0xaf, 0xc1, 0xe8, 0x3d, + 0xd2, 0x4b, 0xd0, 0x3d, 0x3a, 0x0c, 0xd9, 0x3d, 0xbe, 0xc4, 0xdc, 0x3d, 0xdc, 0x1d, 0x0f, 0x3e, + 0xd4, 0x37, 0xb9, 0x3d, 0x7d, 0x59, 0xba, 0x3d, 0x8a, 0x83, 0xe0, 0x3d, 0xda, 0x73, 0xb7, 0x3d, + 0x28, 0x5c, 0x09, 0x3e, 0xf8, 0x85, 0xdd, 0x3d, 0x9f, 0x7b, 0xc7, 0x3d, 0xa4, 0x2f, 0xa9, 0x3d, + 0x8b, 0x2d, 0xcd, 0x3d, 0x66, 0x07, 0xe9, 0x3d, 0xab, 0xd6, 0xcc, 0x3d, 0xae, 0x66, 0xb2, 0x3d, + 0x10, 0x4a, 0xf6, 0x3d, 0x90, 0x5f, 0xaa, 0x3d, 0x57, 0x3d, 0x9a, 0x3d, 0x4e, 0x65, 0xf8, 0x3d, + 0x40, 0x55, 0x08, 0x3d, 0xf9, 0x98, 0x43, 0x3d, 0x4e, 0xe9, 0x0c, 0x3d, 0xa2, 0xd8, 0x3b, 0x3d, + 0xf4, 0xb1, 0x08, 0x3c, 0x97, 0xd5, 0x6a, 0x3d, 0x1c, 0x94, 0xc3, 0x3d, 0xdb, 0x04, 0x8f, 0x3d, + 0xc6, 0xd8, 0x57, 0x3d, 0x25, 0x2e, 0x06, 0x3d, 0xa5, 0x2f, 0x85, 0x3d, 0xac, 0xf7, 0xe5, 0x3c, + 0x4e, 0x8b, 0x68, 0x3d, 0xde, 0x46, 0x59, 0x3d, 0x00, 0xc5, 0xf6, 0x3c, 0xfb, 0x0d, 0x9b, 0x3d, + 0x78, 0xf2, 0x90, 0x3d, 0x42, 0x90, 0x0e, 0x3d, 0x66, 0x2b, 0x7c, 0x3d, 0xde, 0x0d, 0xb5, 0x3c, + 0xe0, 0x7f, 0x31, 0x3d, 0xab, 0x61, 0x46, 0x3d, 0x49, 0x4d, 0x19, 0x3d, 0x38, 0x31, 0x1e, 0x3d, + 0x88, 0x7b, 0x6d, 0x3c, 0x57, 0x37, 0x9f, 0x3d, 0x52, 0x93, 0x46, 0x3d, 0x6e, 0xc1, 0x5d, 0x3d, + 0x84, 0xe4, 0x31, 0x3d, 0x58, 0x5c, 0x47, 0x3b, 0x50, 0x15, 0xa0, 0x3d, 0xfe, 0x93, 0x6e, 0x3d, + 0x23, 0x0d, 0x14, 0x3d, 0x36, 0x71, 0xa6, 0x3d, 0x0e, 0x91, 0xab, 0x3c, 0x28, 0x4c, 0x2e, 0x3d, + 0x8e, 0xb5, 0xf9, 0x3c, 0x30, 0x46, 0x4f, 0x3d, 0x0e, 0xf1, 0x36, 0x3d, 0x9f, 0x58, 0x46, 0x3d, + 0x4e, 0xaa, 0xa0, 0x3c, 
0x5c, 0x3c, 0x99, 0x3d, 0xcb, 0x50, 0x2b, 0x3d, 0x33, 0xc3, 0x94, 0x3d, + 0x56, 0x33, 0x52, 0x3d, 0x7d, 0xc5, 0x12, 0x3d, 0x0a, 0x87, 0x46, 0x3d, 0x79, 0xb6, 0x87, 0x3d, + 0x53, 0x0f, 0x27, 0x3d, 0x57, 0x07, 0x57, 0x3d, 0xcc, 0x61, 0x17, 0x3d, 0xc0, 0xa1, 0x62, 0x3a, + 0xe5, 0xf3, 0xa7, 0x3d, 0xe5, 0xa7, 0xd1, 0x3c, 0x7c, 0xb2, 0xf2, 0x3c, 0x04, 0xeb, 0xf3, 0x3c, + 0x1b, 0x5c, 0xfe, 0x3c, 0x84, 0x2e, 0x6c, 0x3d, 0x48, 0xa7, 0x70, 0x3c, 0x38, 0xcd, 0x16, 0x3d, + 0x48, 0x60, 0x3f, 0x3d, 0x57, 0x89, 0xf3, 0x3c, 0x46, 0xc0, 0x14, 0x3d, 0x43, 0xa9, 0x66, 0x3d, + 0x69, 0xc9, 0x48, 0x3d, 0x25, 0x4e, 0x44, 0x3d, 0x1b, 0xbb, 0x63, 0x3d, 0xbf, 0x9a, 0x34, 0x3d, + 0x04, 0x83, 0x35, 0x3d, 0x3c, 0x47, 0x33, 0x3d, 0xe1, 0xdd, 0x44, 0x3d, 0x87, 0x4d, 0x47, 0x3d, + 0x9e, 0x3e, 0x30, 0x3d, 0x7c, 0xbc, 0x3e, 0x3d, 0x66, 0x28, 0x4b, 0x3d, 0x3b, 0x72, 0x3b, 0x3d, + 0xac, 0xb3, 0x8a, 0x3d, 0xd7, 0xd1, 0x52, 0x3d, 0x0f, 0xc6, 0x38, 0x3d, 0xde, 0x36, 0x81, 0x3d, + 0x98, 0x2e, 0x72, 0x3d, 0x5a, 0xec, 0x73, 0x3d, 0x94, 0x88, 0x54, 0x3d, 0x54, 0x8b, 0x43, 0x3d, + 0xb8, 0xdc, 0x80, 0x3d, 0xb5, 0xd3, 0x66, 0x3d, 0xa6, 0x9b, 0x6e, 0x3d, 0xaa, 0xc9, 0x38, 0x3d, + 0xb6, 0x27, 0x2b, 0x3d, 0xfb, 0x76, 0x64, 0x3d, 0xb9, 0x79, 0x3e, 0x3d, 0x0a, 0x07, 0x7e, 0x3d, + 0xbc, 0xcc, 0x70, 0x3d, 0x17, 0xe1, 0x17, 0x3d, 0x8c, 0x4a, 0x26, 0x3d, 0x60, 0x34, 0x64, 0x3d, + 0x98, 0xc3, 0x59, 0x3d, 0x25, 0x67, 0x6a, 0x3d, 0xb4, 0x9a, 0x20, 0x3d, 0xd3, 0xfd, 0x33, 0x3d, + 0x14, 0x8e, 0x35, 0x3d, 0x2e, 0xc1, 0x6b, 0x3d, 0xd2, 0xc7, 0x59, 0x3d, 0xb7, 0xf6, 0x6e, 0x3d, + 0x45, 0x2c, 0x32, 0x3d, 0xce, 0xef, 0x71, 0x3d, 0x47, 0xb4, 0x63, 0x3d, 0x67, 0x8e, 0x40, 0x3d, + 0x86, 0x32, 0x1d, 0x3d, 0x6e, 0x5c, 0x47, 0x3d, 0xb4, 0x23, 0x88, 0x3d, 0xc0, 0xcf, 0x76, 0x3d, + 0xcc, 0x2c, 0x30, 0x3d, 0x27, 0xb9, 0x87, 0x3d, 0x46, 0x93, 0x3d, 0x3d, 0xc2, 0xd2, 0x37, 0x3d, + 0xc6, 0x34, 0x73, 0x3d, 0xd9, 0x3f, 0x8f, 0x3d, 0x5c, 0xbb, 0x74, 0x3d, 0xaa, 0x9d, 0x3b, 0x3d, + 0xd9, 0x98, 0x52, 0x3d, 0x3a, 0x81, 0x6b, 0x3d, 0x8f, 0x17, 
0x62, 0x3d, 0xff, 0x12, 0x29, 0x3d, + 0x0d, 0x7a, 0x87, 0x3d, 0xc1, 0x40, 0x58, 0x3d, 0x79, 0xd5, 0xb8, 0x3c, 0xd8, 0xea, 0x3e, 0x3d, + 0x3e, 0xa8, 0xb9, 0x3c, 0x14, 0x83, 0xc3, 0x3c, 0x1c, 0x4c, 0x83, 0x3c, 0x27, 0x6b, 0x02, 0x3c, + 0x7f, 0xfc, 0x29, 0x3c, 0x5a, 0x5f, 0xa9, 0x3c, 0xbe, 0xe9, 0x0f, 0x3d, 0x49, 0x2c, 0x09, 0x3d, + 0xd6, 0x66, 0xb2, 0x3c, 0x96, 0x7e, 0xd2, 0x3c, 0xe2, 0x5a, 0x10, 0x3d, 0xda, 0xe0, 0x1c, 0x3c, + 0xe5, 0x45, 0x18, 0x3d, 0xda, 0xbd, 0x8c, 0x3c, 0xb0, 0x30, 0x5d, 0x3b, 0x82, 0x47, 0xef, 0x3c, + 0xad, 0xe5, 0xc2, 0x3c, 0xc9, 0x96, 0xac, 0x3c, 0x94, 0xf3, 0xb0, 0x3c, 0x77, 0xd7, 0x95, 0x3c, + 0x88, 0xc6, 0xfd, 0x3c, 0x1f, 0xda, 0xf7, 0x3c, 0x1a, 0x27, 0x8b, 0x3c, 0xa2, 0x97, 0x35, 0x3c, + 0xfe, 0xcf, 0x1c, 0x3c, 0xc7, 0xc9, 0x07, 0x3d, 0x75, 0x74, 0x68, 0x3c, 0x1e, 0x19, 0x08, 0x3d, + 0x88, 0xe2, 0xea, 0x3c, 0xc0, 0x67, 0x5b, 0x3a, 0x87, 0x74, 0xc8, 0x3c, 0x2c, 0x01, 0x0c, 0x3d, + 0x82, 0x8d, 0x8d, 0x3c, 0x6d, 0xfa, 0x25, 0x3d, 0x6a, 0x2b, 0x9f, 0x3c, 0xb9, 0x7c, 0xc1, 0x3c, + 0x96, 0x0e, 0x26, 0x3c, 0x36, 0x7b, 0xaa, 0x3c, 0xc8, 0xa6, 0x05, 0x3d, 0x16, 0x4c, 0xe6, 0x3c, + 0x00, 0xca, 0x59, 0x3b, 0xad, 0x0a, 0x01, 0x3d, 0x8f, 0x60, 0xed, 0x3c, 0xd6, 0x1f, 0xd9, 0x3c, + 0x7d, 0x01, 0x58, 0x3c, 0xcc, 0xcb, 0x5e, 0x3c, 0x30, 0xc6, 0x16, 0x3d, 0x03, 0x95, 0xc0, 0x3c, + 0x42, 0xf9, 0x94, 0x3c, 0x7c, 0x9c, 0x40, 0x3d, 0x89, 0x02, 0x23, 0x3c, 0xf8, 0x0a, 0x09, 0x3a, + 0x5a, 0x70, 0x08, 0x3d, 0x65, 0xc7, 0xea, 0x3c, 0xb9, 0xd8, 0xd3, 0x3c, 0x8c, 0xd1, 0x9e, 0x3c, + 0xaa, 0x04, 0x8a, 0x3c, 0x2c, 0x22, 0xf1, 0x3c, 0x34, 0x57, 0x4d, 0x3c, 0xda, 0x25, 0x84, 0x3c, + 0x5c, 0xb4, 0xf0, 0x3c, 0x0e, 0x87, 0xd5, 0x3c, 0x8a, 0xd8, 0x48, 0x3b, 0xb0, 0x68, 0x67, 0x3c, + 0x5e, 0x2a, 0x80, 0x3f, 0x3e, 0x79, 0xa5, 0x3f, 0x8c, 0x3c, 0xac, 0x3f, 0x67, 0xe8, 0xd3, 0x3f, + 0xc4, 0xc7, 0x2f, 0x3f, 0x90, 0x93, 0xc5, 0x3f, 0xeb, 0x97, 0x06, 0x40, 0x5f, 0x09, 0xc6, 0x3f, + 0x99, 0x65, 0xb1, 0x3f, 0x96, 0x82, 0x5a, 0x3f, 0xfd, 0xb8, 0xb2, 0x3f, 0x6f, 0xe8, 0x9a, 0x3f, + 
0x14, 0xb1, 0xbb, 0x3f, 0x30, 0x92, 0xd7, 0x3f, 0x6e, 0x83, 0xb7, 0x3f, 0x96, 0xe4, 0x04, 0x40, + 0x20, 0x25, 0x03, 0x40, 0x76, 0x3a, 0xa3, 0x3f, 0x32, 0xbb, 0xe1, 0x3f, 0x25, 0x4d, 0x57, 0x3f, + 0x28, 0x23, 0x9f, 0x3f, 0x18, 0x72, 0xa2, 0x3f, 0x2f, 0xf7, 0xb7, 0x3f, 0xd0, 0x06, 0xb4, 0x3f, + 0x80, 0x19, 0x50, 0x3f, 0xc4, 0x08, 0xf0, 0x3f, 0xab, 0x97, 0xc9, 0x3f, 0x7c, 0x9d, 0xb5, 0x3f, + 0x5d, 0xc4, 0x9e, 0x3f, 0x58, 0x20, 0x37, 0x3f, 0x47, 0xb3, 0xef, 0x3f, 0x5b, 0x0b, 0xb0, 0x3f, + 0x8a, 0x2d, 0xa7, 0x3f, 0x10, 0xe9, 0xe1, 0x3f, 0x22, 0x22, 0x20, 0x3f, 0x6e, 0x45, 0x8d, 0x3f, + 0x95, 0xd8, 0x9c, 0x3f, 0x8c, 0x27, 0xd0, 0x3f, 0xd5, 0x68, 0x86, 0x3f, 0x94, 0x39, 0xaf, 0x3f, + 0xee, 0xb5, 0x93, 0x3f, 0x6c, 0xf7, 0xf4, 0x3f, 0xbd, 0x4c, 0x91, 0x3f, 0xfa, 0xad, 0xe5, 0x3f, + 0x93, 0x72, 0xc3, 0x3f, 0x57, 0x00, 0xaa, 0x3f, 0xa5, 0xb3, 0xa0, 0x3f, 0xb8, 0x20, 0xfc, 0x3f, + 0xda, 0x8f, 0x9a, 0x3f, 0x18, 0x63, 0x85, 0x3f, 0x98, 0xf7, 0xb5, 0x3f, 0x12, 0x19, 0x50, 0x3f, + 0x19, 0x69, 0x02, 0x40, 0x52, 0xb9, 0x81, 0x3f, 0xbc, 0x5e, 0x81, 0x3f, 0x2e, 0x1d, 0x75, 0x3f, + 0x54, 0x26, 0x95, 0x3f, 0x8e, 0x93, 0xc4, 0x3f, 0x5c, 0x58, 0x78, 0x3f, 0xdb, 0x42, 0x92, 0x3f, + 0xfa, 0x1c, 0xb7, 0x3f, 0x3e, 0xc1, 0x61, 0x3f, 0x8e, 0xc7, 0x97, 0x3f, 0x82, 0x26, 0xe2, 0x3f, + 0x85, 0x95, 0x87, 0x3f, 0x00, 0x26, 0xca, 0x3e, 0x00, 0x41, 0x6a, 0xbe, 0x81, 0xf9, 0x17, 0x3f, + 0xe8, 0x37, 0xe1, 0x3d, 0xa2, 0x3f, 0x44, 0x3f, 0xbf, 0xc7, 0x87, 0x3e, 0x24, 0x5e, 0xa3, 0x3e, + 0xe6, 0x4d, 0x58, 0x3f, 0xeb, 0x87, 0x97, 0x3e, 0x6c, 0x7e, 0xf4, 0x3e, 0x42, 0x57, 0x94, 0x3e, + 0x41, 0x81, 0x87, 0x3f, 0x28, 0x70, 0x1a, 0x3e, 0x7a, 0x98, 0x9a, 0xbe, 0x22, 0x74, 0x13, 0x3f, + 0x40, 0x75, 0xfe, 0xbd, 0x1b, 0xe6, 0x29, 0x3f, 0x45, 0xa9, 0xdc, 0x3e, 0x08, 0xd0, 0x78, 0x3f, + 0x2d, 0x7d, 0x2c, 0x3f, 0x42, 0x35, 0x05, 0x3e, 0xe0, 0xaa, 0x21, 0x3e, 0x02, 0x4a, 0x11, 0x3f, + 0x50, 0xbf, 0x10, 0x3e, 0x3e, 0x2d, 0x46, 0x3f, 0x9a, 0xfe, 0xf9, 0x3e, 0x21, 0x02, 0x19, 0x3f, + 0x9f, 0xd5, 0x55, 0x3f, 0xbc, 0x82, 
0x97, 0x3d, 0x9b, 0xf3, 0x07, 0x3f, 0x8e, 0x8f, 0x3c, 0x3f, + 0xf6, 0xee, 0xfe, 0x3e, 0xab, 0x17, 0xa4, 0x3e, 0x50, 0xf4, 0xd8, 0x3e, 0x4d, 0x9b, 0x50, 0x3f, + 0x58, 0x7d, 0x96, 0x3e, 0x36, 0x10, 0xff, 0x3e, 0xa4, 0xc1, 0xcd, 0x3d, 0x04, 0x86, 0x4c, 0x3e, + 0x91, 0x49, 0xe0, 0x3e, 0x18, 0x7d, 0xa3, 0x3e, 0x34, 0xa0, 0x1c, 0x3f, 0xa0, 0x3b, 0x87, 0xbd, + 0x10, 0x24, 0xcf, 0x3d, 0x22, 0xba, 0x9a, 0x3e, 0xf8, 0xff, 0xd5, 0xbd, 0x94, 0xd0, 0xe7, 0x3d, + 0xdc, 0x35, 0x3c, 0xbe, 0xc0, 0x0b, 0x13, 0x3e, 0x26, 0xc0, 0xbe, 0x3e, 0x1d, 0x5b, 0xe6, 0x3e, + 0x84, 0xea, 0xb0, 0x3d, 0x68, 0xe6, 0x82, 0x3f, 0x05, 0x8d, 0x56, 0x3f, 0x34, 0x64, 0xc7, 0x3d, + 0xc0, 0x12, 0x40, 0x3e, 0xe4, 0x95, 0x49, 0x3f, 0xbd, 0x32, 0x0a, 0x3f, 0x84, 0x1e, 0x9f, 0x3e, + 0x4c, 0x7d, 0xef, 0x3d, 0x29, 0x23, 0xa9, 0x3e, 0xc2, 0x1d, 0x42, 0x3e, 0xea, 0x51, 0x18, 0x3f, + 0x80, 0x9a, 0x6c, 0xbe, 0xb6, 0x82, 0x50, 0xbe, 0x58, 0xa5, 0xac, 0x3d, 0x2d, 0xdd, 0xca, 0xbe, + 0xb1, 0x7e, 0x5a, 0xbd, 0x05, 0xf1, 0x96, 0xbe, 0xe2, 0x1b, 0xb1, 0xbe, 0xb0, 0xa0, 0xb3, 0xbe, + 0xf6, 0x81, 0x0c, 0xbf, 0x59, 0x1c, 0x5a, 0xbe, 0xf7, 0x69, 0x20, 0xbe, 0x57, 0x26, 0xb7, 0xbe, + 0x56, 0xf5, 0x8e, 0xbe, 0x65, 0xef, 0xcd, 0xbe, 0xf6, 0xfb, 0x3d, 0xbe, 0x82, 0x2b, 0x00, 0xbf, + 0x3a, 0xa6, 0x3f, 0xbe, 0xb6, 0x30, 0xd0, 0xbe, 0xb9, 0x13, 0xf6, 0xbe, 0x42, 0xb0, 0x9a, 0xbe, + 0x44, 0x86, 0x15, 0xbf, 0xe8, 0xb5, 0x93, 0xbe, 0x79, 0x3f, 0x42, 0xbe, 0xc6, 0x04, 0x98, 0xbe, + 0xc6, 0x9c, 0xcf, 0x3d, 0x30, 0xe1, 0x21, 0xbf, 0x00, 0x07, 0x85, 0xbe, 0x08, 0xcf, 0xb4, 0xbe, + 0xda, 0xf4, 0x56, 0xbe, 0x86, 0x0c, 0x30, 0xbe, 0x4c, 0xe5, 0xc4, 0xbe, 0x02, 0x3d, 0xa9, 0xbe, + 0x2c, 0x67, 0x49, 0xbe, 0x5f, 0xdc, 0x38, 0xbe, 0x50, 0x28, 0x30, 0xbe, 0x11, 0x6d, 0xcf, 0xbe, + 0x4c, 0x05, 0x27, 0xbe, 0x85, 0x2c, 0x2f, 0xbe, 0xa1, 0x0b, 0x84, 0xbe, 0x7f, 0x45, 0x9d, 0xbd, + 0x5d, 0x19, 0xe5, 0xbe, 0xe3, 0x78, 0xe0, 0xbd, 0xac, 0xd6, 0xd0, 0xbe, 0x96, 0x15, 0x45, 0xbe, + 0xfa, 0xb2, 0x72, 0xbe, 0x0a, 0x50, 0x7d, 0xbe, 0xdf, 0xdc, 0xe1, 0xbd, 
0xc2, 0xb1, 0xa7, 0xbe, + 0x42, 0x56, 0x5d, 0xbe, 0xf5, 0xa0, 0x08, 0xbe, 0x22, 0xfc, 0xa6, 0xbe, 0x5b, 0x73, 0x80, 0xbe, + 0xf6, 0x15, 0xe1, 0xbe, 0x6f, 0xe4, 0xaf, 0xbe, 0x42, 0x82, 0xec, 0xbe, 0xb6, 0xde, 0x1f, 0xbe, + 0x1b, 0x9b, 0x26, 0xbe, 0x79, 0xea, 0xa4, 0xbe, 0x5e, 0xab, 0xfa, 0xbe, 0xcb, 0x79, 0x0a, 0xbf, + 0xcd, 0x80, 0x91, 0xbe, 0xd4, 0x1a, 0xbc, 0xbe, 0xa2, 0x70, 0xb7, 0x3c, 0x32, 0x75, 0x79, 0xbe, + 0x4e, 0x0d, 0x2e, 0xbd, 0x18, 0x18, 0xbd, 0xbd, 0x44, 0xe5, 0x45, 0x3e, 0x1d, 0x70, 0x1a, 0x3e, + 0xa9, 0x0b, 0x91, 0x3b, 0x8f, 0x50, 0xbb, 0xbe, 0x06, 0x7b, 0x34, 0xbe, 0x73, 0xed, 0x34, 0xbe, + 0xc2, 0x64, 0x8b, 0xbe, 0x0c, 0xf8, 0x8a, 0xbe, 0x92, 0x2a, 0xf4, 0xbd, 0xa7, 0x37, 0x98, 0xbd, + 0xff, 0x7c, 0xa9, 0xbe, 0xb4, 0x55, 0xbc, 0xbd, 0xb7, 0x39, 0x10, 0x3e, 0x56, 0x99, 0x07, 0x3e, + 0x7c, 0xe9, 0x1d, 0x3d, 0xa6, 0x54, 0x0f, 0xbd, 0xea, 0x01, 0x08, 0xbe, 0xfd, 0x82, 0x4d, 0xbe, + 0x44, 0x10, 0xc3, 0xbd, 0x68, 0xa7, 0x66, 0xbe, 0x56, 0x49, 0xf8, 0x3c, 0x3d, 0xaa, 0x40, 0xbd, + 0xc6, 0xf3, 0x90, 0x3d, 0x94, 0x19, 0x5f, 0xbe, 0xbc, 0x05, 0x93, 0xbe, 0xb6, 0x9f, 0xfd, 0xbd, + 0xfb, 0x3d, 0x85, 0xbe, 0x80, 0x6d, 0x79, 0x3a, 0x5b, 0xd1, 0x86, 0xbe, 0xf6, 0x1a, 0x65, 0xbe, + 0xd0, 0x36, 0x41, 0xbe, 0x4e, 0xc2, 0x43, 0xbe, 0x3a, 0x9d, 0xaa, 0xbc, 0x6a, 0xdc, 0x0f, 0xbe, + 0x68, 0x25, 0x23, 0x3e, 0x7c, 0x1e, 0x74, 0xbe, 0x30, 0x5f, 0x0c, 0x3e, 0x16, 0x24, 0x3d, 0xbe, + 0x88, 0x0a, 0x7a, 0x3c, 0x19, 0x83, 0xd2, 0x3d, 0x59, 0xb4, 0x89, 0xbd, 0xb9, 0xad, 0xdf, 0x3d, + 0x3e, 0x42, 0xa7, 0xbd, 0xe4, 0x71, 0x9b, 0xbc, 0xbf, 0xd4, 0x97, 0x3d, 0xde, 0x44, 0x77, 0xbe, + 0x94, 0x1d, 0x12, 0x3d, 0x6a, 0xfa, 0x8f, 0xbe, 0xc0, 0xf1, 0x48, 0x3c, 0x38, 0x55, 0x12, 0x3d, + 0x07, 0x26, 0x62, 0x3e, 0xfe, 0xf2, 0x06, 0xbe, 0x70, 0xc6, 0xca, 0xbe, 0x3a, 0xbb, 0x3b, 0xbd, + 0x60, 0xd1, 0x3d, 0xbb, 0x18, 0xda, 0xca, 0xbe, 0x33, 0x00, 0xf1, 0xbd, 0x8c, 0x4b, 0x76, 0x3d, + 0x28, 0x3a, 0x27, 0x3d, 0xae, 0x8d, 0x31, 0xbe, 0x20, 0xf2, 0x3e, 0xbe, 0x8e, 0xe3, 0x96, 0xbd, + 0xa0, 0x55, 
0xc1, 0x3a, 0x13, 0x38, 0xb4, 0x3c, 0x5c, 0x8e, 0x70, 0xbc, 0xfd, 0x7e, 0x03, 0x3d, + 0x3b, 0xd3, 0x86, 0x3b, 0xc7, 0x63, 0x17, 0x3d, 0x32, 0x6b, 0x54, 0x3d, 0x8a, 0x2c, 0x50, 0x3d, + 0x8a, 0xce, 0x8a, 0x3d, 0xb7, 0x0b, 0x12, 0x3d, 0xd4, 0xdd, 0x5d, 0x3c, 0xa2, 0x71, 0x43, 0x3d, + 0x10, 0x9d, 0xbd, 0x3c, 0x28, 0x5d, 0x75, 0x3d, 0xec, 0xbd, 0xfc, 0x3c, 0xa0, 0xd8, 0x49, 0x3d, + 0xa0, 0x1d, 0xf8, 0x3c, 0xd4, 0x61, 0x26, 0x3d, 0x52, 0xd3, 0x83, 0x3d, 0x96, 0x0e, 0xbc, 0x3c, + 0x7c, 0x2f, 0x8f, 0x3d, 0x55, 0x9f, 0x4d, 0x3d, 0x68, 0x92, 0xb1, 0x3c, 0xe0, 0x22, 0xe5, 0x3c, + 0x3c, 0x28, 0xc0, 0xbc, 0xec, 0xb7, 0xa3, 0x3d, 0xcc, 0xc6, 0x17, 0x3d, 0x79, 0xba, 0x20, 0x3d, + 0x8c, 0x73, 0x88, 0x3c, 0xe6, 0xaa, 0xbe, 0x3c, 0xc6, 0xb1, 0x59, 0x3d, 0x60, 0x57, 0x14, 0x3d, + 0x38, 0x01, 0xb7, 0x3c, 0xe2, 0x9e, 0xd4, 0x3c, 0xe0, 0x6b, 0x50, 0x3c, 0x34, 0x6d, 0x26, 0x3d, + 0xf4, 0x9e, 0xdb, 0x3b, 0x93, 0xd2, 0xab, 0x3c, 0x7a, 0x53, 0xe9, 0x3c, 0x97, 0xf5, 0x66, 0x3c, + 0xc2, 0xfb, 0x53, 0x3d, 0x10, 0x62, 0x2c, 0x3a, 0xcc, 0xd7, 0x34, 0x3d, 0xe8, 0xcb, 0xd1, 0x3c, + 0x35, 0x89, 0x14, 0x3d, 0xf0, 0x39, 0xe5, 0x3c, 0x4e, 0xaf, 0x85, 0x3c, 0x94, 0x3b, 0x6b, 0x3d, + 0x53, 0x97, 0x16, 0x3d, 0xbe, 0xec, 0xef, 0x3c, 0xd4, 0x00, 0x13, 0x3d, 0xac, 0xca, 0xaa, 0x3c, + 0x2e, 0x14, 0x55, 0x3d, 0x2d, 0xde, 0xc4, 0x3c, 0x58, 0x7e, 0x7a, 0x3d, 0x16, 0x08, 0xb8, 0x3c, + 0xd6, 0xe4, 0x95, 0x3c, 0x10, 0x87, 0x2b, 0x3d, 0x9b, 0x32, 0x7b, 0x3d, 0x32, 0xcc, 0x8a, 0x3d, + 0xc4, 0x46, 0x16, 0x3d, 0x17, 0x22, 0x58, 0x3d, 0x80, 0xba, 0xb9, 0xb9, 0x6f, 0x0d, 0xa7, 0x3c, + 0x95, 0x63, 0x8e, 0xbc, 0xae, 0x18, 0x32, 0x3c, 0x42, 0x89, 0xd1, 0xbc, 0x01, 0x24, 0xb9, 0xbc, + 0xc1, 0xe6, 0xd9, 0xba, 0x02, 0x7d, 0x34, 0x3d, 0x80, 0xec, 0x0e, 0x3d, 0xc6, 0xd7, 0x08, 0x3d, + 0xf0, 0x50, 0x24, 0x3d, 0x04, 0xfa, 0x29, 0x3d, 0x8b, 0x89, 0x20, 0x3c, 0x31, 0xbc, 0x9e, 0x3c, + 0xb8, 0x0f, 0xe8, 0x3c, 0xc3, 0x90, 0xec, 0x3c, 0x80, 0x03, 0xea, 0xba, 0x84, 0xdc, 0x65, 0xbc, + 0xfc, 0xda, 0xfc, 0x3b, 0x98, 0x9b, 0x62, 0x3b, 
0x78, 0x62, 0xf2, 0x3c, 0xea, 0xed, 0x51, 0x3c, + 0xff, 0x49, 0xac, 0x3c, 0x2c, 0xb7, 0x33, 0x3d, 0x60, 0xed, 0xa9, 0xb9, 0xde, 0x83, 0x43, 0x3b, + 0xd6, 0x17, 0xa7, 0xbc, 0x5f, 0xab, 0x1d, 0x3d, 0xfd, 0xf8, 0x22, 0x3d, 0xc7, 0x9c, 0x85, 0x3c, + 0x70, 0xac, 0xb1, 0x3c, 0x22, 0x32, 0xc4, 0x3b, 0x39, 0x08, 0x28, 0x3d, 0x80, 0x2f, 0xd1, 0x3c, + 0x22, 0x74, 0xb0, 0x3c, 0xd4, 0x56, 0xdd, 0x3c, 0x21, 0x3b, 0x1d, 0xbb, 0xdb, 0x1e, 0x68, 0x3c, + 0x6d, 0x3a, 0xd1, 0xbc, 0x8c, 0xfa, 0xe2, 0x3c, 0xf7, 0x7c, 0x34, 0xbc, 0x44, 0xe2, 0xcb, 0x3c, + 0x00, 0xa0, 0xb3, 0x3b, 0x54, 0x9b, 0xa8, 0xbc, 0x85, 0x40, 0x25, 0x3c, 0x9a, 0x5e, 0x95, 0xbb, + 0xee, 0xd0, 0xa9, 0x3c, 0x92, 0x7b, 0xa8, 0x3b, 0x5c, 0x24, 0x0b, 0xbb, 0x1e, 0xfc, 0x47, 0x3d, + 0x74, 0xc7, 0x3d, 0x3c, 0x52, 0x7e, 0x34, 0x3d, 0x7c, 0x44, 0x06, 0x3b, 0xc6, 0xfb, 0xff, 0xbb, + 0xfe, 0xcb, 0x65, 0xbc, 0x2d, 0x32, 0x3b, 0x3b, 0xb0, 0x81, 0x5f, 0x3d, 0x1f, 0x5d, 0x3b, 0x3c, + 0xee, 0xda, 0x17, 0x3b, 0x5c, 0xe0, 0x49, 0x3d, 0x80, 0xb9, 0xc5, 0x3c, 0x88, 0x3c, 0xfb, 0x3b, + 0xb2, 0x35, 0x89, 0x3b, 0xb2, 0xab, 0x08, 0x3d, 0xdb, 0x32, 0xa8, 0x3c, 0x00, 0xbc, 0x5e, 0x3b, + 0xb5, 0x40, 0xd3, 0x3c, 0x6b, 0x02, 0x56, 0x3c, 0x5e, 0x5d, 0x95, 0xbb, 0x1f, 0x37, 0xda, 0x3c, + 0xed, 0xd6, 0x7e, 0x3b, 0x92, 0xc4, 0x9d, 0x3c, 0xc9, 0xd9, 0x78, 0x3c, 0xa4, 0x3f, 0x85, 0x3c, + 0xe0, 0x0b, 0x00, 0x3d, 0x6a, 0x18, 0x1c, 0x3c, 0x7b, 0x3a, 0x4a, 0x3c, 0x28, 0xd6, 0x8d, 0x3c, + 0x79, 0x6a, 0xc4, 0x3c, 0x71, 0xbc, 0x86, 0x3c, 0xdc, 0x89, 0x71, 0x3b, 0x7c, 0x01, 0xf6, 0x3c, + 0x38, 0xea, 0xb1, 0x3b, 0x92, 0xf1, 0xd3, 0x3c, 0x73, 0xe3, 0xc0, 0x3c, 0x46, 0xf6, 0xcc, 0x3c, + 0xcd, 0xd5, 0x02, 0x3d, 0xe0, 0xa4, 0x26, 0x3c, 0x05, 0xde, 0x25, 0x3c, 0xd1, 0xe1, 0xa3, 0x3c, + 0x00, 0x07, 0x13, 0xba, 0xe7, 0xd1, 0x0a, 0x3d, 0x7e, 0x27, 0x6a, 0x3c, 0x92, 0x68, 0xb1, 0x3c, + 0x31, 0xe2, 0x97, 0x3c, 0x22, 0x0d, 0xfc, 0x3b, 0x6e, 0xae, 0xa3, 0x3c, 0xbe, 0x32, 0xb3, 0x3c, + 0xdb, 0x29, 0x5a, 0x3c, 0xda, 0x47, 0x1f, 0x3c, 0x5c, 0x23, 0x59, 0x3c, 0x8e, 0x53, 
0xdd, 0x3c, + 0x4f, 0x1a, 0x55, 0x3c, 0x76, 0x50, 0x40, 0x3c, 0x8c, 0xf6, 0x55, 0x3c, 0x47, 0x92, 0x7f, 0x3b, + 0x0a, 0xd5, 0xc6, 0x3c, 0x3a, 0x96, 0x2f, 0x3c, 0xa0, 0xf3, 0xc9, 0x3c, 0xc0, 0x2c, 0xf5, 0x3b, + 0x2c, 0x83, 0x1c, 0x3c, 0x62, 0x9f, 0x66, 0x3c, 0x6e, 0xa2, 0x59, 0x3b, 0x26, 0xb0, 0x36, 0x3c, + 0x46, 0xd6, 0xb3, 0x3b, 0x8f, 0x64, 0x7e, 0x3b, 0x4d, 0x50, 0x98, 0x3c, 0x2c, 0x28, 0x8e, 0x3c, + 0xf3, 0x75, 0xa9, 0x3c, 0xcc, 0xaa, 0xe7, 0x3c, 0x03, 0xc7, 0xd7, 0x3c, 0xec, 0x3d, 0xe2, 0x3b, + 0x02, 0x97, 0x16, 0x3c, 0x35, 0xe4, 0xa5, 0x3c, 0xac, 0x0e, 0xd4, 0x3c, 0x94, 0xc2, 0xd6, 0x3c, + 0xa4, 0xe5, 0x56, 0x3c, 0xf9, 0x35, 0x8c, 0x3c, 0x80, 0x38, 0x3b, 0xb7, 0xbc, 0x87, 0x94, 0x3c, + 0x3d, 0x04, 0x86, 0x3c, 0x5e, 0xad, 0xf5, 0x3b, 0xc3, 0xf0, 0x23, 0xbc, 0xd6, 0x3b, 0x3b, 0xba, + 0x03, 0x14, 0x83, 0x3a, 0xce, 0xdd, 0xba, 0x3c, 0xca, 0xb8, 0xdb, 0x3b, 0x79, 0xab, 0xf7, 0x3b, + 0x9e, 0xcb, 0x8e, 0x3c, 0x03, 0xf5, 0x4b, 0x3c, 0x57, 0x90, 0x2b, 0x3c, 0xef, 0x14, 0x4e, 0x3b, + 0xcd, 0xa3, 0xd9, 0x3c, 0x40, 0xf5, 0x74, 0x3a, 0xad, 0xfb, 0x4e, 0xbc, 0x42, 0xe4, 0xd4, 0xba, + 0x89, 0xe3, 0xbf, 0xbb, 0xda, 0xe5, 0xee, 0x3b, 0x26, 0xb8, 0xc9, 0x3b, 0x76, 0x6a, 0xa3, 0x3c, + 0x82, 0xc6, 0xf5, 0x3b, 0x19, 0xa9, 0xe5, 0x3b, 0x82, 0xc1, 0x65, 0xba, 0x78, 0x22, 0xf6, 0x3b, + 0x68, 0xff, 0x7d, 0x3a, 0xa8, 0xbe, 0x57, 0x3c, 0xef, 0x45, 0x80, 0x3c, 0xf8, 0xf8, 0x26, 0x3c, + 0xa3, 0x7e, 0xac, 0x3c, 0xc0, 0x7c, 0x79, 0xba, 0xc0, 0x09, 0x64, 0x3c, 0x1e, 0x73, 0x87, 0x3c, + 0xc5, 0x9c, 0x53, 0x3c, 0xcd, 0xff, 0x27, 0x3c, 0x10, 0x8c, 0xba, 0x3b, 0x25, 0xdc, 0x61, 0x3c, + 0x6b, 0x1f, 0x4c, 0xbb, 0x70, 0x78, 0x77, 0x3c, 0xd5, 0x36, 0xdb, 0xbb, 0x0a, 0x4c, 0x18, 0x3c, + 0x7a, 0xd4, 0x94, 0x3a, 0xa0, 0xea, 0xcb, 0x38, 0xd6, 0xaf, 0xf9, 0x3b, 0x5e, 0x1a, 0xf9, 0xbb, + 0x7c, 0x0d, 0xea, 0x3a, 0x5a, 0x11, 0x2e, 0x3b, 0x7c, 0x56, 0xc1, 0xbb, 0x74, 0x62, 0xe0, 0x3b, + 0x92, 0xbf, 0xe8, 0xbb, 0x0a, 0xa9, 0x38, 0x3c, 0xf6, 0x3b, 0xdb, 0x3a, 0xf2, 0x1a, 0x46, 0x3b, + 0x77, 0x22, 0x4a, 0xbc, 
0xd8, 0xee, 0x90, 0x3c, 0x5b, 0xca, 0xbc, 0x3c, 0x7c, 0x2b, 0xb6, 0x3a, + 0x54, 0x6c, 0x9d, 0x3a, 0x81, 0x3d, 0xc4, 0x3c, 0x46, 0xe3, 0xee, 0x3b, 0xf4, 0x71, 0x7b, 0xbb, + 0xe1, 0x69, 0x4d, 0xbb, 0x52, 0xfe, 0xf2, 0x3b, 0xf7, 0xea, 0x2a, 0x3c, 0x8a, 0xd9, 0x1d, 0x3c, + 0x87, 0x34, 0xe3, 0xbe, 0x9a, 0xca, 0x8a, 0x3e, 0x6e, 0x9e, 0xca, 0xbe, 0xf0, 0xfb, 0x29, 0xbd, + 0x00, 0xf0, 0xc3, 0x38, 0x24, 0x4a, 0x38, 0x3f, 0xb2, 0xe0, 0x68, 0x3f, 0x44, 0x2b, 0x5e, 0x3f, + 0xf0, 0x53, 0x80, 0x3f, 0x0a, 0x43, 0x4b, 0x3f, 0x78, 0xa1, 0x12, 0x3e, 0x27, 0xaf, 0x2e, 0x3f, + 0x3b, 0xfc, 0xa0, 0x3e, 0x62, 0x09, 0x76, 0x3f, 0x01, 0x13, 0xce, 0x3e, 0x08, 0x4f, 0x81, 0x3e, + 0xa6, 0xce, 0xea, 0x3e, 0x32, 0xf1, 0xa8, 0x3e, 0xa1, 0x02, 0x73, 0x3f, 0xc2, 0x3f, 0x23, 0x3e, + 0x7f, 0xcd, 0x5a, 0x3f, 0xbf, 0x4e, 0x83, 0x3f, 0x52, 0x8f, 0x4d, 0x3e, 0x36, 0x4d, 0x53, 0x3e, + 0x7b, 0x58, 0x07, 0xbf, 0xe9, 0xc5, 0x91, 0x3f, 0x06, 0x0f, 0x3e, 0x3f, 0x65, 0x77, 0xf3, 0x3e, + 0x5a, 0xe7, 0x63, 0x3e, 0xcd, 0x14, 0x9f, 0x3e, 0xe0, 0x39, 0x6d, 0x3f, 0x3e, 0xb4, 0x02, 0x3f, + 0xc5, 0x02, 0xbd, 0x3e, 0x6c, 0x0d, 0x04, 0x3f, 0xc0, 0xe4, 0x26, 0x3c, 0xc3, 0x56, 0xd3, 0x3e, + 0x79, 0x21, 0x92, 0xbe, 0x2b, 0xa4, 0xd8, 0x3e, 0x52, 0x7d, 0x2f, 0x3e, 0xae, 0xfc, 0xc6, 0x3e, + 0x62, 0xcd, 0x08, 0x3f, 0xd6, 0xa2, 0x9b, 0xbe, 0x0b, 0x18, 0xea, 0x3e, 0x78, 0xb5, 0x81, 0x3e, + 0xcc, 0xaa, 0x1d, 0x3f, 0xed, 0x1b, 0x98, 0x3e, 0x20, 0x49, 0x3e, 0x3e, 0x52, 0x62, 0x95, 0x3f, + 0xd3, 0x01, 0x17, 0x3f, 0x50, 0x11, 0x4a, 0x3f, 0x26, 0xd6, 0xaa, 0x3e, 0xbb, 0x03, 0x0c, 0x3d, + 0x01, 0xcc, 0xcf, 0x3e, 0xc0, 0xcd, 0x3f, 0x3d, 0x62, 0xd2, 0x8b, 0x3f, 0x28, 0xce, 0xb6, 0x3e, + 0x98, 0x76, 0x3d, 0x3e, 0x3f, 0x27, 0x54, 0x3f, 0x1c, 0x96, 0x53, 0x3f, 0x28, 0x60, 0x45, 0x3f, + 0x1f, 0x7a, 0xdc, 0x3e, 0x4c, 0xec, 0x62, 0x3f, 0x5a, 0xeb, 0x37, 0x3e, 0x0c, 0x80, 0xf3, 0x3d, + 0x36, 0x71, 0x17, 0xbe, 0x6c, 0x6d, 0x37, 0xbe, 0x74, 0x8c, 0xe2, 0xbd, 0x54, 0x4b, 0xb7, 0xbd, + 0xb0, 0x6c, 0xf4, 0xbd, 0x85, 0xc9, 0xdb, 0xbd, 0x34, 0x1a, 
0x1f, 0xbe, 0xb3, 0xe9, 0x14, 0xbe, + 0x4e, 0x2e, 0x1b, 0xbe, 0x96, 0x74, 0x11, 0xbe, 0x2e, 0x55, 0x2c, 0xbe, 0x93, 0xaf, 0xf6, 0xbd, + 0x80, 0x87, 0x80, 0xbe, 0x58, 0xe6, 0x10, 0xbe, 0x1a, 0x64, 0xd3, 0xbd, 0x77, 0xdc, 0x3d, 0xbe, + 0x70, 0x23, 0x29, 0xbe, 0xf8, 0xc8, 0x2f, 0xbe, 0xa0, 0x24, 0x0f, 0xbe, 0x24, 0x4d, 0x3c, 0xbe, + 0xf8, 0xf0, 0x4f, 0xbe, 0x1a, 0x7b, 0x52, 0xbe, 0x7c, 0x30, 0x33, 0xbe, 0x4a, 0xff, 0xd6, 0xbd, + 0x36, 0xb2, 0x03, 0xbe, 0xd4, 0x27, 0x50, 0xbe, 0xdb, 0xd3, 0xe7, 0xbd, 0x20, 0xdc, 0x5e, 0xbe, + 0xf7, 0x35, 0x84, 0xbe, 0x74, 0xcd, 0xb3, 0xbd, 0xa6, 0x6d, 0xcb, 0xbd, 0xae, 0x0f, 0x31, 0xbe, + 0xa3, 0x71, 0x08, 0xbe, 0xb6, 0x6a, 0x79, 0xbe, 0xba, 0x07, 0x15, 0xbe, 0x6e, 0x1c, 0x22, 0xbe, + 0x8d, 0xcf, 0x5a, 0xbd, 0xfa, 0x4b, 0xed, 0xbd, 0xa5, 0x3c, 0x2e, 0xbe, 0x2a, 0x05, 0x29, 0xbe, + 0x92, 0xcf, 0x8a, 0xbd, 0x43, 0xb1, 0x33, 0xbe, 0x84, 0x8d, 0x53, 0xbe, 0x8f, 0x3b, 0x14, 0xbe, + 0xd5, 0x14, 0x4a, 0xbd, 0x63, 0xd8, 0x1f, 0xbe, 0x10, 0xe4, 0x73, 0xbe, 0x1d, 0x41, 0x17, 0xbe, + 0x65, 0xd7, 0x0f, 0xbe, 0xf2, 0xf7, 0x8e, 0xbe, 0x1f, 0x3d, 0xeb, 0xbd, 0xcf, 0x77, 0xfc, 0xbd, + 0xa6, 0x1e, 0x3d, 0xbe, 0x29, 0x1b, 0x5b, 0xbe, 0xd6, 0x0a, 0x69, 0xbe, 0x03, 0xd2, 0x12, 0xbe, + 0x16, 0x3f, 0x02, 0xbe, 0xaa, 0x90, 0x2b, 0xbe, 0xf8, 0xe4, 0x1c, 0xbe, 0x9c, 0x3b, 0x9e, 0xbd, + 0xff, 0x9c, 0x14, 0xbe, 0x64, 0x12, 0x18, 0xbe, 0x06, 0x28, 0x57, 0xbd, 0x79, 0xa7, 0x04, 0xbe, + 0x90, 0x69, 0x9b, 0x3d, 0x8f, 0x25, 0xe0, 0x3d, 0x94, 0x6f, 0xbd, 0x3d, 0x82, 0x9e, 0xee, 0x3d, + 0x28, 0x60, 0xb6, 0x3d, 0x77, 0xd4, 0xd9, 0x3d, 0x04, 0x88, 0xcc, 0x3d, 0x7a, 0x7f, 0xd0, 0x3d, + 0xa0, 0x1d, 0xc9, 0x3d, 0x83, 0xf4, 0x9b, 0x3d, 0x5b, 0xcf, 0xba, 0x3d, 0x24, 0xb4, 0xbb, 0x3d, + 0x1c, 0x28, 0x0b, 0x3e, 0x24, 0xca, 0xef, 0x3d, 0x41, 0xb2, 0xdb, 0x3d, 0x7d, 0xb8, 0x07, 0x3e, + 0x47, 0x81, 0x07, 0x3e, 0x3d, 0xed, 0xd8, 0x3d, 0xab, 0xd6, 0xe7, 0x3d, 0x1d, 0x6a, 0xb9, 0x3d, + 0x46, 0xbc, 0x02, 0x3e, 0xee, 0x45, 0xe3, 0x3d, 0xea, 0x6e, 0x03, 0x3e, 0x4a, 0x09, 0xbf, 0x3d, + 
0x24, 0xeb, 0xb6, 0x3d, 0xda, 0xbe, 0xf8, 0x3d, 0x3a, 0x72, 0xd3, 0x3d, 0xb0, 0xd3, 0xfe, 0x3d, + 0x8e, 0xdf, 0x02, 0x3e, 0x0d, 0xdb, 0xb6, 0x3d, 0x66, 0x50, 0xc1, 0x3d, 0x3e, 0x02, 0xbd, 0x3d, + 0x28, 0xd7, 0xd7, 0x3d, 0x8c, 0xea, 0xea, 0x3d, 0xd0, 0x12, 0xa3, 0x3d, 0x37, 0x4e, 0xb5, 0x3d, + 0x11, 0x50, 0xa0, 0x3d, 0x92, 0x1b, 0xf2, 0x3d, 0xf6, 0xab, 0xbf, 0x3d, 0xc4, 0x5a, 0xe7, 0x3d, + 0xcf, 0x5c, 0xda, 0x3d, 0x9f, 0xfb, 0xe7, 0x3d, 0x9e, 0xc0, 0xe1, 0x3d, 0x86, 0xde, 0xe4, 0x3d, + 0x0d, 0xfe, 0x80, 0x3d, 0xc4, 0x97, 0xdb, 0x3d, 0x7a, 0x64, 0xf2, 0x3d, 0x03, 0xd9, 0x08, 0x3e, + 0x1c, 0x89, 0xc4, 0x3d, 0x69, 0xf8, 0xe9, 0x3d, 0x0a, 0x02, 0xe2, 0x3d, 0xf3, 0x5f, 0xbd, 0x3d, + 0xf0, 0x0a, 0x00, 0x3e, 0x3d, 0xaf, 0xe1, 0x3d, 0xc4, 0xd9, 0xe7, 0x3d, 0xfc, 0xa6, 0xa7, 0x3d, + 0xac, 0x98, 0xc3, 0x3d, 0xf2, 0x39, 0xf2, 0x3d, 0x03, 0x0b, 0xc4, 0x3d, 0x2c, 0x43, 0x95, 0x3d, + 0xda, 0x7b, 0xde, 0x3d, 0x32, 0x19, 0xc6, 0x3d, 0x9c, 0xf1, 0x78, 0x3d, 0x87, 0xdc, 0xd9, 0x3d, + 0xba, 0x18, 0xdf, 0x3c, 0xe6, 0x62, 0x43, 0x3d, 0x5c, 0x49, 0x71, 0x3c, 0x98, 0x0d, 0xa2, 0x3c, + 0xe0, 0x1d, 0x69, 0x3a, 0x54, 0x1a, 0x18, 0x3d, 0x8a, 0xc2, 0x99, 0x3d, 0xec, 0xc3, 0x53, 0x3d, + 0x03, 0xc3, 0x51, 0x3d, 0xf8, 0x28, 0x21, 0x3d, 0xc2, 0x03, 0x68, 0x3d, 0xd0, 0x56, 0x1e, 0x3c, + 0x32, 0x56, 0x71, 0x3d, 0xdc, 0xa4, 0x2e, 0x3d, 0x54, 0xda, 0x73, 0x3c, 0xab, 0x9b, 0x51, 0x3d, + 0xc3, 0xbb, 0x42, 0x3d, 0x64, 0x5a, 0x11, 0x3d, 0x6e, 0xb5, 0x40, 0x3d, 0xd3, 0xe5, 0xd4, 0x3c, + 0x5a, 0xcf, 0xa5, 0x3c, 0x89, 0xbc, 0x41, 0x3d, 0xa0, 0xce, 0x05, 0x3d, 0x8e, 0x31, 0xc1, 0x3c, + 0x98, 0x58, 0xbb, 0xba, 0xfa, 0x57, 0xa1, 0x3d, 0x5a, 0x0d, 0x93, 0x3c, 0xbf, 0x91, 0x78, 0x3d, + 0x0d, 0xcf, 0x39, 0x3d, 0x80, 0xd6, 0x83, 0xba, 0x2b, 0x6d, 0x80, 0x3d, 0x06, 0x36, 0x48, 0x3d, + 0xc6, 0x52, 0xf3, 0x3c, 0x00, 0xcd, 0xb5, 0x3d, 0xac, 0x79, 0x34, 0x3d, 0x4a, 0x18, 0x0b, 0x3d, + 0x1e, 0xa2, 0x7d, 0x3c, 0x62, 0xa7, 0xa4, 0x3c, 0xe2, 0x70, 0x39, 0x3d, 0xdb, 0x28, 0x19, 0x3d, + 0xd0, 0xb2, 0x2b, 0xbb, 0x14, 0xa3, 
0x80, 0x3d, 0x70, 0xc0, 0x5f, 0x3d, 0x61, 0x8a, 0x85, 0x3d, + 0x78, 0x8a, 0x85, 0x3c, 0xa2, 0x75, 0xda, 0x3c, 0x97, 0x84, 0x5f, 0x3d, 0xa8, 0xb8, 0x13, 0x3d, + 0x7f, 0x1a, 0xac, 0x3c, 0x3c, 0x5b, 0x9f, 0x3d, 0x6a, 0x7a, 0x75, 0x3c, 0x2b, 0x0c, 0x24, 0xbc, + 0xa5, 0x14, 0x59, 0x3d, 0x7f, 0x94, 0x07, 0x3d, 0x46, 0x6d, 0x3e, 0x3d, 0xa8, 0x7c, 0xbc, 0x3c, + 0x41, 0x0f, 0xb9, 0x3c, 0x64, 0x15, 0x89, 0x3d, 0xb0, 0x10, 0xad, 0xba, 0x04, 0xd2, 0x79, 0x3c, + 0xf8, 0x78, 0xf6, 0x3c, 0x44, 0x24, 0xeb, 0x3c, 0x9c, 0xe7, 0xa6, 0x3b, 0x38, 0x9a, 0x22, 0x3d, + 0x0a, 0x09, 0x00, 0xbc, 0xb1, 0x39, 0x52, 0xbc, 0x72, 0xf9, 0x32, 0xbc, 0x7a, 0xab, 0x7d, 0xbc, + 0x3c, 0x66, 0x1a, 0xbc, 0xe2, 0x34, 0x69, 0xbc, 0x43, 0x16, 0x5e, 0xbc, 0x62, 0x21, 0x55, 0xbc, + 0x31, 0x87, 0x48, 0xbc, 0x4b, 0xcd, 0x0d, 0xbc, 0x04, 0xaa, 0x32, 0xbc, 0xe4, 0x60, 0x27, 0xbc, + 0x3b, 0x25, 0x75, 0xbc, 0x7e, 0x65, 0x75, 0xbc, 0x10, 0x75, 0x5b, 0xbc, 0x2a, 0x32, 0x85, 0xbc, + 0xca, 0xf9, 0x88, 0xbc, 0x72, 0xa1, 0x43, 0xbc, 0xb0, 0x67, 0x70, 0xbc, 0x1c, 0x5c, 0x0f, 0xbc, + 0xac, 0x71, 0x5b, 0xbc, 0xf8, 0x37, 0x47, 0xbc, 0x32, 0xa2, 0x76, 0xbc, 0xf6, 0x3a, 0x3f, 0xbc, + 0xe6, 0x17, 0x14, 0xbc, 0x0a, 0x04, 0x7c, 0xbc, 0xb6, 0x9e, 0x4e, 0xbc, 0xc8, 0xa5, 0x6c, 0xbc, + 0x94, 0x3b, 0x52, 0xbc, 0xcd, 0xa7, 0x2a, 0xbc, 0x93, 0x19, 0x65, 0xbc, 0x88, 0x6b, 0x2c, 0xbc, + 0x20, 0x92, 0x52, 0xbc, 0xf0, 0x98, 0x5d, 0xbc, 0xed, 0x4c, 0x18, 0xbc, 0x28, 0xed, 0x1e, 0xbc, + 0xd8, 0xe1, 0x2f, 0xbc, 0x2a, 0xb7, 0x73, 0xbc, 0xfc, 0x2a, 0x2e, 0xbc, 0x02, 0x1d, 0x5a, 0xbc, + 0x62, 0xdf, 0x5e, 0xbc, 0x92, 0xff, 0x69, 0xbc, 0x38, 0xd3, 0x4a, 0xbc, 0x7b, 0x00, 0x79, 0xbc, + 0x66, 0x33, 0x0d, 0xbc, 0x46, 0x1b, 0x48, 0xbc, 0xdc, 0x7f, 0x4d, 0xbc, 0xa6, 0xa7, 0x8a, 0xbc, + 0x70, 0x56, 0x30, 0xbc, 0xd8, 0x01, 0x40, 0xbc, 0xce, 0xc0, 0x5c, 0xbc, 0xfc, 0xb8, 0x17, 0xbc, + 0xd8, 0xda, 0x79, 0xbc, 0xb4, 0x14, 0x35, 0xbc, 0x54, 0x04, 0x40, 0xbc, 0x2d, 0xba, 0x0d, 0xbc, + 0xc1, 0xc1, 0x37, 0xbc, 0xf0, 0x00, 0x7e, 0xbc, 0x31, 0x7e, 0x16, 0xbc, 
0x14, 0x69, 0x15, 0xbc, + 0x0e, 0x5e, 0x54, 0xbc, 0x97, 0x20, 0x34, 0xbc, 0xe8, 0xb3, 0xf9, 0xbb, 0x8e, 0x34, 0x5f, 0xbc, + 0x88, 0x4c, 0x41, 0xbb, 0x02, 0x20, 0xda, 0xbb, 0xa9, 0x37, 0x4e, 0xbb, 0xc9, 0x5e, 0xbe, 0xbb, + 0x1c, 0x4b, 0x1f, 0xba, 0x49, 0x91, 0xef, 0xbb, 0x47, 0x78, 0x35, 0xbc, 0xc4, 0x09, 0x03, 0xbc, + 0x96, 0x14, 0xf7, 0xbb, 0xf2, 0x00, 0xa3, 0xbb, 0x46, 0x0b, 0xf4, 0xbb, 0xa3, 0x32, 0x04, 0xbb, + 0x12, 0x0f, 0xee, 0xbb, 0x3f, 0xd8, 0xf6, 0xbb, 0xc8, 0x2b, 0x88, 0xbb, 0x03, 0x15, 0x05, 0xbc, + 0x72, 0x09, 0x07, 0xbc, 0x06, 0x76, 0xa0, 0xbb, 0x87, 0x04, 0x04, 0xbc, 0x55, 0x6e, 0x16, 0xbb, + 0xa0, 0xb7, 0x2d, 0xbb, 0x14, 0xca, 0xbd, 0xbb, 0xf8, 0xb9, 0xb3, 0xbb, 0x7a, 0x14, 0x9a, 0xbb, + 0x40, 0x51, 0x15, 0x38, 0x24, 0x18, 0x36, 0xbc, 0x33, 0xbf, 0x85, 0xbb, 0xba, 0x36, 0x04, 0xbc, + 0x6d, 0x51, 0x96, 0xbb, 0x9e, 0xc8, 0xb5, 0xba, 0x97, 0x30, 0x31, 0xbc, 0x4c, 0x98, 0xca, 0xbb, + 0x4f, 0x20, 0xad, 0xbb, 0xe7, 0x1a, 0x33, 0xbc, 0x48, 0x10, 0xbc, 0xbb, 0x33, 0x0a, 0x8b, 0xbb, + 0x9d, 0xfd, 0x91, 0xbb, 0x91, 0xeb, 0xa5, 0xbb, 0x8a, 0x03, 0xbe, 0xbb, 0x44, 0x96, 0xbc, 0xbb, + 0x74, 0x8f, 0x2f, 0xbb, 0x55, 0x52, 0x17, 0xbc, 0x99, 0x72, 0xdf, 0xbb, 0x2a, 0xbd, 0x2c, 0xbc, + 0x1a, 0x6e, 0x81, 0xbb, 0x96, 0x3f, 0x88, 0xbb, 0x39, 0xfc, 0xc9, 0xbb, 0x4a, 0xe2, 0xea, 0xbb, + 0x9a, 0x1e, 0x56, 0xbb, 0xe7, 0x50, 0x04, 0xbc, 0x3a, 0xfd, 0x80, 0xbb, 0xbe, 0x74, 0x81, 0x3a, + 0x99, 0xd1, 0x03, 0xbc, 0xa1, 0x0f, 0x5b, 0xbb, 0x0e, 0x6a, 0xa5, 0xbb, 0x12, 0x36, 0x35, 0xbb, + 0x56, 0x95, 0x80, 0xbb, 0xb2, 0xe3, 0x29, 0xbc, 0xa0, 0x13, 0x08, 0x3a, 0x2c, 0xee, 0x5b, 0xbb, + 0x89, 0x59, 0xa7, 0xbb, 0x91, 0x5a, 0x89, 0xbb, 0xf5, 0x79, 0x06, 0xbb, 0x0c, 0xea, 0xe3, 0xbb, + 0x9e, 0xdc, 0xa2, 0xbb, 0x01, 0xc7, 0xd4, 0xbb, 0xd6, 0x0a, 0xa8, 0xbb, 0x82, 0xf2, 0xb7, 0xbb, + 0xb3, 0x3d, 0xae, 0xbb, 0xfc, 0x18, 0xae, 0xbb, 0x48, 0xd2, 0xb0, 0xbb, 0xaa, 0x57, 0xb7, 0xbb, + 0xd6, 0x74, 0xb6, 0xbb, 0x86, 0x08, 0x9b, 0xbb, 0x9a, 0xcb, 0xb4, 0xbb, 0x55, 0x72, 0xae, 0xbb, + 0xe4, 0xf9, 
0x0b, 0xbc, 0x74, 0xb0, 0xcb, 0xbb, 0xa8, 0x17, 0xb6, 0xbb, 0xcd, 0x29, 0xf3, 0xbb, + 0xfe, 0x56, 0xe9, 0xbb, 0x6c, 0x5f, 0xd1, 0xbb, 0xdc, 0x2c, 0xc4, 0xbb, 0xfd, 0x07, 0xc9, 0xbb, + 0x3d, 0xbf, 0x01, 0xbc, 0x29, 0x3f, 0xe5, 0xbb, 0x29, 0xe8, 0xef, 0xbb, 0x71, 0x4b, 0xa2, 0xbb, + 0x13, 0x93, 0xb4, 0xbb, 0xb1, 0xfe, 0xe3, 0xbb, 0xf6, 0x55, 0xb5, 0xbb, 0xd8, 0xe5, 0xf6, 0xbb, + 0xf6, 0xbe, 0x0b, 0xbc, 0xa0, 0x12, 0x9e, 0xbb, 0xfb, 0x8c, 0x90, 0xbb, 0x1c, 0xd1, 0xbb, 0xbb, + 0xb2, 0x1b, 0xbe, 0xbb, 0xa3, 0x9c, 0xed, 0xbb, 0x82, 0x47, 0x9f, 0xbb, 0x44, 0x55, 0xb5, 0xbb, + 0x0e, 0x0b, 0x6e, 0xbb, 0xa2, 0x4f, 0xc8, 0xbb, 0xa2, 0x62, 0xbd, 0xbb, 0x10, 0xa4, 0xd5, 0xbb, + 0x5b, 0x1b, 0xa9, 0xbb, 0x5e, 0x4c, 0xd1, 0xbb, 0x89, 0xe7, 0xe1, 0xbb, 0xbd, 0xab, 0xbd, 0xbb, + 0x3c, 0x53, 0x43, 0xbb, 0x62, 0x15, 0xce, 0xbb, 0xaa, 0x28, 0xfd, 0xbb, 0x36, 0x1d, 0xe6, 0xbb, + 0x16, 0x08, 0xba, 0xbb, 0x5f, 0x81, 0x02, 0xbc, 0xbb, 0x32, 0xc0, 0xbb, 0x06, 0x0d, 0xb9, 0xbb, + 0x9d, 0xce, 0xe8, 0xbb, 0x95, 0xb5, 0xee, 0xbb, 0x61, 0x5b, 0xf4, 0xbb, 0xa2, 0x78, 0xa9, 0xbb, + 0xd1, 0x91, 0xb1, 0xbb, 0xa5, 0x6b, 0xd1, 0xbb, 0xfa, 0xd7, 0xc9, 0xbb, 0xc4, 0x03, 0x7b, 0xbb, + 0x54, 0x6d, 0xc8, 0xbb, 0xfc, 0x34, 0xbc, 0xbb, 0x96, 0x91, 0x4a, 0xbb, 0x9d, 0x34, 0xb9, 0xbb, + 0x6b, 0x4d, 0x26, 0xbb, 0xa2, 0x3a, 0x5f, 0xbb, 0x3a, 0x7d, 0xa2, 0xba, 0x66, 0xb3, 0x4b, 0xba, + 0x45, 0x61, 0x6e, 0xba, 0xf9, 0xb2, 0xf2, 0xba, 0x4c, 0x34, 0x88, 0xbb, 0x1b, 0x80, 0x4a, 0xbb, + 0xe1, 0xef, 0x52, 0xbb, 0x68, 0x77, 0x3d, 0xbb, 0x71, 0x4e, 0x78, 0xbb, 0x6b, 0x78, 0xa0, 0xba, + 0x15, 0xd6, 0x99, 0xbb, 0x29, 0x6e, 0x23, 0xbb, 0xe0, 0xc3, 0x75, 0xba, 0xfa, 0xb4, 0x5b, 0xbb, + 0xb4, 0xd9, 0x3c, 0xbb, 0xf9, 0xf1, 0x3b, 0xbb, 0x64, 0x93, 0x2f, 0xbb, 0xea, 0x8e, 0x3e, 0xbb, + 0x6b, 0xf5, 0x26, 0xbb, 0x76, 0xd8, 0x79, 0xbb, 0xe6, 0x45, 0x26, 0xbb, 0xe0, 0x6a, 0xc0, 0xba, + 0xd7, 0x2e, 0x7f, 0xba, 0xcb, 0x12, 0x9e, 0xbb, 0x6a, 0x5b, 0xa6, 0xba, 0xca, 0x76, 0x8e, 0xbb, + 0x0f, 0x6b, 0x90, 0xbb, 0x66, 0x3e, 0xa2, 0xb9, 
0xfe, 0x47, 0x39, 0xbb, 0x72, 0x63, 0x69, 0xbb, + 0x73, 0x33, 0x04, 0xbb, 0x9b, 0x1e, 0xc3, 0xbb, 0x74, 0x05, 0x4a, 0xbb, 0x6b, 0xda, 0x37, 0xbb, + 0xe6, 0x13, 0x01, 0xba, 0x05, 0x39, 0x9e, 0xba, 0xd6, 0x72, 0x5c, 0xbb, 0x62, 0xa4, 0x33, 0xbb, + 0x9c, 0x02, 0x9e, 0x39, 0x42, 0x3e, 0x7d, 0xbb, 0x9d, 0xcd, 0x86, 0xbb, 0xd8, 0xd0, 0x62, 0xbb, + 0x2b, 0x6a, 0x29, 0xba, 0xcc, 0x33, 0x14, 0xbb, 0xea, 0xa6, 0x94, 0xbb, 0x1b, 0x7e, 0x0c, 0xbb, + 0x30, 0xe5, 0xfc, 0xba, 0xce, 0x51, 0xc9, 0xbb, 0x28, 0xc2, 0x8f, 0xba, 0x93, 0x2b, 0x09, 0xba, + 0xba, 0x8a, 0x65, 0xbb, 0x92, 0xc9, 0x60, 0xbb, 0x13, 0x0c, 0x87, 0xbb, 0xf3, 0x17, 0x12, 0xbb, + 0xef, 0x6a, 0xe8, 0xba, 0xce, 0x9c, 0x7a, 0xbb, 0x3b, 0x62, 0xab, 0xba, 0xc5, 0xa2, 0x78, 0xba, + 0x16, 0x78, 0x0f, 0xbb, 0x5a, 0x83, 0x19, 0xbb, 0x3e, 0xd5, 0xa0, 0xb9, 0x29, 0xea, 0x17, 0xbb, + 0x02, 0x49, 0xac, 0xbd, 0x76, 0xa4, 0x32, 0xbe, 0x9e, 0xc5, 0x05, 0xbe, 0x7a, 0x60, 0x5e, 0xbe, + 0xb2, 0x04, 0xaa, 0xbd, 0x12, 0xb9, 0x5c, 0xbe, 0xac, 0xf9, 0x73, 0xbe, 0xea, 0xc3, 0x4c, 0xbe, + 0xb8, 0x10, 0x3d, 0xbe, 0x10, 0x4b, 0xf1, 0xbd, 0x36, 0x56, 0x28, 0xbe, 0x28, 0x2e, 0xdc, 0xbd, + 0xb9, 0xcb, 0x3f, 0xbe, 0xcc, 0x19, 0x5f, 0xbe, 0x30, 0xc1, 0x2f, 0xbe, 0x32, 0x6a, 0x69, 0xbe, + 0x0c, 0x49, 0x75, 0xbe, 0xc4, 0x04, 0x16, 0xbe, 0xee, 0x12, 0x62, 0xbe, 0xc0, 0x55, 0xa0, 0xbd, + 0xf6, 0xd8, 0x04, 0xbe, 0x9e, 0x2d, 0x1a, 0xbe, 0x4c, 0xcd, 0x3f, 0xbe, 0xbe, 0xc0, 0x21, 0xbe, + 0x87, 0x96, 0x8d, 0xbd, 0xdb, 0x31, 0x7d, 0xbe, 0x54, 0x52, 0x23, 0xbe, 0x02, 0xa4, 0x4b, 0xbe, + 0x88, 0x8f, 0x06, 0xbe, 0xde, 0xfc, 0xe2, 0xbd, 0x38, 0x1f, 0x82, 0xbe, 0x3b, 0x31, 0x14, 0xbe, + 0xb2, 0xcf, 0x2e, 0xbe, 0xc6, 0x63, 0x5b, 0xbe, 0xe1, 0xf1, 0x07, 0xbe, 0x75, 0x32, 0xf0, 0xbd, + 0xfc, 0x50, 0x21, 0xbe, 0x1a, 0x08, 0x48, 0xbe, 0x4e, 0xb4, 0x11, 0xbe, 0xee, 0x67, 0x31, 0xbe, + 0x56, 0x37, 0x2a, 0xbe, 0x46, 0x01, 0x62, 0xbe, 0x6c, 0x47, 0x28, 0xbe, 0x3a, 0xbd, 0x81, 0xbe, + 0x8b, 0x27, 0x04, 0xbe, 0x6f, 0xa3, 0x14, 0xbe, 0x8e, 0x87, 0x1a, 0xbe, 0x08, 0xbf, 
0x6f, 0xbe, + 0x9e, 0x54, 0xfb, 0xbd, 0x34, 0x8e, 0x1c, 0xbe, 0xf3, 0x28, 0x2b, 0xbe, 0x52, 0xf9, 0x7e, 0xbd, + 0x95, 0x15, 0x5d, 0xbe, 0x07, 0x1b, 0xde, 0xbd, 0x9e, 0xe7, 0x06, 0xbe, 0xd0, 0x99, 0xbd, 0xbd, + 0xf6, 0xd8, 0x0d, 0xbe, 0x97, 0x28, 0x7f, 0xbe, 0x7d, 0x72, 0x72, 0xbd, 0x98, 0xde, 0xf7, 0xbd, + 0x5c, 0x00, 0x2b, 0xbe, 0x28, 0xec, 0x08, 0xbe, 0x5f, 0xf4, 0xc3, 0xbd, 0x5e, 0xe7, 0x4b, 0xbe, + 0x95, 0xc4, 0xf0, 0xbd, 0xb2, 0x15, 0xb7, 0xbd, 0x44, 0xeb, 0x22, 0x3d, 0xde, 0xc6, 0x95, 0xbd, + 0x4e, 0x5f, 0xc8, 0xbc, 0x46, 0xdb, 0x9e, 0xbd, 0xef, 0xa0, 0x41, 0xbd, 0xab, 0x68, 0xff, 0xbc, + 0xe2, 0xbc, 0xff, 0xbd, 0xa8, 0xf9, 0xf8, 0xbc, 0x0a, 0x00, 0x76, 0xbd, 0x00, 0xf9, 0x61, 0xbd, + 0xc8, 0x4e, 0x21, 0xbe, 0x13, 0x27, 0x35, 0xbd, 0x00, 0xeb, 0x6d, 0x39, 0x3c, 0xac, 0xb2, 0xbd, + 0x88, 0xa6, 0x8e, 0xbc, 0x1a, 0x49, 0xb5, 0xbd, 0x5c, 0x6a, 0x7c, 0xbd, 0x4c, 0x38, 0x1d, 0xbe, + 0x0e, 0xd7, 0xb8, 0xbd, 0x0e, 0xf8, 0x62, 0xbd, 0xa4, 0x2e, 0x6b, 0xbd, 0xb3, 0x47, 0x90, 0xbd, + 0x08, 0x9e, 0x42, 0xbd, 0x75, 0x8a, 0x02, 0xbe, 0xab, 0xcc, 0x84, 0xbd, 0x95, 0xe1, 0xc9, 0xbd, + 0x94, 0x32, 0x34, 0xbe, 0x94, 0x6e, 0xe3, 0xbc, 0xdc, 0x78, 0x58, 0xbd, 0x50, 0x27, 0xa1, 0xbd, + 0xae, 0x3f, 0x6a, 0xbd, 0xf2, 0x67, 0xcc, 0xbd, 0xf2, 0xa7, 0x92, 0xbd, 0xf8, 0x49, 0xec, 0xbd, + 0xfb, 0xe7, 0x14, 0x3c, 0x5e, 0xdb, 0xf5, 0xbc, 0x00, 0x9e, 0x5b, 0xbc, 0x74, 0xa2, 0xca, 0xbc, + 0x54, 0xb4, 0x0f, 0xbd, 0x14, 0xd5, 0x52, 0xbd, 0x93, 0xd5, 0xd8, 0xbd, 0xf0, 0x38, 0x90, 0xbc, + 0xd2, 0xa8, 0x96, 0x3c, 0xaa, 0xa3, 0xa2, 0xbd, 0x30, 0xe9, 0xda, 0xbc, 0x22, 0x34, 0xa2, 0xbc, + 0xb0, 0xf7, 0x6e, 0xbc, 0xbc, 0x92, 0x84, 0xbd, 0x0c, 0x6a, 0x7a, 0xbd, 0x12, 0x05, 0xb1, 0xbd, + 0x56, 0x51, 0x12, 0xbd, 0xe4, 0x2a, 0xf5, 0xbd, 0x40, 0x7e, 0x13, 0xbe, 0x28, 0xcc, 0xf8, 0xbc, + 0xb0, 0x0a, 0xb0, 0xbc, 0xde, 0x72, 0xb6, 0xbd, 0x2b, 0xe9, 0xa1, 0xbd, 0x46, 0x7e, 0xfd, 0xbb, + 0x74, 0x31, 0xb7, 0x3c, 0x14, 0x39, 0xd6, 0xbc, 0x5d, 0x0b, 0x32, 0xbd, 0x3d, 0xc2, 0xc5, 0xbd, + 0xd4, 0xb0, 0xa4, 0x3c, 
0x7e, 0xdd, 0x45, 0x3d, 0xd7, 0x27, 0x61, 0xbc, 0x2d, 0xf4, 0x87, 0x3d, + 0xc4, 0x69, 0xb1, 0x3c, 0xf4, 0x76, 0x60, 0x3d, 0x06, 0xd7, 0x37, 0x3d, 0xf6, 0xb5, 0x60, 0x3d, + 0x32, 0x55, 0x9c, 0x3d, 0x0d, 0xcf, 0xd6, 0x3c, 0xb0, 0xd0, 0xdb, 0x3c, 0x01, 0xc0, 0x31, 0x3d, + 0x10, 0xec, 0x66, 0x3d, 0xe0, 0x4f, 0x78, 0x3d, 0xa1, 0x43, 0x13, 0x3d, 0x83, 0x30, 0x8b, 0x3d, + 0x2e, 0x3e, 0x24, 0x3d, 0xe6, 0x9c, 0x3e, 0x3d, 0x0e, 0xcb, 0x84, 0x3d, 0x68, 0x4f, 0x35, 0x3d, + 0x1e, 0x15, 0xb1, 0x3d, 0xb2, 0xcc, 0x49, 0x3d, 0x83, 0x31, 0x36, 0x3d, 0xd2, 0x30, 0x19, 0x3d, + 0x90, 0x85, 0xfd, 0x3b, 0xae, 0xd6, 0xb0, 0x3d, 0x54, 0x24, 0x25, 0x3d, 0x7a, 0x72, 0x74, 0x3d, + 0x4a, 0x52, 0x5e, 0x3d, 0x42, 0x31, 0x21, 0x3d, 0x42, 0xfb, 0x58, 0x3d, 0x98, 0x36, 0x13, 0x3d, + 0x9f, 0xd2, 0x03, 0x3d, 0x07, 0xad, 0x1a, 0x3d, 0x2e, 0xcb, 0x18, 0x3d, 0x80, 0x97, 0x67, 0x3d, + 0x31, 0xd4, 0x50, 0x3c, 0xf4, 0x41, 0x04, 0x3d, 0x18, 0x73, 0x1d, 0x3d, 0x7e, 0xad, 0xc7, 0x3c, + 0xcb, 0x62, 0x8c, 0x3d, 0x84, 0xa7, 0x9f, 0x3c, 0x6b, 0x6f, 0x81, 0x3d, 0xe9, 0x85, 0x33, 0x3d, + 0x22, 0x1f, 0x28, 0x3c, 0xba, 0xa6, 0x32, 0x3d, 0xef, 0x6f, 0xde, 0x3c, 0xa0, 0x01, 0x61, 0x3d, + 0x65, 0x27, 0x2c, 0x3d, 0x99, 0x96, 0x0c, 0x3d, 0x6a, 0x91, 0x60, 0x3d, 0x62, 0x5a, 0x17, 0x3d, + 0xcc, 0x45, 0x80, 0x3d, 0xe7, 0xd7, 0x17, 0x3d, 0x2e, 0xe3, 0x86, 0x3d, 0xdf, 0x56, 0xb8, 0x3c, + 0x88, 0xcb, 0xc1, 0x3c, 0x26, 0x92, 0x5a, 0x3d, 0xfb, 0xec, 0x54, 0x3d, 0x74, 0x81, 0x5d, 0x3d, + 0x3f, 0xe9, 0xea, 0x3c, 0x5a, 0x7b, 0x57, 0x3d, 0x34, 0xca, 0xd3, 0x3b, 0x4a, 0x71, 0x1d, 0x3d, + 0x16, 0xfe, 0xad, 0x3b, 0x3c, 0xf1, 0xc0, 0x3c, 0x76, 0x9b, 0xbe, 0xbc, 0xc5, 0x7b, 0x51, 0xbc, + 0x5a, 0x46, 0x90, 0xbb, 0xb6, 0x30, 0x1f, 0x3d, 0x46, 0x6b, 0xed, 0x3c, 0x45, 0xd1, 0x90, 0x3c, + 0x6b, 0xfc, 0x34, 0x3d, 0x28, 0x90, 0x17, 0x3d, 0xb2, 0x26, 0x8d, 0x3c, 0x7c, 0x6f, 0xb2, 0x3b, + 0x0a, 0x1e, 0x3b, 0x3d, 0xff, 0x62, 0xc2, 0x3c, 0xc0, 0x9a, 0xb7, 0xbb, 0x30, 0x84, 0x12, 0xbc, + 0x8c, 0x8e, 0x5f, 0x3b, 0x46, 0x4d, 0x82, 0x3c, 0xbc, 0x8c, 
0xc8, 0x3c, 0xef, 0x42, 0xe9, 0x3c, + 0x54, 0x66, 0xc3, 0xbb, 0x82, 0xe2, 0x00, 0x3d, 0x7c, 0xde, 0x18, 0x3c, 0x58, 0x95, 0x30, 0x3c, + 0xf1, 0x42, 0x7e, 0xbc, 0x89, 0x75, 0x42, 0x3d, 0xea, 0x63, 0xb7, 0x3c, 0x3a, 0x99, 0xfa, 0x3c, + 0x83, 0xf6, 0x17, 0x3d, 0xbc, 0xa1, 0x8b, 0x3b, 0xd1, 0xab, 0x29, 0x3d, 0xd8, 0x1d, 0xcc, 0x3c, + 0xa4, 0x01, 0xe1, 0x3c, 0x5c, 0xa7, 0x3d, 0x3d, 0x1b, 0x75, 0xd6, 0x3c, 0x9b, 0x59, 0x84, 0x3c, + 0x1d, 0xfb, 0x63, 0xbc, 0x8c, 0xe6, 0x6b, 0x3c, 0x57, 0xeb, 0x07, 0xbc, 0x48, 0x3e, 0xa4, 0x3c, + 0x54, 0xbd, 0xd8, 0xbb, 0xac, 0xdb, 0x3e, 0x3b, 0xb9, 0x3c, 0xdb, 0x3c, 0x48, 0x23, 0x05, 0x3c, + 0xc0, 0x70, 0xcd, 0xba, 0xd6, 0xdd, 0x29, 0x3c, 0xfc, 0x60, 0x8b, 0x3b, 0x8c, 0x42, 0xb6, 0x3c, + 0xd5, 0x8a, 0x0a, 0xbc, 0xd6, 0x8f, 0x51, 0x3d, 0xd0, 0x6a, 0xe1, 0xba, 0x96, 0x81, 0xb3, 0xbb, + 0xc2, 0x13, 0xbf, 0xbc, 0xe9, 0x13, 0xaf, 0x3c, 0x70, 0x21, 0x77, 0x3d, 0x6d, 0xc6, 0xb2, 0x3b, + 0x9a, 0x45, 0x7f, 0x3b, 0x4e, 0x41, 0x86, 0x3d, 0x7c, 0xc1, 0x49, 0x3b, 0x40, 0x04, 0x5b, 0xbc, + 0x3a, 0xcb, 0x10, 0xbc, 0x51, 0xbe, 0x98, 0x3c, 0xa0, 0xaf, 0x2e, 0x3c, 0xa4, 0xb1, 0xa1, 0x3c, + 0xf4, 0x29, 0x27, 0x3a, 0xaa, 0x61, 0xb2, 0xbb, 0x6d, 0xe4, 0xf8, 0x3a, 0xb6, 0x75, 0xe1, 0xbb, + 0x23, 0x09, 0x11, 0xbb, 0x6c, 0x4f, 0xf8, 0xbb, 0x1a, 0x75, 0xd8, 0xbb, 0xec, 0x47, 0x04, 0xbc, + 0x8e, 0x99, 0x1b, 0xbc, 0x57, 0x50, 0x9c, 0xbb, 0xe0, 0x09, 0x3d, 0xbb, 0x62, 0xf6, 0xa1, 0xbb, + 0x54, 0xe4, 0xb3, 0xbb, 0xae, 0x45, 0x10, 0xbc, 0xde, 0xc8, 0xa7, 0xbb, 0x6a, 0x1c, 0xe0, 0xbb, + 0x0a, 0xa9, 0xb7, 0xbb, 0x36, 0x1c, 0x9e, 0xbb, 0x5b, 0xc0, 0x11, 0xbc, 0xea, 0x0b, 0x41, 0xbb, + 0xc6, 0x52, 0x1e, 0xbc, 0x3a, 0x30, 0xe9, 0xbb, 0x28, 0x34, 0xab, 0xbb, 0x5c, 0x13, 0x79, 0xbb, + 0xef, 0x2e, 0x9d, 0x3a, 0xf2, 0x79, 0x35, 0xbc, 0x9e, 0x05, 0xa4, 0xbb, 0x50, 0xc9, 0xeb, 0xbb, + 0x34, 0x5c, 0x87, 0xbb, 0x97, 0x16, 0xaa, 0xbb, 0x9b, 0x42, 0x07, 0xbc, 0xd8, 0x87, 0x83, 0xbb, + 0xe4, 0xb8, 0x8c, 0xbb, 0xbe, 0x74, 0x98, 0xbb, 0x01, 0x07, 0x94, 0xbb, 0x02, 0x5c, 0xb2, 0xbb, + 
0x51, 0x1c, 0xc7, 0xba, 0xae, 0x84, 0x95, 0xbb, 0xd8, 0xb4, 0xa0, 0xbb, 0x51, 0x46, 0x7b, 0xbb, + 0x4e, 0xe2, 0x0f, 0xbc, 0x51, 0x5e, 0xb1, 0xba, 0xd8, 0xd8, 0xee, 0xbb, 0x08, 0x83, 0xd1, 0xbb, + 0xae, 0xe9, 0x07, 0xbb, 0x92, 0xa0, 0x90, 0xbb, 0x80, 0x7c, 0x5e, 0xbb, 0x8e, 0x6a, 0x0e, 0xbc, + 0x9c, 0xcc, 0xb0, 0xbb, 0x32, 0xd3, 0xb5, 0xbb, 0x04, 0xe3, 0xc7, 0xbb, 0xc2, 0x79, 0x1c, 0xbb, + 0xf0, 0x9e, 0xe6, 0xbb, 0x72, 0xf1, 0x2c, 0xbb, 0xc5, 0xa0, 0x04, 0xbc, 0x1e, 0xbd, 0x2c, 0xbb, + 0x8a, 0x5f, 0x46, 0xbb, 0x0c, 0x31, 0x08, 0xbc, 0xcb, 0x33, 0xae, 0xbb, 0x94, 0x74, 0xeb, 0xbb, + 0x9f, 0xe4, 0x96, 0xbb, 0x2c, 0x48, 0x02, 0xbc, 0x34, 0x19, 0x61, 0xb8, 0xd2, 0x98, 0x67, 0xbb, + 0xbd, 0xbe, 0x0a, 0x3b, 0x55, 0x88, 0x42, 0xbb, 0xa4, 0xde, 0x3a, 0x3b, 0x18, 0xda, 0x87, 0x3a, + 0x96, 0xff, 0xcd, 0x39, 0x3a, 0x17, 0xc4, 0xbb, 0x64, 0x5a, 0xa4, 0xbb, 0x64, 0xb8, 0x8e, 0xbb, + 0x20, 0xdb, 0xcd, 0xbb, 0x0b, 0xa4, 0xbf, 0xbb, 0xf6, 0x36, 0xfc, 0xba, 0x82, 0x6c, 0x96, 0xba, + 0x1c, 0xd9, 0x90, 0xbb, 0x42, 0xa6, 0xa7, 0xbb, 0x14, 0xe7, 0x7c, 0xba, 0xec, 0x29, 0x5f, 0x3a, + 0x7f, 0xc6, 0xfd, 0xba, 0x69, 0xf6, 0xe6, 0xba, 0xeb, 0x40, 0x9f, 0xbb, 0x69, 0x1e, 0xb3, 0xba, + 0x10, 0x65, 0xdc, 0xb9, 0x46, 0xdb, 0xae, 0xbb, 0x62, 0x17, 0xe0, 0xba, 0x2b, 0x35, 0x95, 0xba, + 0x28, 0x00, 0x67, 0x3b, 0x09, 0x94, 0xeb, 0xbb, 0x73, 0x87, 0x52, 0xbb, 0xd2, 0x77, 0x8c, 0xbb, + 0x91, 0x25, 0x1e, 0xbb, 0xd1, 0x63, 0xdc, 0xba, 0x0f, 0xac, 0xe8, 0xbb, 0x38, 0xd0, 0x3e, 0xbb, + 0x81, 0x88, 0x7a, 0xbb, 0x34, 0x70, 0xb4, 0xbb, 0x9b, 0x26, 0x5f, 0xbb, 0x95, 0x1a, 0xb8, 0xba, + 0x55, 0x56, 0x96, 0x3a, 0xdc, 0xc8, 0x35, 0xbb, 0x30, 0x03, 0x72, 0xb9, 0x59, 0xed, 0x5e, 0xbb, + 0x0c, 0xe8, 0xa5, 0xba, 0xb2, 0xa0, 0x9f, 0x39, 0x21, 0xe4, 0x6e, 0xbb, 0xb6, 0x0a, 0x39, 0xbb, + 0x8f, 0x91, 0x69, 0xba, 0xd0, 0xb7, 0x8e, 0xba, 0x1a, 0xd3, 0x90, 0xba, 0x3a, 0xbb, 0xb1, 0xbb, + 0x18, 0xe2, 0xb5, 0xb9, 0xfc, 0x00, 0xed, 0xbb, 0x08, 0x8d, 0xe9, 0xb9, 0x5c, 0x2e, 0xf3, 0x3a, + 0x5c, 0x2f, 0xcc, 0x3a, 0xa8, 0x1c, 
0x8c, 0xba, 0x00, 0x24, 0xf7, 0xbb, 0xd0, 0x16, 0x74, 0xba, + 0x8b, 0xba, 0x89, 0xba, 0xd5, 0x2a, 0x1c, 0xbc, 0x4c, 0x31, 0xdf, 0xb9, 0x10, 0x29, 0xe7, 0xb9, + 0x34, 0xc2, 0x5f, 0xba, 0x97, 0x46, 0x95, 0xbb, 0xd6, 0x44, 0xf8, 0xb9, 0x8c, 0x16, 0xda, 0xba, + 0x8b, 0x77, 0x2d, 0xbb, 0xf2, 0x11, 0x45, 0xbb, 0x79, 0xa1, 0x68, 0x3a, 0xed, 0x7f, 0x7d, 0xbb, + 0x21, 0x18, 0xa7, 0xba, 0x72, 0x90, 0x3b, 0xbb, 0x44, 0x32, 0x0a, 0xbb, 0x34, 0x66, 0x1a, 0xbb, + 0xd8, 0xe5, 0x8f, 0xbb, 0x47, 0xa7, 0x86, 0xba, 0xe5, 0xdb, 0xea, 0xba, 0xd1, 0x72, 0x22, 0xbb, + 0x80, 0x98, 0x88, 0xbb, 0x1c, 0xdb, 0x31, 0xbb, 0xce, 0x66, 0xb9, 0xba, 0x9c, 0xd2, 0x86, 0xbb, + 0xee, 0x8a, 0xe5, 0xba, 0xef, 0x05, 0x46, 0xbb, 0x8e, 0x9b, 0x4e, 0xbb, 0x45, 0x29, 0x7d, 0xbb, + 0x91, 0xfb, 0x9d, 0xbb, 0xee, 0xd3, 0x1b, 0xbb, 0x49, 0x63, 0x24, 0xbb, 0xde, 0x08, 0x20, 0xbb, + 0x14, 0x40, 0xa7, 0xba, 0x6f, 0x47, 0x9c, 0xbb, 0x90, 0x92, 0x17, 0xbb, 0x47, 0x47, 0x65, 0xbb, + 0x03, 0x9b, 0x93, 0xbb, 0x0c, 0x19, 0xf7, 0xba, 0x06, 0xfb, 0x19, 0xbb, 0x68, 0xdd, 0x19, 0xbb, + 0x61, 0x1e, 0xee, 0xba, 0x2e, 0xe4, 0x23, 0xbb, 0x70, 0x72, 0x14, 0xbb, 0xa8, 0xbd, 0x7b, 0xbb, + 0x69, 0x2f, 0x03, 0xba, 0x52, 0xae, 0xc8, 0xba, 0x51, 0xd0, 0xe7, 0xba, 0x6d, 0xfe, 0x8c, 0xba, + 0xe7, 0x33, 0x54, 0xbb, 0x0f, 0x77, 0xcb, 0xba, 0x24, 0x0b, 0x79, 0xbb, 0xc0, 0x92, 0xf0, 0xba, + 0x18, 0x42, 0xcd, 0xb8, 0x3b, 0x70, 0x39, 0xbb, 0xa0, 0xde, 0xb8, 0xba, 0xac, 0x67, 0x0a, 0xbb, + 0x1d, 0x4d, 0xfc, 0xba, 0xe6, 0x4a, 0xe2, 0xba, 0x42, 0xed, 0x4a, 0xbb, 0xc7, 0xb5, 0x3d, 0xbb, + 0x8d, 0x0d, 0x54, 0xbb, 0xb2, 0x5a, 0x4c, 0xbb, 0xe6, 0xe1, 0x86, 0xbb, 0xc7, 0x02, 0xa8, 0xba, + 0xf6, 0xd6, 0x9c, 0xba, 0x19, 0xbf, 0x2e, 0xbb, 0xfe, 0x95, 0x53, 0xbb, 0xd4, 0x4d, 0x18, 0xbb, + 0x2c, 0x26, 0x55, 0xba, 0xec, 0x93, 0x0e, 0xbb, 0x86, 0x83, 0x5a, 0xba, 0x78, 0xc4, 0x37, 0xbb, + 0xd6, 0xfa, 0xf9, 0xba, 0xe3, 0xe8, 0xe7, 0xba, 0x2a, 0xbd, 0xb2, 0x3a, 0xd4, 0x89, 0xbd, 0x38, + 0x6c, 0x3d, 0xec, 0x37, 0x40, 0x58, 0x07, 0xbb, 0x1e, 0x2f, 0xac, 0xba, 
0xfb, 0x3a, 0x02, 0xba, + 0xb7, 0x73, 0x36, 0xbb, 0xb0, 0x4e, 0xcd, 0xba, 0x7f, 0xed, 0xab, 0xba, 0x3e, 0x5e, 0x18, 0xba, + 0xc7, 0x25, 0x6e, 0xbb, 0x0a, 0xd8, 0x63, 0xba, 0x50, 0xe2, 0x2d, 0x3a, 0x7b, 0x1c, 0x8d, 0xb9, + 0x84, 0x3e, 0xae, 0x38, 0xa5, 0x4e, 0xc3, 0xba, 0x86, 0xb7, 0x94, 0xba, 0xeb, 0x6a, 0x49, 0xbb, + 0x7c, 0x7e, 0x51, 0xb9, 0xf7, 0xfd, 0xc2, 0xba, 0xe7, 0xd3, 0x44, 0xba, 0xf4, 0x98, 0x91, 0xba, + 0x38, 0x76, 0xed, 0xb8, 0x01, 0x2f, 0x39, 0xbb, 0x55, 0xa1, 0xb9, 0xba, 0xc9, 0xf5, 0x05, 0xbb, + 0x9b, 0xec, 0x6e, 0xbb, 0x38, 0xee, 0x08, 0xb9, 0xbe, 0x43, 0xe8, 0xba, 0x56, 0x7b, 0xeb, 0xba, + 0x19, 0x35, 0xcf, 0xba, 0xa6, 0xdf, 0x3f, 0xbb, 0x78, 0xfd, 0xdf, 0xba, 0x99, 0xd0, 0xee, 0xba, + 0x3c, 0x43, 0x5a, 0x3a, 0xa0, 0xdb, 0x26, 0xba, 0x5a, 0xf2, 0x14, 0x3a, 0xeb, 0x4a, 0x61, 0xba, + 0xc9, 0x5a, 0x08, 0x3a, 0xa0, 0xa7, 0x3d, 0xba, 0x5c, 0xa4, 0x01, 0xbb, 0xd0, 0xec, 0x52, 0xb8, + 0x52, 0x62, 0x01, 0x3a, 0x3c, 0xfb, 0x98, 0xba, 0xb6, 0x2e, 0x8b, 0xb9, 0x54, 0x6e, 0xfa, 0xb9, + 0xb2, 0x1f, 0x1d, 0x3a, 0x3d, 0x53, 0x28, 0xbb, 0x7f, 0x18, 0x8d, 0xb9, 0x3b, 0xb5, 0x4a, 0xba, + 0x71, 0x3a, 0x8b, 0x3a, 0x22, 0xe9, 0x18, 0xbb, 0x43, 0xa6, 0x7b, 0xbb, 0x76, 0x2d, 0xe1, 0xb9, + 0x96, 0xa5, 0x5a, 0xb9, 0xab, 0xb2, 0x56, 0xbb, 0x22, 0x55, 0x4d, 0xba, 0x5f, 0x68, 0x89, 0x3a, + 0x9b, 0x45, 0x8b, 0x3a, 0x5a, 0x51, 0xfa, 0xb9, 0xf9, 0xc5, 0x88, 0xba, 0x65, 0xfb, 0xf4, 0xba, + 0xc7, 0x52, 0x77, 0x3d, 0x2d, 0x0b, 0x94, 0xbd, 0x09, 0x2a, 0x32, 0x3d, 0x6c, 0x67, 0x42, 0xbd, + 0x32, 0x4e, 0x6b, 0xbc, 0xad, 0x60, 0x08, 0xbe, 0xc6, 0x8d, 0xf3, 0xbd, 0x20, 0x34, 0x07, 0xbe, + 0x64, 0xcf, 0x15, 0xbe, 0x3a, 0x18, 0xe5, 0xbd, 0x42, 0x16, 0x20, 0xbd, 0x5f, 0x26, 0x60, 0xbd, + 0xce, 0xad, 0x93, 0xbd, 0x00, 0x0f, 0x15, 0xbe, 0x48, 0x51, 0x8c, 0xbd, 0x79, 0x97, 0x3a, 0xbd, + 0x57, 0xd4, 0xa4, 0xbd, 0xc0, 0xe5, 0x54, 0xbd, 0x6d, 0xb1, 0x0e, 0xbe, 0x14, 0xf3, 0x25, 0xbc, + 0x13, 0x1e, 0xba, 0xbd, 0x8a, 0x44, 0x01, 0xbe, 0xf4, 0x5e, 0x82, 0xbd, 0x74, 0x72, 0x1b, 0xbd, + 0x0f, 0x2b, 
0x85, 0x3d, 0x2c, 0xc9, 0x31, 0xbe, 0x67, 0x24, 0x9c, 0xbd, 0x66, 0x15, 0xd6, 0xbd, + 0xfe, 0xb2, 0xf2, 0xbc, 0x84, 0x1e, 0x90, 0xbd, 0xc4, 0x5f, 0x23, 0xbe, 0xc7, 0x8d, 0x6e, 0xbd, + 0x08, 0xbe, 0x9d, 0xbd, 0xfd, 0xf6, 0xb6, 0xbd, 0x82, 0xde, 0x91, 0xbd, 0xbe, 0x65, 0x45, 0xbd, + 0xd0, 0xa0, 0xd3, 0xbb, 0x32, 0xac, 0x99, 0xbd, 0x14, 0xe7, 0x5e, 0xbd, 0x02, 0x0c, 0x9a, 0xbd, + 0x7e, 0x89, 0xd5, 0xbd, 0x80, 0x37, 0x8c, 0x3a, 0x46, 0xb7, 0xc5, 0xbd, 0x56, 0x5a, 0xcb, 0xbd, + 0xff, 0x73, 0x1b, 0xbd, 0x97, 0x3c, 0x2c, 0xbd, 0x9b, 0xfc, 0x33, 0xbd, 0x1d, 0x8d, 0x1d, 0xbe, + 0xa0, 0xac, 0x79, 0xbd, 0x40, 0xac, 0x02, 0xbe, 0x9f, 0x03, 0x6f, 0xbd, 0xe2, 0xec, 0x7f, 0x3c, + 0x84, 0xf2, 0x57, 0xbd, 0x94, 0x19, 0x41, 0xbc, 0x22, 0x44, 0x0f, 0xbe, 0x76, 0x43, 0x05, 0xbd, + 0x6b, 0xe5, 0x25, 0xbd, 0x3f, 0xc2, 0x36, 0xbe, 0x15, 0xa2, 0x37, 0xbd, 0x8d, 0xc0, 0xab, 0xbd, + 0x68, 0x36, 0x89, 0xbd, 0xc7, 0x63, 0x09, 0xbe, 0x9c, 0xbd, 0xa1, 0x3b, 0x42, 0xa2, 0x0c, 0xbd, + 0x68, 0x9b, 0xbd, 0xbd, 0xbb, 0xc6, 0x78, 0xbd, 0xde, 0xab, 0xae, 0xbd, 0xb7, 0x11, 0x44, 0xbd, + 0x3a, 0x20, 0x79, 0xbd, 0xe6, 0xe6, 0x9a, 0xbd, 0x1a, 0xda, 0x9d, 0xbd, 0xc1, 0xfd, 0xc3, 0xbd, + 0x94, 0x26, 0x75, 0xbd, 0x7b, 0x8f, 0xbb, 0xbd, 0xa8, 0x49, 0xc7, 0xbd, 0x9a, 0x7a, 0x4a, 0xbd, + 0x01, 0x64, 0xe5, 0xbd, 0xfa, 0xab, 0x6b, 0xbd, 0x9e, 0x60, 0xdd, 0xbc, 0xf4, 0xc5, 0xb7, 0xbd, + 0xce, 0x7e, 0x8b, 0xbd, 0x14, 0xaf, 0xb9, 0xbd, 0x98, 0x46, 0x93, 0xbd, 0xca, 0x71, 0x82, 0xbd, + 0x74, 0x78, 0xe0, 0xbd, 0xa8, 0xbb, 0xa8, 0xbd, 0x16, 0x1d, 0x84, 0xbd, 0xce, 0xb1, 0x6a, 0xbd, + 0x67, 0xab, 0x36, 0xbd, 0x80, 0xa7, 0xa4, 0xbd, 0xea, 0xfa, 0x70, 0xbd, 0x84, 0x77, 0xcc, 0xbd, + 0xbc, 0xea, 0x8a, 0xbd, 0x4c, 0xb3, 0x06, 0xbd, 0x27, 0x54, 0x8d, 0xbd, 0xee, 0x96, 0xe4, 0xbd, + 0xb5, 0x41, 0xa6, 0xbd, 0x4a, 0xdc, 0xa3, 0xbd, 0xe2, 0xc6, 0x81, 0xbd, 0xa0, 0x6b, 0x91, 0xbd, + 0x28, 0x90, 0xa4, 0xbd, 0xe5, 0xb1, 0xce, 0xbd, 0xc9, 0xa1, 0xd4, 0xbd, 0x45, 0x6a, 0xcd, 0xbd, + 0x29, 0x22, 0x6c, 0xbd, 0x44, 0xc4, 0xbb, 0xbd, 
0xd4, 0x20, 0xb1, 0xbd, 0xf6, 0xa5, 0x7f, 0xbd, + 0x7f, 0x6a, 0x83, 0xbd, 0xdd, 0x66, 0x3c, 0xbd, 0x80, 0x8b, 0xd8, 0xbd, 0xce, 0x6c, 0xac, 0xbd, + 0xe6, 0x49, 0x46, 0xbd, 0xbc, 0x09, 0xf5, 0xbd, 0x3f, 0x15, 0x4b, 0xbd, 0x78, 0xa5, 0x13, 0xbd, + 0xe0, 0x35, 0xb0, 0xbd, 0x42, 0xef, 0x01, 0xbe, 0x81, 0xfe, 0xad, 0xbd, 0x9e, 0x79, 0x8a, 0xbd, + 0x89, 0x50, 0xa1, 0xbd, 0x80, 0xc9, 0xd2, 0xbd, 0xa3, 0x2a, 0x98, 0xbd, 0xa3, 0xa1, 0xa0, 0xbd, + 0xe9, 0x9c, 0x05, 0xbe, 0x83, 0x68, 0xd6, 0xbd, 0xe5, 0xd8, 0xf9, 0xbb, 0x78, 0x51, 0x3f, 0xbd, + 0xbe, 0xed, 0x5b, 0x3d, 0x20, 0xd7, 0x21, 0x3d, 0xac, 0x7a, 0x80, 0x3d, 0xea, 0x3d, 0x43, 0x3d, + 0x8a, 0x7c, 0x1b, 0x3d, 0xc4, 0xb4, 0x2a, 0x3d, 0x02, 0xf4, 0x61, 0x3d, 0x54, 0x73, 0x3b, 0x3d, + 0x45, 0xa9, 0x34, 0x3d, 0x5a, 0x2b, 0x37, 0x3d, 0xca, 0x12, 0x3e, 0x3d, 0x82, 0xcf, 0x59, 0x3d, + 0x08, 0x10, 0x67, 0x3d, 0x37, 0x52, 0x62, 0x3d, 0xb8, 0xf0, 0x4b, 0x3d, 0x8c, 0x3b, 0x8d, 0x3d, + 0x58, 0xd0, 0x79, 0x3d, 0x60, 0x74, 0x84, 0x3d, 0x32, 0x17, 0x6d, 0x3d, 0x93, 0x7b, 0x36, 0x3d, + 0xb2, 0x73, 0x63, 0x3d, 0x08, 0xc7, 0x4c, 0x3d, 0x50, 0xbd, 0x5c, 0x3d, 0x76, 0x79, 0x5a, 0x3d, + 0xf3, 0x17, 0x0d, 0x3d, 0xec, 0xcd, 0x6c, 0x3d, 0xe8, 0x8c, 0x53, 0x3d, 0xdc, 0x56, 0x60, 0x3d, + 0xbb, 0x8c, 0x38, 0x3d, 0xa9, 0x40, 0x07, 0x3d, 0x20, 0x6f, 0x42, 0x3d, 0x66, 0xe2, 0x6f, 0x3d, + 0x95, 0x33, 0x5e, 0x3d, 0xb6, 0x10, 0x54, 0x3d, 0x33, 0x0e, 0xf0, 0x3c, 0x66, 0x84, 0x2d, 0x3d, + 0x0d, 0x2b, 0x61, 0x3d, 0x86, 0x6e, 0x71, 0x3d, 0x18, 0x81, 0x42, 0x3d, 0x3f, 0x31, 0x58, 0x3d, + 0x17, 0x2f, 0x3a, 0x3d, 0x43, 0x25, 0x80, 0x3d, 0xc0, 0xcc, 0x47, 0x3d, 0x4d, 0xaa, 0x36, 0x3d, + 0x4c, 0x4e, 0x6b, 0x3d, 0x29, 0x2a, 0x48, 0x3d, 0xdb, 0x55, 0x6a, 0x3d, 0x3a, 0x57, 0x80, 0x3d, + 0x34, 0xaa, 0x22, 0x3d, 0x91, 0x9d, 0x4a, 0x3d, 0x18, 0xdb, 0x44, 0x3d, 0x9c, 0xe5, 0x42, 0x3d, + 0x24, 0x4a, 0x80, 0x3d, 0x4d, 0x96, 0x8f, 0x3d, 0x51, 0xd3, 0x5e, 0x3d, 0x3a, 0xcf, 0x39, 0x3d, + 0xe6, 0xc9, 0x5a, 0x3d, 0x7d, 0xce, 0x5d, 0x3d, 0x43, 0x17, 0x80, 0x3d, 0xeb, 0x55, 
0x56, 0x3d, + 0x36, 0xbc, 0x91, 0x3d, 0x8d, 0x4f, 0x41, 0x3d, 0x10, 0xaf, 0xe4, 0x3c, 0xd0, 0xee, 0x5b, 0x3d, + 0x54, 0x03, 0x86, 0x3c, 0x8e, 0x70, 0x73, 0x3c, 0x12, 0xe1, 0x37, 0x3c, 0x01, 0x5c, 0x01, 0x3c, + 0x08, 0x64, 0xa2, 0x3b, 0x6a, 0x94, 0xce, 0x3c, 0x59, 0xe4, 0x14, 0x3d, 0x76, 0x5b, 0x0d, 0x3d, + 0x58, 0xc5, 0x85, 0x3c, 0xf6, 0x5e, 0x8a, 0x3c, 0x72, 0x4e, 0xfd, 0x3c, 0x36, 0xcf, 0x16, 0x3c, + 0x24, 0xa2, 0xdb, 0x3c, 0x1d, 0x93, 0x3b, 0x3c, 0xc0, 0x0f, 0x03, 0xbb, 0x9e, 0xd3, 0xd0, 0x3c, + 0x48, 0x80, 0xac, 0x3c, 0x6e, 0xde, 0x0d, 0x3c, 0x63, 0x46, 0x97, 0x3c, 0xa7, 0x4e, 0x18, 0x3c, + 0x42, 0x0f, 0x00, 0x3d, 0x6c, 0xd5, 0xb6, 0x3c, 0xdc, 0xef, 0xc8, 0x3b, 0x11, 0x2b, 0x0b, 0x3c, + 0x38, 0xe9, 0xfc, 0x3b, 0x10, 0xed, 0xbc, 0x3c, 0x66, 0xa3, 0xb3, 0x3c, 0xa8, 0x99, 0x9b, 0x3c, + 0x22, 0xd4, 0x9c, 0x3c, 0x31, 0x95, 0xb5, 0xbb, 0x0a, 0xf1, 0xd0, 0x3c, 0xeb, 0xbc, 0xf9, 0x3c, + 0x65, 0x51, 0x38, 0x3c, 0x41, 0x8a, 0xdf, 0x3c, 0xd7, 0xff, 0x34, 0x3a, 0xfb, 0xd5, 0xa8, 0x3c, + 0x06, 0x6a, 0xb1, 0x3b, 0x6a, 0x55, 0xd6, 0x3c, 0x49, 0xe2, 0xa7, 0x3c, 0x16, 0xd1, 0xc3, 0x3c, + 0x1c, 0x55, 0x7f, 0x3b, 0x41, 0x7f, 0xbd, 0x3c, 0xcd, 0xcc, 0x4b, 0x3c, 0x3e, 0x1f, 0x93, 0x3c, + 0x95, 0x89, 0xb5, 0x3c, 0xee, 0xab, 0xf1, 0x3b, 0x16, 0x48, 0xa4, 0x3c, 0xd3, 0xe0, 0xe2, 0x3c, + 0x42, 0xf7, 0x94, 0x3c, 0x4a, 0xd5, 0xd6, 0x3c, 0x4e, 0x01, 0x20, 0x3c, 0xa6, 0x72, 0x95, 0xbb, + 0x00, 0x15, 0x02, 0x3d, 0x6d, 0xec, 0x56, 0x3c, 0x79, 0x14, 0x2f, 0x3c, 0xcd, 0xe9, 0x64, 0x3c, + 0x06, 0x76, 0x25, 0x3c, 0xc8, 0x43, 0x90, 0x3c, 0x78, 0xbc, 0x2a, 0x3c, 0x24, 0xb1, 0x8b, 0x3c, + 0x65, 0xaf, 0xc1, 0x3c, 0x3c, 0xae, 0x98, 0x3c, 0x10, 0x87, 0x6d, 0x3c, 0x65, 0x95, 0x41, 0x3c, + 0xdf, 0x97, 0xbd, 0xbb, 0x48, 0x36, 0x98, 0xbb, 0x76, 0x94, 0xe9, 0xbb, 0xe8, 0xf0, 0xc2, 0xbb, + 0x0e, 0x45, 0x80, 0xbb, 0x86, 0x8c, 0xa3, 0xbb, 0x4b, 0x83, 0xf6, 0xbb, 0x71, 0xf4, 0xb0, 0xbb, + 0x93, 0x2c, 0xb2, 0xbb, 0xf4, 0x71, 0x93, 0xbb, 0x6b, 0x76, 0xac, 0xbb, 0x92, 0x75, 0xde, 0xbb, + 0x32, 0xdc, 0xc6, 0xbb, 
0x3e, 0x7f, 0xe3, 0xbb, 0x51, 0xef, 0xd3, 0xbb, 0x08, 0x5b, 0x0d, 0xbc, + 0x8e, 0xf0, 0x01, 0xbc, 0xe6, 0x0b, 0xe9, 0xbb, 0x76, 0x37, 0xec, 0xbb, 0xbc, 0xa3, 0xa4, 0xbb, + 0xd3, 0x72, 0xcc, 0xbb, 0x0a, 0x4f, 0xc0, 0xbb, 0x42, 0x79, 0xcc, 0xbb, 0x3f, 0x80, 0xd5, 0xbb, + 0xf6, 0x3e, 0x85, 0xbb, 0x84, 0x22, 0xea, 0xbb, 0xdc, 0x86, 0xe1, 0xbb, 0x39, 0x47, 0xbf, 0xbb, + 0x82, 0xbc, 0xb2, 0xbb, 0x93, 0x9b, 0x6a, 0xbb, 0xe1, 0xbb, 0xc7, 0xbb, 0xf7, 0xe4, 0xd7, 0xbb, + 0x42, 0x67, 0xc4, 0xbb, 0x48, 0xcb, 0xd3, 0xbb, 0x7f, 0xe6, 0x17, 0xbb, 0x24, 0x6e, 0xa4, 0xbb, + 0x36, 0x4e, 0xbf, 0xbb, 0xf9, 0x58, 0xde, 0xbb, 0x05, 0x94, 0x99, 0xbb, 0x1a, 0x0a, 0xbd, 0xbb, + 0xcb, 0xd5, 0xa6, 0xbb, 0x78, 0x4e, 0xf5, 0xbb, 0xe0, 0xab, 0xa5, 0xbb, 0x27, 0x73, 0xb4, 0xbb, + 0x02, 0x96, 0xf8, 0xbb, 0x6c, 0x0a, 0xca, 0xbb, 0x72, 0x91, 0xc6, 0xbb, 0x7a, 0xb7, 0x02, 0xbc, + 0x30, 0x1d, 0xac, 0xbb, 0x14, 0x6b, 0x9b, 0xbb, 0x73, 0x12, 0xc6, 0xbb, 0x14, 0x08, 0xbb, 0xbb, + 0x78, 0xf9, 0x04, 0xbc, 0x28, 0x81, 0xe3, 0xbb, 0x07, 0x1a, 0xc0, 0xbb, 0x63, 0x02, 0xac, 0xbb, + 0xde, 0x0e, 0xc1, 0xbb, 0xe1, 0x99, 0xb6, 0xbb, 0x13, 0x56, 0xf3, 0xbb, 0x96, 0x78, 0xc7, 0xbb, + 0x24, 0xfb, 0xf5, 0xbb, 0x6f, 0x28, 0x94, 0xbb, 0x84, 0xa2, 0x98, 0xbb, 0x74, 0x42, 0xe8, 0xbb, + 0xd2, 0x82, 0x06, 0xbb, 0xfb, 0xda, 0x0e, 0xbb, 0x00, 0x94, 0x01, 0xbb, 0x27, 0x3d, 0x01, 0xbb, + 0x02, 0xf5, 0x20, 0xba, 0x5b, 0x3b, 0x5b, 0xbb, 0x2a, 0xdd, 0xb8, 0xbb, 0x8c, 0x14, 0x8c, 0xbb, + 0x97, 0x4e, 0x2e, 0xbb, 0x3e, 0x08, 0xe1, 0xba, 0xef, 0x73, 0x73, 0xbb, 0xd1, 0xbe, 0x1c, 0xbb, + 0x0e, 0xba, 0x4b, 0xbb, 0xfd, 0xe8, 0x27, 0xbb, 0x08, 0xeb, 0xa8, 0xba, 0x09, 0x45, 0x8c, 0xbb, + 0x58, 0x07, 0x81, 0xbb, 0xbf, 0xf8, 0xc5, 0xba, 0xb8, 0x1b, 0x56, 0xbb, 0x9e, 0x75, 0xc4, 0xba, + 0x8e, 0xde, 0x79, 0xbb, 0x91, 0x3d, 0x4b, 0xbb, 0x29, 0xe7, 0xbf, 0xba, 0x2e, 0x1c, 0x05, 0xbb, + 0x17, 0xa6, 0xb6, 0xba, 0x03, 0x86, 0x70, 0xbb, 0x4c, 0x24, 0x80, 0xbb, 0xfe, 0x17, 0x14, 0xbb, + 0xf5, 0xa7, 0x3b, 0xbb, 0x60, 0xe7, 0x90, 0x39, 0xca, 0x86, 
0x7f, 0xbb, 0xa0, 0xc3, 0x77, 0xbb, + 0xe6, 0x05, 0xde, 0xba, 0xd1, 0x8e, 0x83, 0xbb, 0xde, 0x79, 0x0e, 0x3a, 0x3a, 0x4d, 0x3a, 0xbb, + 0x96, 0x4c, 0x67, 0xba, 0x08, 0xdf, 0x65, 0xbb, 0xb6, 0x41, 0x02, 0xbb, 0x14, 0xd3, 0x3c, 0xbb, + 0x98, 0x3c, 0x75, 0xba, 0x1c, 0x25, 0x68, 0xbb, 0xb9, 0x5d, 0xba, 0xba, 0x38, 0x55, 0x3a, 0xbb, + 0xce, 0xf4, 0x84, 0xbb, 0xfb, 0x26, 0x04, 0xbb, 0xfc, 0x9f, 0x19, 0xbb, 0xea, 0xd6, 0x92, 0xbb, + 0x0f, 0x23, 0x4b, 0xbb, 0xae, 0x84, 0x1e, 0xbb, 0x43, 0x2d, 0x11, 0xbb, 0x94, 0x5a, 0x01, 0xba, + 0x50, 0xc0, 0xa4, 0xbb, 0xd7, 0xff, 0xa2, 0xba, 0xe0, 0x6d, 0xc3, 0xba, 0xf0, 0x49, 0x0a, 0xbb, + 0x31, 0x7a, 0xcc, 0xba, 0x65, 0x72, 0xfb, 0xba, 0x7e, 0x13, 0x11, 0xbb, 0x38, 0xc2, 0x27, 0xbb, + 0xba, 0x8e, 0x34, 0xbb, 0x8e, 0x53, 0xda, 0xba, 0x68, 0x55, 0x59, 0xbb, 0x5e, 0x0f, 0x3e, 0xbb, + 0xfe, 0xad, 0x5b, 0xbb, 0xc7, 0x45, 0x17, 0xbb, 0x9e, 0x23, 0x6e, 0xbb, 0x57, 0xf6, 0x22, 0xbb, + 0x6b, 0xaa, 0x1b, 0xbb, 0x4d, 0x61, 0x24, 0xbb, 0x30, 0xfb, 0x3d, 0xbb, 0x3e, 0xba, 0x3c, 0xbb, + 0xa6, 0x64, 0x20, 0xbb, 0xc5, 0x18, 0x44, 0xbb, 0x0c, 0x00, 0x43, 0xbb, 0xe7, 0x23, 0x31, 0xbb, + 0x14, 0x04, 0x6f, 0xbb, 0x80, 0x42, 0x3d, 0xbb, 0xf9, 0x84, 0x19, 0xbb, 0xdf, 0x9b, 0x76, 0xbb, + 0x25, 0x09, 0x4f, 0xbb, 0x2b, 0xe2, 0x7a, 0xbb, 0x88, 0xac, 0x4d, 0xbb, 0xd5, 0x71, 0x2b, 0xbb, + 0xac, 0x93, 0x66, 0xbb, 0xf6, 0x92, 0x42, 0xbb, 0x82, 0xe1, 0x45, 0xbb, 0x24, 0xb6, 0x3a, 0xbb, + 0x29, 0x17, 0xfe, 0xba, 0x72, 0x01, 0x53, 0xbb, 0x50, 0x3a, 0x2d, 0xbb, 0xb1, 0x17, 0x64, 0xbb, + 0xd1, 0x22, 0x29, 0xbb, 0xf2, 0x38, 0xf3, 0xba, 0x41, 0x24, 0x2b, 0xbb, 0x0a, 0xdf, 0x70, 0xbb, + 0x75, 0xc6, 0x54, 0xbb, 0x05, 0x47, 0x40, 0xbb, 0x55, 0xaa, 0x0c, 0xbb, 0xc6, 0xc8, 0x24, 0xbb, + 0xba, 0x6e, 0x5a, 0xbb, 0xb9, 0xa9, 0x69, 0xbb, 0x62, 0x51, 0x55, 0xbb, 0x40, 0xb0, 0x5b, 0xbb, + 0xd6, 0x7c, 0x2b, 0xbb, 0x99, 0xf7, 0x6a, 0xbb, 0x85, 0x28, 0x4c, 0xbb, 0x50, 0x0f, 0x23, 0xbb, + 0x49, 0x1e, 0x41, 0xbb, 0x29, 0x4c, 0x24, 0xbb, 0xe8, 0xa9, 0x6f, 0xbb, 0xf4, 0x47, 0x5f, 0xbb, + 
0x4d, 0x94, 0x07, 0xbb, 0xde, 0xc5, 0x66, 0xbb, 0xcc, 0x44, 0x24, 0xbb, 0xf5, 0x0b, 0x20, 0xbb, + 0xf8, 0xe8, 0x5d, 0xbb, 0xe0, 0x13, 0x96, 0xbb, 0xf8, 0xbc, 0x59, 0xbb, 0xb6, 0xe4, 0x2d, 0xbb, + 0xdc, 0x08, 0x51, 0xbb, 0xc9, 0x13, 0x67, 0xbb, 0xe7, 0x53, 0x62, 0xbb, 0x60, 0x4d, 0x48, 0xbb, + 0x60, 0x04, 0x95, 0xbb, 0xc8, 0x11, 0x57, 0xbb, 0x31, 0x1d, 0x82, 0xba, 0x79, 0x4c, 0x2d, 0xbb, + 0x11, 0xaf, 0xc2, 0xba, 0xfa, 0xf9, 0x8c, 0xba, 0x50, 0xb2, 0x8a, 0xba, 0x08, 0x90, 0x02, 0xba, + 0xf4, 0x0f, 0x3e, 0xba, 0xe7, 0xe4, 0xdc, 0xba, 0x10, 0x55, 0x00, 0xbb, 0x58, 0xda, 0x17, 0xbb, + 0xbe, 0xbe, 0x8a, 0xba, 0xc1, 0xd1, 0xd1, 0xba, 0x98, 0x43, 0x10, 0xbb, 0xf8, 0x36, 0x04, 0xba, + 0xe8, 0x04, 0x0e, 0xbb, 0x01, 0xdf, 0x36, 0xba, 0xac, 0x7c, 0x81, 0x39, 0xb0, 0x55, 0xd0, 0xba, + 0xc2, 0x5e, 0x98, 0xba, 0xea, 0xa8, 0x86, 0xba, 0xdb, 0x05, 0x99, 0xba, 0x05, 0xae, 0x5f, 0xba, + 0x20, 0x10, 0x17, 0xbb, 0x68, 0xc5, 0xcf, 0xba, 0x27, 0x88, 0x25, 0xba, 0xee, 0x0f, 0x1f, 0xba, + 0x92, 0xd8, 0x1d, 0xba, 0xdf, 0x43, 0xc2, 0xba, 0x80, 0xaf, 0x97, 0xba, 0xee, 0xb8, 0xdd, 0xba, + 0x92, 0x74, 0xa8, 0xba, 0xd0, 0xf8, 0x97, 0x38, 0x8a, 0x57, 0xc6, 0xba, 0xe4, 0xdb, 0x14, 0xbb, + 0x58, 0xc1, 0x8f, 0xba, 0x1c, 0x15, 0xe0, 0xba, 0x67, 0x9f, 0x3b, 0xba, 0x7f, 0x02, 0xbb, 0xba, + 0x5f, 0x28, 0x60, 0xba, 0x87, 0x80, 0xfc, 0xba, 0x6f, 0xbc, 0xf9, 0xba, 0x5f, 0x1f, 0xfa, 0xba, + 0x76, 0x3a, 0x0d, 0xba, 0x5e, 0x77, 0xd3, 0xba, 0x27, 0x28, 0xaa, 0xba, 0x8a, 0x8d, 0x97, 0xba, + 0x29, 0xfa, 0x9a, 0xba, 0xd2, 0xa9, 0xe2, 0xb9, 0xe8, 0xd0, 0xeb, 0xba, 0xd7, 0x5f, 0xd9, 0xba, + 0x4a, 0x11, 0x82, 0xba, 0x22, 0x9d, 0x1a, 0xbb, 0xe7, 0x23, 0x1b, 0xba, 0xba, 0x0c, 0x2d, 0x39, + 0xb2, 0x6c, 0xf1, 0xba, 0x1c, 0xcd, 0xe2, 0xba, 0xd2, 0xfc, 0x94, 0xba, 0x94, 0x0e, 0x8e, 0xba, + 0x16, 0x31, 0x86, 0xba, 0x02, 0xad, 0xde, 0xba, 0x4a, 0x1e, 0x5e, 0xba, 0xcc, 0x6b, 0xa9, 0xba, + 0xf9, 0x54, 0x0e, 0xbb, 0x78, 0xfc, 0xf2, 0xba, 0x44, 0xb6, 0xa8, 0xb9, 0xd2, 0x46, 0x10, 0xba, + 0xed, 0x31, 0x85, 0xbd, 0x9d, 0x4c, 
0x7c, 0xbd, 0x73, 0x1f, 0xa6, 0xbd, 0xac, 0xfe, 0x9d, 0xbd, + 0x1e, 0x64, 0x12, 0xbd, 0xf3, 0x57, 0x99, 0xbd, 0xa6, 0xec, 0x03, 0xbe, 0xe1, 0xea, 0xad, 0xbd, + 0x26, 0xea, 0x9b, 0xbd, 0x68, 0xad, 0x40, 0xbd, 0x9b, 0x07, 0x9d, 0xbd, 0x6c, 0x36, 0xba, 0xbd, + 0x3c, 0x19, 0x9a, 0xbd, 0xe1, 0x87, 0xbd, 0xbd, 0x8b, 0xdf, 0xa4, 0xbd, 0x10, 0x77, 0xfb, 0xbd, + 0xbe, 0x93, 0xee, 0xbd, 0x52, 0x3d, 0x98, 0xbd, 0x17, 0x8b, 0xcc, 0xbd, 0xb3, 0x5e, 0x6b, 0xbd, + 0xdd, 0x7a, 0xaf, 0xbd, 0x4e, 0x7c, 0xa4, 0xbd, 0x9f, 0x79, 0x90, 0xbd, 0xda, 0x93, 0xa7, 0xbd, + 0x13, 0xec, 0x4d, 0xbd, 0x81, 0x05, 0xd0, 0xbd, 0xfa, 0x8c, 0xda, 0xbd, 0xa2, 0x70, 0x87, 0xbd, + 0xe9, 0xea, 0x9c, 0xbd, 0x74, 0x18, 0xed, 0xbc, 0x73, 0xf0, 0xc2, 0xbd, 0x08, 0xc2, 0xb5, 0xbd, + 0x37, 0x8f, 0x87, 0xbd, 0x47, 0x74, 0xc8, 0xbd, 0x9e, 0xf9, 0x86, 0xbb, 0xaf, 0xc1, 0x90, 0xbd, + 0x5e, 0x66, 0x63, 0xbd, 0x7e, 0x7a, 0xb9, 0xbd, 0x76, 0x3d, 0x49, 0xbd, 0xbe, 0xf2, 0x93, 0xbd, + 0xc2, 0xaf, 0x5a, 0xbd, 0x27, 0x27, 0xcf, 0xbd, 0x3b, 0xee, 0x51, 0xbd, 0x2e, 0x6e, 0xa0, 0xbd, + 0xf9, 0x6a, 0xeb, 0xbd, 0x2d, 0xf8, 0xa4, 0xbd, 0xa2, 0x7d, 0x8b, 0xbd, 0xed, 0xf5, 0xf3, 0xbd, + 0x9c, 0xa5, 0xa7, 0xbd, 0x00, 0x7f, 0x4f, 0xbd, 0x3a, 0xec, 0xa4, 0xbd, 0xe9, 0xd6, 0x78, 0xbd, + 0x48, 0x64, 0x01, 0xbe, 0x4f, 0x4c, 0x77, 0xbd, 0xf9, 0xd9, 0x7b, 0xbd, 0x44, 0xf1, 0x86, 0xbd, + 0xcc, 0x5b, 0x83, 0xbd, 0xda, 0xc9, 0x6f, 0xbd, 0xb7, 0x0e, 0xb8, 0xbd, 0xa9, 0x12, 0x9f, 0xbd, + 0x4e, 0xa0, 0xa9, 0xbd, 0xfb, 0xe5, 0x31, 0xbd, 0xe7, 0xd2, 0xb3, 0xbd, 0xf2, 0x35, 0xce, 0xbd, + 0x7c, 0x8c, 0x9e, 0xbd, 0x2e, 0x03, 0x82, 0xbc, 0x97, 0x56, 0x25, 0xbc, 0x41, 0x8d, 0x22, 0xbd, + 0x76, 0x2d, 0x5b, 0xbc, 0x8c, 0xdf, 0x78, 0xbd, 0x85, 0xa0, 0xbc, 0xbc, 0x48, 0x02, 0x1e, 0xbd, + 0xda, 0x86, 0x3e, 0xbd, 0x33, 0x74, 0x12, 0xbd, 0xfa, 0xe8, 0x30, 0xbd, 0x0d, 0x8f, 0x89, 0xbc, + 0xc8, 0xa3, 0x85, 0xbd, 0x4d, 0xcb, 0x11, 0xbc, 0x82, 0x55, 0xca, 0x3c, 0x35, 0x7f, 0x26, 0xbd, + 0x1a, 0xfd, 0xbc, 0x3b, 0x53, 0x15, 0x49, 0xbd, 0x1e, 0x79, 0x06, 0xbd, 
0xe4, 0xb9, 0x46, 0xbd, + 0x06, 0xe6, 0x59, 0xbd, 0x2c, 0xe9, 0xef, 0xbb, 0x32, 0x84, 0xbb, 0xbb, 0x38, 0xfa, 0x23, 0xbd, + 0xea, 0xaf, 0x84, 0xbb, 0x41, 0x14, 0x2e, 0xbd, 0xba, 0xf3, 0x0d, 0xbd, 0x5c, 0x18, 0x25, 0xbd, + 0x87, 0xc5, 0xda, 0xbc, 0xd6, 0x17, 0x3c, 0xbb, 0x17, 0x77, 0x3b, 0xbd, 0xe8, 0x1a, 0x84, 0xbd, + 0xfc, 0xbf, 0x30, 0xbd, 0x66, 0x2c, 0x05, 0xbc, 0x80, 0x4c, 0xd8, 0xbc, 0x5d, 0xe8, 0x4a, 0xbd, + 0x89, 0x1b, 0x49, 0xbd, 0xa7, 0x8e, 0x6d, 0xbd, 0x14, 0x7c, 0xd0, 0xbc, 0xa6, 0xe5, 0xf6, 0xbc, + 0xc1, 0x2c, 0x29, 0xbd, 0x18, 0x21, 0xf3, 0xbc, 0x32, 0xfc, 0x13, 0xbd, 0x78, 0x5d, 0x98, 0x3a, + 0xbe, 0x40, 0xf8, 0xbc, 0x00, 0xce, 0xfa, 0xbb, 0x00, 0xac, 0x12, 0xb7, 0xd3, 0xc8, 0xa5, 0xbc, + 0xb9, 0x17, 0x6e, 0x3c, 0xaa, 0x19, 0x4b, 0xbc, 0x4c, 0x1a, 0xba, 0xbc, 0x76, 0x65, 0x90, 0xbc, + 0x78, 0x5c, 0x3d, 0xbc, 0xcc, 0x58, 0xa3, 0xbd, 0xd4, 0xe3, 0x30, 0xbd, 0x03, 0x09, 0x36, 0xbc, + 0x27, 0x4b, 0xd4, 0xbc, 0x44, 0x72, 0x82, 0xbd, 0x80, 0x74, 0x18, 0xbd, 0x8e, 0xdf, 0x32, 0xbd, + 0x48, 0x6d, 0x3f, 0xbd, 0x86, 0x3a, 0x32, 0xbd, 0x80, 0xcc, 0x4a, 0xbb, 0x87, 0x89, 0xf7, 0xbc, + 0xc4, 0x60, 0xd3, 0x3c, 0x41, 0x81, 0x12, 0x3c, 0x23, 0x83, 0x1d, 0x3c, 0x2b, 0x27, 0xb9, 0x3c, + 0x95, 0xb6, 0x8b, 0x3b, 0x92, 0x37, 0x82, 0x3c, 0x00, 0xaf, 0xe2, 0x3c, 0xaa, 0x9d, 0xb6, 0x3c, + 0xd7, 0xe0, 0x07, 0x3d, 0x34, 0xd5, 0xa1, 0x3c, 0x6c, 0xde, 0x68, 0x3c, 0xbe, 0xf6, 0xed, 0x3c, + 0xb2, 0x6b, 0x88, 0x3c, 0x06, 0x60, 0xdd, 0x3c, 0x19, 0x81, 0x67, 0x3c, 0xa1, 0x36, 0x16, 0x3d, + 0x8c, 0xb3, 0x73, 0x3c, 0xf6, 0x49, 0x0f, 0x3d, 0x94, 0x0b, 0x0a, 0x3d, 0x52, 0x72, 0xaf, 0x3c, + 0x44, 0x10, 0x0f, 0x3d, 0xe8, 0xce, 0x9c, 0x3c, 0xfb, 0xe0, 0x43, 0x3c, 0x2c, 0x11, 0xd0, 0x3c, + 0xc2, 0xd6, 0xcf, 0xbb, 0xbe, 0xe5, 0x24, 0x3d, 0xb3, 0x59, 0xa5, 0x3c, 0xd1, 0x78, 0xb7, 0x3c, + 0x13, 0xdb, 0x07, 0x3c, 0xf4, 0xf8, 0x03, 0x3c, 0xb8, 0x38, 0xdb, 0x3c, 0xe6, 0xf6, 0xf8, 0x3c, + 0x79, 0x8b, 0x90, 0x3c, 0xb9, 0xf2, 0x59, 0x3c, 0x7d, 0x67, 0x05, 0x3c, 0xde, 0x03, 0xd8, 0x3c, + 0x02, 0xeb, 
0xb5, 0x3c, 0xde, 0xe7, 0x82, 0x3c, 0x5e, 0x1d, 0xa3, 0x3c, 0x67, 0x07, 0x0b, 0x3c, + 0x28, 0x6a, 0xd8, 0x3c, 0x8f, 0x71, 0x80, 0x3c, 0x1a, 0xf7, 0xce, 0x3c, 0xee, 0x3a, 0x2e, 0x3c, + 0x44, 0xa8, 0xfe, 0x3c, 0x84, 0xcb, 0x8b, 0x3c, 0x70, 0x35, 0x3e, 0x3c, 0x80, 0xfa, 0xbe, 0x3c, + 0x26, 0xd3, 0x50, 0x3c, 0xd4, 0xc1, 0x1b, 0x3c, 0x85, 0x43, 0xa6, 0x3c, 0xb4, 0x65, 0xa2, 0x3c, + 0x30, 0xc4, 0x00, 0x3d, 0x4f, 0x69, 0x09, 0x3d, 0x74, 0xb9, 0xfb, 0x3c, 0xe9, 0x7b, 0x80, 0x3c, + 0x3b, 0x48, 0x8f, 0x3c, 0x3a, 0xbb, 0xb1, 0x3c, 0x02, 0x9a, 0x26, 0x3d, 0x7a, 0xd3, 0x2f, 0x3d, + 0xc0, 0x6d, 0xff, 0x3c, 0x12, 0xe8, 0xcd, 0x3c, 0x50, 0xd7, 0x46, 0x3a, 0xd9, 0x95, 0xa1, 0x3c, + 0xe6, 0xda, 0xb0, 0x3b, 0xfc, 0x6f, 0x36, 0x3b, 0x1a, 0xbe, 0xfe, 0xbb, 0x7c, 0x6f, 0x09, 0xbc, + 0xf6, 0x5d, 0x17, 0x3b, 0xcc, 0xaa, 0xd5, 0x3c, 0x82, 0xf2, 0x59, 0x3c, 0xa2, 0x5b, 0x8f, 0x3c, + 0x80, 0xfd, 0x51, 0x3c, 0x26, 0xc6, 0x80, 0x3c, 0x14, 0xbc, 0x37, 0x3c, 0x58, 0x7f, 0xe8, 0x3b, + 0xfc, 0x69, 0xa5, 0x3c, 0x84, 0xb1, 0x02, 0x3b, 0xc8, 0x97, 0x48, 0xbc, 0x69, 0xa4, 0x95, 0xbb, + 0x38, 0x29, 0x84, 0xba, 0x9c, 0x48, 0x94, 0xba, 0x47, 0x5d, 0xeb, 0x3b, 0xc2, 0xab, 0x2c, 0x3c, + 0x0c, 0xca, 0x94, 0x3c, 0xcc, 0xf6, 0x6b, 0x3c, 0x28, 0x6c, 0xb5, 0xbb, 0xb4, 0x10, 0x09, 0x3b, + 0xb8, 0x8a, 0x30, 0x3a, 0x90, 0xca, 0xf2, 0x3b, 0xae, 0xce, 0xc5, 0x3c, 0x1b, 0x52, 0x81, 0x3b, + 0x30, 0xf7, 0x75, 0x3c, 0x02, 0x93, 0x3e, 0xbb, 0x0a, 0x85, 0x73, 0x3c, 0xd0, 0x95, 0x97, 0x3c, + 0x44, 0x00, 0x26, 0x3c, 0x75, 0x5e, 0xc6, 0x3b, 0x3d, 0x2e, 0x07, 0xbc, 0xfa, 0xd4, 0x3f, 0x3c, + 0x9b, 0x19, 0x18, 0xbc, 0x5c, 0xfb, 0xbe, 0x3c, 0xbc, 0x25, 0xce, 0xbb, 0xbe, 0x8e, 0x79, 0x3c, + 0x40, 0xe2, 0xf8, 0x3a, 0xb0, 0x64, 0xdf, 0xbb, 0xce, 0xba, 0xc3, 0xba, 0x64, 0x59, 0x2d, 0xbc, + 0xa6, 0xb1, 0x58, 0x3c, 0x38, 0x0c, 0x4b, 0xba, 0x89, 0x41, 0xa7, 0xbb, 0x71, 0x49, 0xaa, 0x3c, + 0x6c, 0x3a, 0x62, 0x3b, 0x52, 0x5d, 0x53, 0x3c, 0x50, 0x6b, 0xb9, 0x3a, 0x88, 0x4e, 0x16, 0xbb, + 0x1c, 0xd2, 0xd1, 0xbb, 0xb4, 0xae, 0xed, 0x3b, 
0x65, 0x3e, 0x90, 0x3c, 0x84, 0x3d, 0xaa, 0x3b, + 0xb4, 0xf7, 0x44, 0x3a, 0x9d, 0x9b, 0x8a, 0x3c, 0xb3, 0xc5, 0x4e, 0x3c, 0x94, 0xdc, 0x33, 0x3b, + 0x64, 0x00, 0x89, 0x3b, 0x88, 0xf6, 0x5e, 0x3c, 0x7e, 0x92, 0x8d, 0x3c, 0xd4, 0x94, 0xf0, 0x3a, + 0xda, 0x70, 0xcc, 0xba, 0x5a, 0x6b, 0x79, 0xba, 0xb6, 0xf3, 0x3b, 0xba, 0x25, 0x01, 0xd8, 0xba, + 0xb0, 0x11, 0xa2, 0xb9, 0x6b, 0xfd, 0xd9, 0xba, 0x95, 0xa7, 0x84, 0xbb, 0xf8, 0x35, 0x40, 0xbb, + 0xee, 0x3e, 0x85, 0xbb, 0xa9, 0x99, 0x27, 0xbb, 0x7f, 0x3b, 0xa4, 0xba, 0xe0, 0x0f, 0x88, 0xbb, + 0x86, 0xa4, 0xae, 0xba, 0xdc, 0xbe, 0x7c, 0xbb, 0x5f, 0xbd, 0x18, 0xbb, 0xf2, 0x5f, 0x80, 0xbb, + 0x1a, 0x84, 0x1b, 0xbb, 0x1f, 0xf1, 0x68, 0xbb, 0x20, 0x90, 0x8d, 0xbb, 0x32, 0x15, 0x0b, 0xbb, + 0xed, 0x53, 0x8f, 0xbb, 0x5e, 0x5e, 0x5b, 0xbb, 0x96, 0xc4, 0xad, 0xba, 0x42, 0x14, 0x29, 0xbb, + 0x3e, 0xfb, 0x89, 0x3a, 0x6a, 0x98, 0xa3, 0xbb, 0xed, 0xd5, 0x49, 0xbb, 0x32, 0x0c, 0x11, 0xbb, + 0x26, 0xbc, 0x90, 0xba, 0x3e, 0x8a, 0x7d, 0xba, 0xd3, 0xc9, 0x53, 0xbb, 0xb4, 0x7a, 0x5a, 0xbb, + 0x54, 0xc6, 0xe0, 0xba, 0x6d, 0xd9, 0x04, 0xbb, 0x50, 0x8b, 0x95, 0x38, 0xfa, 0x5d, 0x3d, 0xbb, + 0x9c, 0x3f, 0xa1, 0xba, 0x9a, 0x90, 0xd5, 0xba, 0xd4, 0x0d, 0xef, 0xba, 0xb3, 0xfa, 0x86, 0xba, + 0x70, 0xb1, 0x2f, 0xbb, 0xc1, 0xea, 0x85, 0xba, 0x26, 0x8a, 0x24, 0xbb, 0xbd, 0xd0, 0x90, 0xba, + 0x06, 0x5d, 0x8d, 0xbb, 0xde, 0x7e, 0x14, 0xbb, 0xd6, 0xc2, 0xc2, 0xba, 0x6d, 0x14, 0x7d, 0xbb, + 0x1a, 0x5a, 0x28, 0xbb, 0x58, 0x4b, 0xd9, 0xba, 0x4c, 0x86, 0x1a, 0xbb, 0x10, 0xdb, 0x14, 0xbb, + 0x8f, 0x67, 0x83, 0xbb, 0xa0, 0x51, 0x33, 0xbb, 0x77, 0xb6, 0x84, 0xbb, 0xfa, 0xee, 0x12, 0xbb, + 0xff, 0x93, 0xeb, 0xba, 0xf8, 0x93, 0x01, 0xbb, 0xc8, 0xf1, 0xb3, 0xbb, 0xcd, 0xc9, 0xa7, 0xbb, + 0xca, 0x1e, 0x5a, 0xbb, 0x0b, 0xa0, 0x44, 0xbb, 0x49, 0x4d, 0x65, 0xba, 0x33, 0xe8, 0x05, 0xbb, + 0x72, 0x16, 0xfc, 0x39, 0x13, 0xf9, 0xaf, 0xb9, 0x4d, 0xee, 0x85, 0x3a, 0x51, 0x30, 0xbe, 0x3a, + 0x7e, 0x7c, 0xee, 0xb8, 0x4b, 0xc1, 0x2f, 0xbb, 0x2a, 0x24, 0x2b, 0xbb, 0xf2, 0xcd, 
0x20, 0xbb, + 0x1d, 0x15, 0x05, 0xbb, 0x38, 0x27, 0x0d, 0xbb, 0xa4, 0xd9, 0x79, 0xba, 0x9f, 0x40, 0x00, 0xbb, + 0x31, 0x08, 0xdd, 0xba, 0xff, 0x6d, 0xb1, 0xba, 0x10, 0xcb, 0xa0, 0x39, 0x38, 0x53, 0x58, 0x39, + 0xb4, 0xb5, 0x4d, 0xba, 0x00, 0x03, 0x6f, 0x38, 0x2b, 0xa7, 0xda, 0xba, 0x7c, 0x96, 0x87, 0xba, + 0x10, 0xc9, 0x30, 0xbb, 0xc3, 0x4e, 0x3c, 0xbb, 0x3e, 0x04, 0xde, 0x39, 0xe8, 0xa7, 0x82, 0xb9, + 0x36, 0x13, 0xb8, 0x39, 0x84, 0xd3, 0xdf, 0xba, 0x1e, 0xcd, 0x63, 0xbb, 0xec, 0x12, 0xc1, 0xb9, + 0xa4, 0xd2, 0xe8, 0xba, 0xf2, 0x61, 0x07, 0x39, 0xab, 0xd1, 0x05, 0xbb, 0x6e, 0x93, 0x0c, 0xbb, + 0x92, 0x9b, 0x7c, 0xba, 0x0f, 0xb0, 0xaa, 0xba, 0xe4, 0x36, 0xe0, 0x3a, 0x58, 0x93, 0xba, 0xba, + 0x18, 0x80, 0xfb, 0x3a, 0xe4, 0xd7, 0x1a, 0xbb, 0xe8, 0xc6, 0x50, 0x3a, 0xfa, 0x66, 0xdf, 0xba, + 0xd8, 0x16, 0x70, 0xb9, 0x38, 0xf3, 0xa0, 0x3a, 0xc8, 0x1a, 0x2d, 0x39, 0x85, 0x3f, 0x85, 0x3a, + 0xb2, 0xad, 0x25, 0xbb, 0xc6, 0xaa, 0xfc, 0xb9, 0x64, 0x77, 0x42, 0x39, 0xc8, 0x86, 0x6c, 0xbb, + 0xc6, 0xe3, 0xd6, 0xba, 0xde, 0xe3, 0x02, 0xbb, 0xc2, 0x46, 0xf6, 0xb9, 0xb0, 0x0f, 0x7a, 0xb8, + 0x78, 0x19, 0x6d, 0xb9, 0x98, 0xe8, 0xdf, 0xb8, 0xe1, 0x70, 0x33, 0xbb, 0x80, 0x63, 0x9c, 0xba, + 0x80, 0x66, 0x01, 0xb9, 0xf4, 0x8e, 0xc4, 0xba, 0x07, 0x09, 0x30, 0xbb, 0x67, 0x7b, 0x90, 0xba, + 0xdd, 0xb5, 0x24, 0xba, 0x9a, 0x2b, 0xf2, 0xba, 0xbb, 0x9c, 0x25, 0xbb, 0xb2, 0x57, 0x85, 0xb9, + 0x26, 0x8d, 0x0f, 0xbb, 0xfc, 0x1e, 0x14, 0xba, 0x62, 0x12, 0x24, 0xba, 0x97, 0x87, 0xd3, 0xba, + 0x00, 0x96, 0xb9, 0xb9, 0x12, 0x21, 0xa8, 0xba, 0x16, 0x84, 0xa4, 0xba, 0x5b, 0x2b, 0xa5, 0xba, + 0x50, 0xab, 0xf3, 0xba, 0x21, 0x74, 0x95, 0xba, 0xd7, 0xd6, 0x91, 0xba, 0xf5, 0xd5, 0xa7, 0xba, + 0x96, 0x51, 0xbf, 0xba, 0x37, 0xff, 0x94, 0xba, 0x28, 0x38, 0x8e, 0xb9, 0xad, 0x44, 0x0a, 0xbb, + 0x16, 0xa7, 0xf8, 0xb9, 0xb7, 0x85, 0x0c, 0xbb, 0xe3, 0x9a, 0xe1, 0xba, 0xf5, 0xfc, 0xc1, 0xba, + 0xc4, 0x10, 0x01, 0xbb, 0x16, 0xf7, 0x2c, 0xba, 0xf6, 0xab, 0x20, 0xba, 0x45, 0xde, 0xd0, 0xba, + 0xa0, 0x77, 0x3b, 0x39, 
0x2e, 0x4a, 0x0b, 0xbb, 0x46, 0x5b, 0x87, 0xba, 0xfa, 0xaa, 0xbf, 0xba, + 0x9f, 0xff, 0x14, 0xba, 0xe3, 0x80, 0xca, 0xb9, 0x94, 0x29, 0xcf, 0xba, 0xa3, 0xfc, 0x01, 0xbb, + 0xb7, 0x64, 0xa4, 0xba, 0xe5, 0x71, 0x12, 0xba, 0x25, 0x70, 0x5e, 0xba, 0xaf, 0xa4, 0xda, 0xba, + 0x41, 0x9e, 0xe7, 0xba, 0x55, 0x0b, 0xa7, 0xba, 0xaa, 0xc5, 0xa3, 0xba, 0x2c, 0x2f, 0x25, 0xba, + 0xdd, 0xf6, 0xd8, 0xba, 0xff, 0x11, 0x99, 0xba, 0x1a, 0x8d, 0xcd, 0xba, 0x0d, 0x21, 0x06, 0xba, + 0x64, 0x53, 0xc5, 0xba, 0x93, 0xbb, 0x4b, 0xba, 0x98, 0xe3, 0x02, 0xba, 0x55, 0x7a, 0x79, 0xba, + 0xe4, 0xf7, 0x49, 0xb9, 0xd2, 0xae, 0xd2, 0xb9, 0x63, 0x11, 0x93, 0xba, 0xa4, 0xb3, 0x8b, 0xba, + 0x43, 0x42, 0xbe, 0xba, 0x82, 0x9f, 0x23, 0xbb, 0xf8, 0x86, 0xd8, 0xba, 0x14, 0xb3, 0x39, 0xba, + 0x90, 0x59, 0x8d, 0xba, 0x5d, 0x0f, 0xda, 0xba, 0xec, 0x46, 0x02, 0xbb, 0x69, 0xf2, 0x16, 0xbb, + 0x61, 0x80, 0xf8, 0xba, 0x88, 0x4d, 0xc4, 0xba, 0x14, 0x9c, 0x0e, 0x39, 0x31, 0xf4, 0x9f, 0xba, + 0x12, 0x5f, 0x99, 0xba, 0x27, 0xcf, 0x86, 0xb9, 0x13, 0x57, 0x7f, 0x39, 0xb6, 0xed, 0x86, 0xb8, + 0xb8, 0x5c, 0x86, 0xb9, 0xa8, 0xe3, 0xea, 0xba, 0x2e, 0xb2, 0x0c, 0xba, 0x54, 0xc3, 0x85, 0xba, + 0x22, 0x85, 0x5c, 0xba, 0x60, 0x03, 0x76, 0xba, 0x02, 0x5f, 0x7c, 0xba, 0xac, 0xb6, 0x3f, 0xb9, + 0x6c, 0x83, 0xd6, 0xba, 0x60, 0x8a, 0xf0, 0x38, 0xb7, 0x48, 0x89, 0x3a, 0x58, 0x88, 0x48, 0xb8, + 0xa0, 0xb3, 0xa7, 0x39, 0x11, 0x79, 0xb1, 0xb9, 0xe7, 0x38, 0xcf, 0xb9, 0x02, 0x66, 0x75, 0xba, + 0xbc, 0x42, 0x94, 0xba, 0xbd, 0xaf, 0xdd, 0xb9, 0x5d, 0x33, 0x89, 0x39, 0x00, 0x7c, 0xe0, 0xb9, + 0x23, 0x62, 0x34, 0xb9, 0x29, 0x35, 0xfd, 0xb9, 0x76, 0x52, 0xa1, 0xba, 0x4a, 0x82, 0x0d, 0xba, + 0x1d, 0x16, 0x6d, 0xba, 0xad, 0xc3, 0x29, 0x39, 0x6c, 0x31, 0x81, 0xba, 0x02, 0x12, 0xb6, 0xba, + 0xe4, 0x50, 0x66, 0xba, 0x66, 0xbc, 0x4d, 0xb9, 0xa8, 0x3e, 0xf2, 0xb7, 0xc2, 0x20, 0x75, 0xba, + 0x38, 0xf3, 0x49, 0xb9, 0xed, 0x1a, 0xd7, 0xba, 0x4c, 0x5f, 0x7e, 0x38, 0x72, 0x9b, 0x7d, 0xba, + 0x6d, 0x1b, 0xe1, 0xb9, 0x5b, 0x60, 0x34, 0xb8, 0x3a, 0x8a, 
0x71, 0xb9, 0x35, 0xef, 0x0f, 0x3a, + 0x14, 0x8e, 0x20, 0xba, 0xba, 0xbb, 0xf0, 0x38, 0x57, 0x5c, 0xb0, 0x39, 0x0a, 0x5f, 0x58, 0xba, + 0xec, 0xa4, 0x8e, 0x39, 0xcd, 0xd3, 0x15, 0xba, 0x78, 0x9f, 0x3a, 0xb9, 0x92, 0xe5, 0x27, 0x38, + 0x0c, 0xed, 0xe6, 0x39, 0xaa, 0xec, 0x9a, 0xba, 0xec, 0x8a, 0x82, 0xba, 0x82, 0xe2, 0x40, 0xb9, + 0xc6, 0x2f, 0x7d, 0xb9, 0xdf, 0xc2, 0xba, 0xba, 0x9e, 0x66, 0x19, 0xba, 0xac, 0x3b, 0x9a, 0xb9, + 0x1d, 0x1e, 0x0f, 0xba, 0x96, 0x86, 0x71, 0xba, 0xcc, 0xeb, 0x34, 0xba, 0xd1, 0xdb, 0xaa, 0xb9, + 0x57, 0x74, 0xec, 0x3b, 0x74, 0x3b, 0x2c, 0xbc, 0xcd, 0x0a, 0xc3, 0x3b, 0x30, 0x7f, 0xfc, 0x3b, + 0x5c, 0x26, 0xb9, 0xba, 0x1d, 0xff, 0x09, 0xbd, 0x83, 0xa6, 0x8d, 0xbd, 0x2e, 0xb6, 0x50, 0xbd, + 0xa1, 0x51, 0x69, 0xbd, 0x92, 0x7a, 0x34, 0xbd, 0xb7, 0xb7, 0x65, 0xbc, 0xf0, 0x28, 0x84, 0xbd, + 0xa8, 0x91, 0x90, 0xbc, 0x2a, 0xfa, 0x66, 0xbd, 0xfc, 0x1b, 0xed, 0xbc, 0x77, 0x65, 0x01, 0xbd, + 0xbc, 0x69, 0x17, 0xbd, 0xf8, 0xc7, 0xdd, 0xbc, 0x2a, 0x53, 0x72, 0xbd, 0xab, 0x8e, 0xbf, 0xbc, + 0xf1, 0xb0, 0x87, 0xbd, 0x39, 0x93, 0x8b, 0xbd, 0x6e, 0xde, 0x0e, 0xbc, 0xe4, 0xbe, 0xb2, 0xbc, + 0x06, 0xab, 0x89, 0x3c, 0xd1, 0x9c, 0x84, 0xbd, 0xda, 0x4c, 0x85, 0xbd, 0x4c, 0x5d, 0x9b, 0xbc, + 0x04, 0xc7, 0xd2, 0xbc, 0x7e, 0xb7, 0x0e, 0xbc, 0x8e, 0x23, 0x44, 0xbd, 0x7c, 0x68, 0x3a, 0xbd, + 0x26, 0x7a, 0x9e, 0xbc, 0xff, 0x17, 0x11, 0xbd, 0x18, 0x4c, 0xda, 0x3c, 0xc9, 0x03, 0x14, 0xbd, + 0x1c, 0x03, 0xa4, 0x3c, 0xf2, 0x21, 0xf9, 0xbc, 0x18, 0x43, 0xaa, 0xbb, 0x44, 0x43, 0xbe, 0xbc, + 0x26, 0x95, 0xb7, 0xbc, 0x74, 0x94, 0x1f, 0x3c, 0x52, 0x2d, 0x8e, 0xbc, 0x20, 0x39, 0xa8, 0xba, + 0x6d, 0x20, 0x8e, 0xbd, 0x8d, 0x8f, 0xed, 0xbc, 0xd6, 0xa9, 0x6f, 0xbc, 0x03, 0xc4, 0xa1, 0xbd, + 0xea, 0xcc, 0x50, 0xbd, 0x1a, 0xbb, 0x1b, 0xbd, 0xb7, 0x10, 0xd6, 0xbc, 0x49, 0xb9, 0xb2, 0xbc, + 0x38, 0x5f, 0x37, 0xbd, 0xeb, 0x51, 0x45, 0xbc, 0x80, 0x49, 0x86, 0xbd, 0x72, 0xea, 0x12, 0xbd, + 0x8e, 0xe1, 0x77, 0xbc, 0x55, 0xfa, 0xbd, 0xbc, 0x9d, 0x5a, 0xa9, 0xbd, 0x47, 0x07, 0x72, 0xbd, + 
0x4c, 0x76, 0x06, 0xbd, 0x3a, 0xd5, 0x32, 0xbd, 0x97, 0x95, 0x19, 0xbd, 0x90, 0xdc, 0x95, 0xbc, + 0x25, 0x97, 0x0a, 0xc0, 0x6c, 0xcf, 0x30, 0xc0, 0x6d, 0x42, 0x8e, 0xbe, 0xd7, 0x8f, 0xb6, 0xbf, + 0x36, 0xe7, 0xa3, 0xbf, 0x17, 0x3b, 0xb3, 0xbf, 0xd2, 0x62, 0xea, 0xbf, 0x96, 0x22, 0xa8, 0xbf, + 0x4d, 0x71, 0x2b, 0xc0, 0x19, 0x12, 0xa7, 0xbf, 0x7a, 0xd8, 0xf2, 0xbf, 0xa8, 0xe4, 0xda, 0xbf, + 0xe0, 0xb4, 0x75, 0xc0, 0x65, 0x42, 0xea, 0xbf, 0xd1, 0x68, 0x9c, 0xbf, 0xe5, 0x54, 0x1f, 0xc0, + 0xc3, 0x92, 0xe8, 0xbf, 0x50, 0x27, 0x12, 0xc0, 0xa4, 0xed, 0xe4, 0xbf, 0x4f, 0x0d, 0x58, 0xc0, + 0x38, 0xc1, 0x22, 0xc0, 0xea, 0xf7, 0x25, 0xc0, 0x1b, 0x85, 0x17, 0xc0, 0xfc, 0xb4, 0xc2, 0xbf, + 0x52, 0x2a, 0xe9, 0xbf, 0x69, 0xd4, 0x4e, 0xc0, 0xa5, 0xab, 0xc9, 0xbf, 0xfe, 0x08, 0x3d, 0xc0, + 0xce, 0x94, 0x94, 0xc0, 0xed, 0x08, 0x96, 0xbf, 0xab, 0x99, 0x91, 0xbf, 0x4f, 0x32, 0xfb, 0xbf, + 0xb3, 0x05, 0xc5, 0xbf, 0xaa, 0xde, 0x65, 0xc0, 0xe6, 0x4b, 0x04, 0xc0, 0x18, 0xe7, 0x23, 0xc0, + 0x19, 0x99, 0x7a, 0x3e, 0x4f, 0x09, 0x4c, 0xbf, 0x79, 0xa9, 0xb6, 0xbf, 0xd4, 0x58, 0xbf, 0xbf, + 0x02, 0x01, 0x24, 0xbf, 0x18, 0x65, 0xff, 0xbf, 0x6e, 0xe4, 0x40, 0xc0, 0x71, 0x59, 0xca, 0xbf, + 0xc7, 0x2e, 0x48, 0x3e, 0x9b, 0x45, 0x20, 0xc0, 0xe5, 0xd5, 0x22, 0xc0, 0xd1, 0x17, 0xb1, 0xbf, + 0x91, 0x25, 0xd4, 0xbf, 0x84, 0x76, 0x55, 0xc0, 0x29, 0xcf, 0xd6, 0xbf, 0x8f, 0x72, 0x0c, 0xc0, + 0xf8, 0xcb, 0x03, 0xc0, 0x15, 0xea, 0x30, 0xc0, 0xbc, 0x31, 0x6c, 0xc0, 0x44, 0x6b, 0xce, 0xbf, + 0x62, 0xfb, 0x93, 0xbf, 0x5e, 0x1c, 0x05, 0xc0, 0x3f, 0x90, 0x07, 0xc0, 0x63, 0x20, 0x93, 0xbe, + 0x30, 0xe0, 0xd1, 0xbe, 0x1e, 0x74, 0x9a, 0xbf, 0xca, 0x37, 0x8b, 0xbf, 0x5c, 0x1c, 0x10, 0xc0, + 0x5b, 0xc5, 0x2c, 0x3f, 0xd6, 0x18, 0xcd, 0x3f, 0x9a, 0xcf, 0x02, 0x3f, 0xba, 0xf6, 0xe2, 0x3f, + 0x52, 0x9e, 0x89, 0x3f, 0xda, 0x54, 0xcb, 0x3f, 0x7d, 0xb8, 0x9e, 0x3f, 0x6f, 0x5f, 0xbc, 0x3f, + 0x39, 0xbc, 0xd0, 0x3f, 0x2e, 0x74, 0x57, 0x3f, 0xb2, 0xc4, 0x86, 0x3f, 0x11, 0x49, 0x90, 0x3f, + 0x96, 0x37, 0xef, 0x3f, 0x24, 0xc5, 
0xd1, 0x3f, 0xde, 0xe7, 0xaa, 0x3f, 0x8b, 0xd8, 0xe2, 0x3f, + 0xc0, 0x95, 0xce, 0x3f, 0x41, 0xa3, 0x9d, 0x3f, 0xc6, 0x22, 0xcb, 0x3f, 0x7a, 0x63, 0x9c, 0x3f, + 0xc0, 0xb8, 0x01, 0x40, 0x7e, 0x08, 0xc1, 0x3f, 0xe5, 0xe6, 0xd6, 0x3f, 0xe4, 0x40, 0x8c, 0x3f, + 0x52, 0xfa, 0x7e, 0x3f, 0x76, 0xde, 0xf3, 0x3f, 0x76, 0x58, 0xa5, 0x3f, 0x14, 0x86, 0xdf, 0x3f, + 0x88, 0xb5, 0xee, 0x3f, 0xf7, 0x0a, 0xa6, 0x3f, 0x5e, 0x58, 0xa9, 0x3f, 0x8e, 0xa4, 0x80, 0x3f, + 0x9a, 0x19, 0x9c, 0x3f, 0x47, 0x36, 0xb8, 0x3f, 0x1f, 0xd8, 0x96, 0x3f, 0x49, 0x0a, 0xaa, 0x3f, + 0xae, 0x13, 0x21, 0x3f, 0xf2, 0xa2, 0xad, 0x3f, 0x2b, 0x09, 0x97, 0x3f, 0x06, 0xe7, 0xa3, 0x3f, + 0xea, 0xdc, 0xd5, 0x3f, 0x82, 0xd3, 0x8f, 0x3f, 0x78, 0x86, 0xd1, 0x3f, 0x69, 0x8e, 0xc4, 0x3f, + 0x02, 0x0c, 0xb2, 0x3e, 0x5d, 0x0d, 0xb5, 0x3f, 0xbf, 0xa5, 0xaa, 0x3f, 0xee, 0x9d, 0xdf, 0x3f, + 0xcc, 0xab, 0xab, 0x3f, 0x26, 0xe8, 0xb6, 0x3f, 0xa6, 0x38, 0xc9, 0x3f, 0x45, 0x05, 0x93, 0x3f, + 0x2b, 0x04, 0xd8, 0x3f, 0x4a, 0x98, 0x92, 0x3f, 0xb3, 0xac, 0xd1, 0x3f, 0x1d, 0xd6, 0x62, 0x3f, + 0x09, 0x94, 0x80, 0x3f, 0x00, 0x62, 0xcd, 0x3f, 0x86, 0x9b, 0x93, 0x3f, 0x54, 0xb6, 0x73, 0x3f, + 0xc1, 0x4f, 0x84, 0x3f, 0x7b, 0xd4, 0xad, 0x3f, 0x14, 0x85, 0x22, 0x3f, 0x0d, 0x0d, 0xa6, 0x3f, + 0x69, 0x1c, 0x85, 0x3e, 0x76, 0xcb, 0x3f, 0x3f, 0xe0, 0x45, 0xfc, 0xbd, 0xc0, 0xfd, 0xb1, 0x3d, + 0x14, 0xa1, 0xaf, 0xbd, 0x6e, 0xac, 0x1a, 0x3f, 0xab, 0x67, 0x73, 0x3f, 0xf0, 0x3d, 0x05, 0x3f, + 0xce, 0x4a, 0x78, 0x3f, 0xe2, 0x73, 0x38, 0x3f, 0xfc, 0x38, 0x22, 0x3f, 0xb2, 0x0a, 0xab, 0x3d, + 0xe7, 0x24, 0x79, 0x3f, 0xd1, 0x3a, 0x37, 0x3f, 0x1e, 0x8c, 0x49, 0x3e, 0xfe, 0x2f, 0xc3, 0x3e, + 0xde, 0x77, 0xfb, 0x3e, 0xc4, 0x50, 0x12, 0x3f, 0x46, 0x57, 0x34, 0x3f, 0xce, 0xd7, 0x08, 0x3f, + 0xc8, 0x3d, 0x00, 0xbe, 0x21, 0x90, 0x39, 0x3f, 0x83, 0x1a, 0x00, 0x3f, 0x50, 0x98, 0xb2, 0x3e, + 0xbd, 0xe8, 0x81, 0xbe, 0x17, 0x87, 0xaa, 0x3f, 0x54, 0x94, 0x54, 0x3e, 0x1c, 0x2a, 0x75, 0x3f, + 0x38, 0xdd, 0x48, 0x3f, 0x98, 0xd4, 0xb7, 0x3d, 0x79, 0xf1, 0x80, 0x3f, 
0x32, 0x74, 0x17, 0x3f, + 0x00, 0xc8, 0x0e, 0x3f, 0x55, 0xdd, 0xb5, 0x3f, 0x01, 0x64, 0x63, 0x3f, 0x77, 0x9f, 0xd0, 0x3e, + 0x00, 0x0c, 0x2c, 0x3d, 0xd0, 0xd6, 0xef, 0x3d, 0xbe, 0x8f, 0xb7, 0x3e, 0xd3, 0xb2, 0xe8, 0x3e, + 0x1c, 0x13, 0x15, 0xbe, 0xc6, 0xd7, 0x30, 0x3f, 0x16, 0x37, 0x69, 0x3f, 0xb4, 0xf0, 0x55, 0x3f, + 0xac, 0x72, 0x11, 0xbd, 0x72, 0x44, 0xcf, 0x3e, 0x18, 0xbe, 0x1d, 0x3f, 0x0b, 0x57, 0xd1, 0x3e, + 0xc0, 0x85, 0xaa, 0x3c, 0xfd, 0x0d, 0xa8, 0x3f, 0x94, 0x90, 0xbc, 0x3d, 0xdf, 0x3c, 0x14, 0xbe, + 0x34, 0xd8, 0x50, 0x3e, 0x7c, 0x0c, 0x08, 0x3f, 0xb5, 0x5a, 0x8e, 0x3f, 0x31, 0x02, 0x68, 0x3e, + 0xed, 0x64, 0x81, 0x3e, 0x15, 0xfc, 0xb0, 0x3f, 0x3d, 0xec, 0xef, 0xbd, 0x68, 0x0e, 0xc5, 0xbd, + 0xd8, 0x52, 0xb4, 0x3d, 0x5e, 0xda, 0xbf, 0x3e, 0xc0, 0xf8, 0xd2, 0xba, 0x86, 0x9e, 0x21, 0x3f, + 0x2f, 0x3c, 0x2c, 0xbd, 0x33, 0x2d, 0x3e, 0xbe, 0x79, 0xc8, 0x7a, 0xbd, 0x86, 0x1f, 0x64, 0xbe, + 0x76, 0x16, 0xea, 0xbd, 0x37, 0x23, 0x63, 0xbe, 0x92, 0xa4, 0x30, 0xbe, 0x16, 0xdc, 0x4f, 0xbe, + 0x66, 0xb4, 0x50, 0xbe, 0xa6, 0x38, 0xf3, 0xbd, 0x5f, 0x68, 0x01, 0xbe, 0x14, 0x18, 0xee, 0xbd, + 0x8c, 0xd8, 0x4d, 0xbe, 0xa0, 0xe6, 0x61, 0xbe, 0x7e, 0x74, 0x2d, 0xbe, 0xc6, 0xc1, 0x4e, 0xbe, + 0x7b, 0x07, 0x53, 0xbe, 0xd4, 0x7a, 0x0c, 0xbe, 0x9c, 0xc8, 0x5a, 0xbe, 0x42, 0x9d, 0xc7, 0xbd, + 0x11, 0x1e, 0x5a, 0xbe, 0x64, 0x3c, 0x34, 0xbe, 0x40, 0xb1, 0x4a, 0xbe, 0xe7, 0x3d, 0x06, 0xbe, + 0xff, 0xa2, 0x9b, 0xbd, 0x70, 0x8b, 0x7a, 0xbe, 0xcc, 0x43, 0x1b, 0xbe, 0x53, 0x71, 0x58, 0xbe, + 0x18, 0x23, 0x28, 0xbe, 0x7e, 0xd6, 0x23, 0xbe, 0xfc, 0xf6, 0x57, 0xbe, 0x50, 0x5a, 0xeb, 0xbd, + 0x45, 0x56, 0x23, 0xbe, 0xfb, 0x33, 0x2b, 0xbe, 0xde, 0xee, 0x1b, 0xbe, 0xe8, 0x7e, 0x09, 0xbe, + 0xca, 0x4c, 0xd6, 0xbd, 0xaa, 0x27, 0x3b, 0xbe, 0xa7, 0xe3, 0x16, 0xbe, 0x4a, 0xed, 0x28, 0xbe, + 0x8c, 0x50, 0x63, 0xbe, 0xce, 0xda, 0x0b, 0xbe, 0x8d, 0x32, 0x43, 0xbe, 0x1e, 0xb2, 0x60, 0xbe, + 0x2d, 0x13, 0x69, 0xbd, 0xd2, 0x82, 0x18, 0xbe, 0xd6, 0x7a, 0x15, 0xbe, 0xab, 0x0c, 0x72, 0xbe, + 0xa4, 0x7f, 
0x16, 0xbe, 0xf8, 0xdc, 0x2c, 0xbe, 0xb0, 0xe5, 0x3c, 0xbe, 0x08, 0x37, 0xbc, 0xbd, + 0xc4, 0x5e, 0x47, 0xbe, 0xdc, 0x89, 0xd9, 0xbd, 0xbb, 0x94, 0x36, 0xbe, 0x7a, 0x50, 0xb9, 0xbd, + 0x01, 0x8b, 0xff, 0xbd, 0xc4, 0x42, 0x76, 0xbe, 0x30, 0xa9, 0xc5, 0xbd, 0x14, 0x7e, 0x03, 0xbe, + 0x4c, 0x4a, 0x15, 0xbe, 0xfc, 0x97, 0x3a, 0xbe, 0x06, 0x38, 0x71, 0xbd, 0xd8, 0xb9, 0x1a, 0xbe, + 0x70, 0x95, 0x12, 0xba, 0x3c, 0xa2, 0xcd, 0xbd, 0x08, 0x81, 0x0f, 0x3b, 0x86, 0xce, 0x5d, 0xbd, + 0xc0, 0x89, 0xea, 0x3a, 0x03, 0xaf, 0xfc, 0xbd, 0x3f, 0x07, 0x13, 0xbe, 0xa0, 0xea, 0xdc, 0xbd, + 0x8b, 0x08, 0x0d, 0xbe, 0x9d, 0x6b, 0xda, 0xbd, 0xd1, 0xf6, 0xac, 0xbd, 0x84, 0xbb, 0x42, 0xbc, + 0x7a, 0x42, 0xe4, 0xbd, 0x71, 0x60, 0x03, 0xbe, 0x73, 0x8a, 0x63, 0xbd, 0xad, 0xa2, 0x80, 0xbd, + 0x1c, 0x1c, 0xc0, 0xbd, 0xa8, 0xca, 0x91, 0xbd, 0x4e, 0x69, 0x00, 0xbe, 0xc6, 0x77, 0xeb, 0xbc, + 0x80, 0xba, 0x10, 0x3b, 0xb4, 0x11, 0xc8, 0xbd, 0x48, 0x06, 0xa4, 0xbd, 0x36, 0x08, 0x67, 0xbd, + 0x4f, 0x9e, 0x48, 0x3d, 0x24, 0xdf, 0x3f, 0xbe, 0x3f, 0xfc, 0x30, 0xbd, 0x81, 0xb0, 0x07, 0xbe, + 0xd3, 0x0f, 0x66, 0xbd, 0xcd, 0xc7, 0x20, 0xbd, 0xac, 0xa4, 0x37, 0xbe, 0xfb, 0xaf, 0x96, 0xbd, + 0x93, 0x23, 0xbf, 0xbd, 0x39, 0x53, 0x29, 0xbe, 0x8b, 0x6d, 0xfc, 0xbd, 0xae, 0xc0, 0x2c, 0xbd, + 0xe2, 0x14, 0x3c, 0xbd, 0x63, 0xf3, 0x70, 0xbd, 0xbb, 0x8b, 0x85, 0xbd, 0x79, 0xb0, 0xa8, 0xbd, + 0x66, 0x44, 0x25, 0xbd, 0xd1, 0x0f, 0xbf, 0xbd, 0x05, 0xba, 0xf1, 0xbd, 0xdf, 0x06, 0x19, 0xbe, + 0x68, 0x2f, 0x98, 0xbc, 0xaf, 0x7d, 0x44, 0xbd, 0x26, 0x1e, 0x98, 0xbd, 0x44, 0x0c, 0xd2, 0xbd, + 0x30, 0xb1, 0x74, 0xbc, 0x3e, 0xfb, 0x20, 0xbe, 0xc6, 0x64, 0x15, 0xbd, 0xbf, 0x54, 0x19, 0x3d, + 0x32, 0xc4, 0x3d, 0xbd, 0xc5, 0xa6, 0x37, 0xbd, 0x89, 0xb9, 0x00, 0xbe, 0xe7, 0xfc, 0xc8, 0xbc, + 0xa2, 0x26, 0x4b, 0xbd, 0xd5, 0x8a, 0x5f, 0xbe, 0x82, 0x03, 0xfa, 0x3c, 0x02, 0x3d, 0xc1, 0xbc, + 0x25, 0xd4, 0x51, 0xbd, 0xf2, 0xcc, 0xab, 0xbd, 0x04, 0xc7, 0x9b, 0x3b, 0xd3, 0x10, 0xad, 0xbd, + 0x66, 0x2e, 0x6e, 0xbd, 0x72, 0x2d, 0xc6, 0xbd, 
0xcc, 0x85, 0xcf, 0xbc, 0xf4, 0xa2, 0xb6, 0xbd, + 0x54, 0x79, 0x80, 0xbd, 0x8f, 0x53, 0x9a, 0xbd, 0x14, 0x65, 0x85, 0xbd, 0x17, 0x89, 0x90, 0xbd, + 0x14, 0xd9, 0xbf, 0xbd, 0x74, 0x7c, 0x35, 0xbd, 0xc7, 0x65, 0x81, 0xbd, 0x04, 0xce, 0x8e, 0xbd, + 0xcb, 0xa8, 0xf9, 0xbd, 0x1b, 0x02, 0xaa, 0xbd, 0xc5, 0x36, 0x8b, 0xbd, 0x34, 0x53, 0xd3, 0xbd, + 0xa9, 0x65, 0xad, 0xbd, 0x50, 0xe6, 0x9d, 0xbd, 0x1b, 0xe5, 0xa4, 0xbd, 0x67, 0x60, 0xc3, 0xbd, + 0xb3, 0xac, 0xf5, 0xbd, 0xb4, 0xc7, 0xb9, 0xbd, 0x03, 0xf1, 0xc4, 0xbd, 0xd4, 0x49, 0x7e, 0xbd, + 0x8d, 0x2a, 0x91, 0xbd, 0x69, 0x86, 0xde, 0xbd, 0xf0, 0xab, 0x93, 0xbd, 0xa3, 0x6a, 0xd2, 0xbd, + 0x18, 0xd0, 0x0c, 0xbe, 0x4e, 0x56, 0x89, 0xbd, 0x58, 0x4a, 0x65, 0xbd, 0xaa, 0x06, 0x81, 0xbd, + 0xd0, 0xeb, 0x83, 0xbd, 0x64, 0xc0, 0xc4, 0xbd, 0x5a, 0x53, 0x8a, 0xbd, 0xfb, 0x01, 0xb3, 0xbd, + 0xf8, 0x93, 0x9b, 0xbc, 0x87, 0x63, 0x80, 0xbd, 0x3e, 0x27, 0x82, 0xbd, 0x7f, 0xbb, 0x89, 0xbd, + 0x6e, 0xe9, 0x99, 0xbd, 0x7c, 0xb7, 0x88, 0xbd, 0x00, 0x45, 0xcd, 0xbd, 0xe6, 0x7f, 0x96, 0xbd, + 0xef, 0x8b, 0x27, 0xbc, 0x0d, 0xbf, 0xb7, 0xbd, 0xb7, 0x75, 0xad, 0xbd, 0x8e, 0xed, 0xaa, 0xbd, + 0xd0, 0x0a, 0x9f, 0xbd, 0xe7, 0xfa, 0xbd, 0xbd, 0x7e, 0xa8, 0xaf, 0xbd, 0xda, 0xd2, 0xa6, 0xbd, + 0x26, 0xae, 0xc2, 0xbd, 0xd5, 0xb4, 0xa8, 0xbd, 0xdf, 0x4d, 0xdf, 0xbd, 0x70, 0x97, 0x6a, 0xbd, + 0x3a, 0x47, 0x5c, 0xbd, 0x3f, 0xd3, 0x9e, 0xbd, 0x34, 0xdd, 0xa3, 0xbd, 0x47, 0xc5, 0x2a, 0xbd, + 0xca, 0x3f, 0x36, 0xbd, 0x9a, 0xe4, 0x87, 0xbd, 0x09, 0xd8, 0x2b, 0xbd, 0xe7, 0x53, 0xa0, 0xbd, + 0x7a, 0x35, 0x19, 0xbd, 0xba, 0xa2, 0x5d, 0xbd, 0xf9, 0xea, 0xd0, 0x3b, 0xf8, 0x70, 0x9f, 0xbb, + 0x4f, 0x90, 0x99, 0xbb, 0x66, 0x1f, 0xd6, 0xbc, 0x82, 0x8f, 0x4f, 0xbd, 0x45, 0x89, 0xbc, 0xbc, + 0x73, 0x5a, 0x78, 0xbd, 0x6a, 0xaf, 0x1c, 0xbd, 0xa3, 0xf1, 0x2c, 0xbd, 0xc8, 0xb6, 0x8f, 0xbc, + 0x7d, 0xf1, 0x9d, 0xbd, 0xd8, 0xf7, 0x16, 0xbd, 0x19, 0x27, 0x35, 0xbc, 0x8a, 0xc5, 0x09, 0xbd, + 0xef, 0xb0, 0xe9, 0xbc, 0xa0, 0xa1, 0x34, 0xbd, 0x9a, 0x0b, 0x15, 0xbd, 0x7e, 0x01, 
0x7a, 0xbd, + 0xbe, 0x5d, 0x4a, 0xbc, 0x54, 0x28, 0x53, 0xbd, 0xcc, 0x85, 0x18, 0xbd, 0x43, 0xa4, 0xca, 0xbc, + 0xcd, 0x8b, 0x09, 0xbc, 0x1d, 0xda, 0xa3, 0xbd, 0xd2, 0x9c, 0x92, 0xbc, 0xd1, 0xa9, 0x81, 0xbd, + 0x0d, 0x01, 0xab, 0xbd, 0x22, 0x1c, 0xdb, 0xbb, 0xb6, 0xa5, 0x24, 0xbd, 0xfe, 0x62, 0x2d, 0xbd, + 0xaa, 0x4e, 0x00, 0xbd, 0xa2, 0xdf, 0xc2, 0xbd, 0x83, 0x36, 0x59, 0xbd, 0x7e, 0x66, 0x29, 0xbd, + 0x71, 0xe1, 0x29, 0x3c, 0x70, 0x5a, 0x3c, 0xba, 0xd2, 0x25, 0xb8, 0xbc, 0xc2, 0x99, 0xd4, 0xbc, + 0x10, 0x58, 0x80, 0x3c, 0x2e, 0xc9, 0x38, 0xbd, 0x76, 0xef, 0x82, 0xbd, 0x4b, 0xa9, 0x1d, 0xbd, + 0x02, 0x6b, 0x12, 0x3c, 0x4e, 0xb7, 0x20, 0xbd, 0xe8, 0x13, 0x48, 0xbd, 0x16, 0x9c, 0x87, 0xbc, + 0xf6, 0xb1, 0x3e, 0xbc, 0x2d, 0x19, 0xb2, 0xbd, 0x00, 0xe0, 0x40, 0xbc, 0x96, 0x88, 0x89, 0xbc, + 0xba, 0x01, 0xab, 0xbc, 0x33, 0xb3, 0x53, 0xbd, 0xad, 0x72, 0xa9, 0xbd, 0x5c, 0x0c, 0xc7, 0xbc, + 0x12, 0x9f, 0x84, 0xbc, 0x50, 0x1b, 0x88, 0xbd, 0xad, 0x20, 0x87, 0xbc, 0x83, 0x9e, 0x2f, 0x3c, + 0xb0, 0xab, 0x36, 0x3b, 0x5d, 0xcc, 0x8c, 0xbc, 0x55, 0xfe, 0x25, 0xbc, 0xf2, 0x44, 0x38, 0xbd, + 0x05, 0x09, 0x7e, 0x3d, 0xf6, 0xce, 0x21, 0xc0, 0x3c, 0x3e, 0x18, 0xbf, 0x7d, 0x91, 0x30, 0xc0, + 0xb5, 0x14, 0x72, 0xbf, 0xe7, 0x22, 0x62, 0xc0, 0x74, 0x48, 0x46, 0xc0, 0xe9, 0xcc, 0x4a, 0xc0, + 0x87, 0x0c, 0x4b, 0xc0, 0x7c, 0x69, 0x0d, 0xc0, 0x31, 0xe2, 0xf2, 0xbf, 0x3f, 0xcd, 0x81, 0xbf, + 0x69, 0x8e, 0x23, 0xc0, 0xab, 0xed, 0x5d, 0xc0, 0x25, 0x68, 0x0e, 0xc0, 0x36, 0xc4, 0x16, 0xc0, + 0xcc, 0xd8, 0x3a, 0xc0, 0x90, 0xfb, 0xe2, 0xbf, 0x63, 0x81, 0x57, 0xc0, 0xf1, 0x6f, 0x13, 0xbf, + 0x92, 0x2b, 0xde, 0xbf, 0xb5, 0x92, 0x1b, 0xc0, 0x90, 0x4d, 0x22, 0xc0, 0xb1, 0x52, 0xdd, 0xbf, + 0x02, 0x1d, 0x1b, 0x3e, 0xeb, 0xae, 0x81, 0xc0, 0x1b, 0x25, 0xe6, 0xbf, 0x89, 0x55, 0x48, 0xc0, + 0x1b, 0xbe, 0xa8, 0xbf, 0xa5, 0x69, 0xf9, 0xbf, 0x1e, 0x63, 0x83, 0xc0, 0xa9, 0xa2, 0xd0, 0xbf, + 0x74, 0x6a, 0x1d, 0xc0, 0x66, 0xf7, 0x35, 0xc0, 0x7d, 0x4f, 0x25, 0xc0, 0xa7, 0x22, 0xac, 0xbf, + 0x1e, 0xa2, 0xe0, 0xbf, 
0x17, 0x32, 0x21, 0xc0, 0x02, 0x36, 0x02, 0xc0, 0xc8, 0x29, 0x1a, 0xc0, + 0x98, 0xa1, 0x32, 0xc0, 0xc5, 0x94, 0x05, 0xc0, 0xd5, 0x01, 0x2e, 0xc0, 0xe3, 0x4a, 0x70, 0xc0, + 0x13, 0xe5, 0x65, 0xbf, 0x59, 0x69, 0xcb, 0xbf, 0xfd, 0xbe, 0xeb, 0xbf, 0x3e, 0xe6, 0x5d, 0xc0, + 0xad, 0x1b, 0xb4, 0xbf, 0x64, 0x5e, 0x35, 0xc0, 0x51, 0x3a, 0x04, 0xc0, 0xed, 0x59, 0x83, 0xbd, + 0x20, 0x97, 0x0c, 0xc0, 0x85, 0x10, 0x81, 0xbf, 0x99, 0xd6, 0x1f, 0xc0, 0x74, 0x0c, 0x64, 0xbf, + 0x78, 0x1a, 0xd6, 0xbf, 0xbc, 0x86, 0x94, 0xc0, 0xbb, 0x01, 0x5a, 0xbe, 0xfe, 0xae, 0xd1, 0xbf, + 0x86, 0x4b, 0x07, 0xc0, 0x66, 0x4a, 0x2c, 0xc0, 0x01, 0x9d, 0xac, 0xbe, 0x7f, 0x9b, 0x05, 0xc0}; +unsigned char conv2d_winograd_fp32_bias[] = { + 0x94, 0xcb, 0xde, 0x3f, 0x6f, 0x1d, 0xf0, 0x3f, 0x61, 0xfb, 0x8f, 0x40, 0x24, 0xce, 0xdb, 0x3f, + 0x55, 0x18, 0xf2, 0x40, 0x38, 0xa5, 0x64, 0x41, 0x87, 0x80, 0x94, 0xc0, 0xee, 0x19, 0x40, 0x40, + 0x28, 0x08, 0x8a, 0x40, 0x99, 0x24, 0x8c, 0xc0, 0x05, 0x80, 0x41, 0x40, 0xd4, 0x8a, 0xb3, 0x41, + 0x24, 0xe3, 0x2e, 0x41, 0x3c, 0xe6, 0xf7, 0x40, 0xa3, 0x0f, 0xdf, 0xc0, 0x6c, 0xd6, 0xdf, 0x40}; +unsigned char conv2d_winograd_fp32_out[] = { + 0xd3, 0xab, 0x56, 0x42, 0xf0, 0xb2, 0xa1, 0x42, 0xc4, 0x6b, 0xac, 0x42, 0x9c, 0x19, 0xbd, 0x42, + 0x3b, 0xac, 0xcf, 0x42, 0xc7, 0x8f, 0xc6, 0x42, 0x62, 0x76, 0xe7, 0x42, 0xed, 0x1f, 0xc5, 0x42, + 0xf6, 0x91, 0xcf, 0x42, 0xfa, 0x2c, 0x9b, 0x42, 0x5e, 0x2a, 0xcd, 0x42, 0xad, 0x6c, 0xb6, 0x42, + 0xf2, 0xd6, 0xd9, 0x42, 0xc9, 0x6c, 0x41, 0x42, 0x77, 0xc0, 0xa9, 0x42, 0x5c, 0xd0, 0xf6, 0x42, + 0x86, 0x25, 0xb6, 0x42, 0x18, 0x6e, 0xcf, 0x42, 0xf2, 0x6b, 0x19, 0x43, 0xe8, 0x8d, 0xf1, 0x42, + 0x95, 0xa8, 0x3e, 0x43, 0x1d, 0xd9, 0x16, 0x43, 0xce, 0x47, 0x3f, 0x43, 0x8c, 0x4f, 0xf0, 0x42, + 0x1e, 0x75, 0x27, 0x43, 0xa5, 0xbf, 0x0f, 0x43, 0x64, 0xbe, 0x21, 0x43, 0x72, 0xd6, 0xb4, 0x42, + 0x26, 0xf0, 0xb9, 0x42, 0x5e, 0x17, 0x02, 0x43, 0x7b, 0x2b, 0xeb, 0x42, 0xdd, 0x00, 0x0c, 0x43, + 0x0d, 0x07, 0x2c, 0x43, 0xef, 0xf1, 0x1f, 0x43, 0xc8, 0xe6, 0x3e, 
0x43, 0x27, 0x94, 0x41, 0x43, + 0x1d, 0x29, 0x42, 0x43, 0xd7, 0xa9, 0x1d, 0x43, 0x9b, 0x9b, 0x32, 0x43, 0x5b, 0x4f, 0x26, 0x43, + 0xf1, 0xb6, 0x21, 0x43, 0x4e, 0xc5, 0xc5, 0x42, 0xb5, 0x89, 0xcd, 0x42, 0xca, 0xb4, 0xf2, 0x42, + 0x27, 0xbb, 0xe3, 0x42, 0xcb, 0xa9, 0x02, 0x43, 0xe8, 0xb7, 0x00, 0x43, 0x69, 0xbd, 0x18, 0x43, + 0x97, 0x31, 0x3c, 0x43, 0x8e, 0xb8, 0x41, 0x43, 0x9a, 0x24, 0x42, 0x43, 0x80, 0x71, 0x1a, 0x43, + 0xe9, 0x22, 0x2d, 0x43, 0xcf, 0x2f, 0x1c, 0x43, 0x64, 0x93, 0x1b, 0x43, 0xe6, 0x73, 0xad, 0x42, + 0x22, 0x21, 0xb0, 0x42, 0x3e, 0xfd, 0xf8, 0x42, 0x78, 0xa9, 0xf0, 0x42, 0xfd, 0x66, 0x14, 0x43, + 0x4a, 0xcd, 0x18, 0x43, 0x6f, 0x6b, 0x21, 0x43, 0x46, 0x57, 0x3c, 0x43, 0x61, 0x26, 0x42, 0x43, + 0xf7, 0x97, 0x37, 0x43, 0xe7, 0xf9, 0x1f, 0x43, 0x59, 0x44, 0x27, 0x43, 0xe3, 0xe2, 0x12, 0x43, + 0x1e, 0x8f, 0xee, 0x42, 0x04, 0xca, 0xa9, 0x42, 0xbe, 0x76, 0xd4, 0x42, 0x61, 0x6f, 0x22, 0x43, + 0x95, 0x55, 0x0b, 0x43, 0xdd, 0xef, 0x12, 0x43, 0xf5, 0x95, 0x1d, 0x43, 0x21, 0xab, 0x24, 0x43, + 0xbe, 0x0f, 0x47, 0x43, 0x07, 0xf5, 0x51, 0x43, 0xe2, 0x6c, 0x3c, 0x43, 0x45, 0xa5, 0x1b, 0x43, + 0x14, 0x27, 0x1f, 0x43, 0x9b, 0x6a, 0x10, 0x43, 0x63, 0x9f, 0x0e, 0x43, 0x6a, 0x11, 0x96, 0x42, + 0xd4, 0x1b, 0xe6, 0x42, 0x4f, 0xa2, 0x1c, 0x43, 0x9e, 0x1e, 0x04, 0x43, 0x83, 0x21, 0x12, 0x43, + 0x3a, 0x68, 0x14, 0x43, 0xc8, 0x9a, 0x2d, 0x43, 0x78, 0x8a, 0x41, 0x43, 0xd4, 0xaf, 0x33, 0x43, + 0xfd, 0xfc, 0x1c, 0x43, 0x12, 0x47, 0x04, 0x43, 0x79, 0x1b, 0x04, 0x43, 0x60, 0x5d, 0x0d, 0x43, + 0xf9, 0xd9, 0x26, 0x43, 0x0c, 0xad, 0xb2, 0x42, 0x99, 0x79, 0xcd, 0x42, 0x89, 0x7c, 0x16, 0x43, + 0x12, 0x19, 0x02, 0x43, 0x87, 0x31, 0x09, 0x43, 0xd2, 0x5e, 0x18, 0x43, 0xb1, 0x9d, 0x22, 0x43, + 0xa3, 0x85, 0x29, 0x43, 0x16, 0xef, 0x23, 0x43, 0xbb, 0xe4, 0x02, 0x43, 0x6f, 0x04, 0xe1, 0x42, + 0x7e, 0xe6, 0xeb, 0x42, 0x8e, 0x77, 0x0d, 0x43, 0xd9, 0x88, 0x19, 0x43, 0xc1, 0xb4, 0xcc, 0x42, + 0xa1, 0xe3, 0xc3, 0x42, 0x4f, 0x4c, 0x1b, 0x43, 0x83, 0x64, 0x12, 0x43, 0x39, 0x24, 0x23, 0x43, + 0x86, 
0xb3, 0x17, 0x43, 0xcd, 0x1f, 0x28, 0x43, 0x6b, 0xe6, 0x29, 0x43, 0xe9, 0xc4, 0x26, 0x43, + 0xf2, 0x3a, 0x0a, 0x43, 0xd5, 0xe0, 0x01, 0x43, 0xde, 0x28, 0x0d, 0x43, 0x59, 0xeb, 0x01, 0x43, + 0xa3, 0x0c, 0x22, 0x43, 0x6c, 0x75, 0xb1, 0x42, 0x52, 0x6a, 0xba, 0x42, 0x1a, 0xbb, 0x25, 0x43, + 0xed, 0x1c, 0x1c, 0x43, 0x89, 0xa2, 0x2e, 0x43, 0x71, 0xc3, 0x14, 0x43, 0x5b, 0x24, 0x2c, 0x43, + 0x4d, 0x07, 0x29, 0x43, 0xe6, 0x9b, 0x35, 0x43, 0x79, 0x11, 0x24, 0x43, 0xe7, 0xdd, 0x13, 0x43, + 0x77, 0x57, 0x15, 0x43, 0xd5, 0xe5, 0x19, 0x43, 0xc3, 0x05, 0x3e, 0x43, 0xa9, 0xb0, 0xea, 0x42, + 0xcd, 0x58, 0xae, 0x42, 0xae, 0xa7, 0x26, 0x43, 0xf3, 0xf5, 0x29, 0x43, 0x40, 0x73, 0x1c, 0x43, + 0xe3, 0xf0, 0xfe, 0x42, 0x60, 0xb4, 0x25, 0x43, 0xc7, 0xf9, 0x15, 0x43, 0xb8, 0x11, 0x30, 0x43, + 0xa7, 0x2f, 0x2d, 0x43, 0x05, 0x68, 0x1c, 0x43, 0xe9, 0xfc, 0x2a, 0x43, 0x2f, 0x5f, 0x34, 0x43, + 0xcf, 0xcb, 0x45, 0x43, 0xf2, 0x4d, 0xec, 0x42, 0x43, 0x6f, 0xb8, 0x42, 0x66, 0x50, 0x0c, 0x43, + 0xb5, 0x48, 0x0a, 0x43, 0x58, 0x80, 0x0a, 0x43, 0x6f, 0xb9, 0x03, 0x43, 0xee, 0x18, 0x12, 0x43, + 0x69, 0x67, 0x14, 0x43, 0xc9, 0x6e, 0x2a, 0x43, 0x93, 0xa2, 0x1d, 0x43, 0x37, 0xcf, 0x40, 0x43, + 0x2a, 0x44, 0x38, 0x43, 0x3b, 0x79, 0x3e, 0x43, 0x9f, 0xbb, 0x1d, 0x43, 0x2a, 0xd4, 0xb3, 0x42, + 0xe2, 0x4d, 0xa8, 0x42, 0xd6, 0x40, 0xe4, 0x42, 0x33, 0xf8, 0xf5, 0x42, 0xfc, 0xe7, 0xef, 0x42, + 0x71, 0xab, 0x04, 0x43, 0x9f, 0x94, 0x00, 0x43, 0xfb, 0x6e, 0x02, 0x43, 0x10, 0x52, 0x31, 0x43, + 0x2c, 0x32, 0x2e, 0x43, 0xad, 0xb6, 0x49, 0x43, 0x77, 0xc1, 0x26, 0x43, 0xc3, 0xa6, 0x27, 0x43, + 0xe9, 0x8b, 0x08, 0x43, 0x60, 0xcc, 0xa6, 0x42, 0x3d, 0x16, 0x50, 0x42, 0x82, 0x11, 0x9b, 0x42, + 0xaf, 0xef, 0x9c, 0x42, 0x2a, 0x4e, 0xb4, 0x42, 0xd9, 0xce, 0xad, 0x42, 0x78, 0x21, 0xa5, 0x42, + 0x8c, 0x99, 0xc2, 0x42, 0xe0, 0xf9, 0xf1, 0x42, 0x46, 0x8c, 0xeb, 0x42, 0xdd, 0x72, 0x0f, 0x43, + 0x90, 0x5d, 0xba, 0x42, 0x19, 0x3a, 0xb8, 0x42, 0x1e, 0x50, 0x81, 0x42, 0xfd, 0xef, 0x6c, 0x42, + 0xeb, 0xa1, 0x40, 0x42, 0x1b, 0x04, 0x97, 
0x42, 0x48, 0x55, 0x78, 0x42, 0x48, 0x02, 0xa2, 0x42, + 0x50, 0xe0, 0xc7, 0x42, 0xd2, 0xd3, 0xb7, 0x42, 0x7c, 0x93, 0xc5, 0x42, 0xd1, 0x6c, 0xcf, 0x42, + 0x2a, 0x2e, 0xba, 0x42, 0x32, 0x9f, 0x9c, 0x42, 0xe9, 0xe6, 0xb8, 0x42, 0xf3, 0x43, 0xaa, 0x42, + 0x82, 0xb9, 0xb4, 0x42, 0x09, 0x54, 0x42, 0x42, 0x0a, 0x0e, 0xb8, 0x42, 0xbb, 0x96, 0xd5, 0x42, + 0xdc, 0xda, 0xca, 0x42, 0x71, 0x6f, 0xdf, 0x42, 0x0c, 0x81, 0xfd, 0x42, 0xd3, 0x7f, 0xf6, 0x42, + 0xa8, 0x50, 0x20, 0x43, 0xff, 0x1f, 0x26, 0x43, 0xd1, 0x51, 0x1c, 0x43, 0xef, 0xae, 0xef, 0x42, + 0x85, 0x76, 0x07, 0x43, 0x91, 0x3e, 0x16, 0x43, 0x25, 0x58, 0x0c, 0x43, 0x57, 0x0a, 0x9b, 0x42, + 0x50, 0xe7, 0xc5, 0x42, 0x6a, 0x76, 0xea, 0x42, 0x5a, 0x31, 0xcd, 0x42, 0x1e, 0xdb, 0xed, 0x42, + 0xe5, 0x92, 0x07, 0x43, 0x45, 0x45, 0x19, 0x43, 0x07, 0x27, 0x24, 0x43, 0xfd, 0xb5, 0x26, 0x43, + 0x15, 0x32, 0x21, 0x43, 0xdb, 0x0b, 0x11, 0x43, 0x74, 0x6e, 0x1a, 0x43, 0xc3, 0x08, 0x1b, 0x43, + 0xab, 0x72, 0x1c, 0x43, 0x11, 0x1b, 0xbe, 0x42, 0x08, 0x69, 0xd9, 0x42, 0xf6, 0x0e, 0xf6, 0x42, + 0x8a, 0x0c, 0xc2, 0x42, 0x89, 0x99, 0x01, 0x43, 0xd2, 0xb7, 0xf0, 0x42, 0x5c, 0xba, 0x07, 0x43, + 0xfb, 0xac, 0x28, 0x43, 0x3d, 0xfc, 0x31, 0x43, 0xc2, 0x51, 0x2e, 0x43, 0xb7, 0x06, 0x23, 0x43, + 0x01, 0xdd, 0x14, 0x43, 0x22, 0x6a, 0x18, 0x43, 0xa1, 0x21, 0x07, 0x43, 0x06, 0x45, 0x9f, 0x42, + 0xf1, 0x8d, 0xbc, 0x42, 0x4a, 0x57, 0xe2, 0x42, 0x8d, 0x38, 0xea, 0x42, 0xbb, 0x86, 0x11, 0x43, + 0x16, 0xdf, 0x0a, 0x43, 0xaf, 0x1c, 0x1c, 0x43, 0x79, 0x0b, 0x2d, 0x43, 0x92, 0x90, 0x37, 0x43, + 0x0f, 0x4a, 0x27, 0x43, 0x90, 0x82, 0x15, 0x43, 0x90, 0x8c, 0x07, 0x43, 0xb4, 0x2e, 0x0c, 0x43, + 0xbe, 0xde, 0xfb, 0x42, 0xf8, 0x42, 0x98, 0x42, 0x3a, 0x9e, 0xd5, 0x42, 0x63, 0x07, 0x06, 0x43, + 0x67, 0x8e, 0x02, 0x43, 0x7a, 0x3c, 0xff, 0x42, 0x77, 0x1b, 0xf4, 0x42, 0xdd, 0x00, 0x20, 0x43, + 0x3c, 0x94, 0x4b, 0x43, 0xd7, 0x51, 0x3f, 0x43, 0x27, 0xe9, 0x38, 0x43, 0x71, 0xfb, 0x06, 0x43, + 0xd3, 0x7e, 0xfe, 0x42, 0x26, 0xcb, 0xf5, 0x42, 0x21, 0x06, 0x0a, 0x43, 0x92, 
0xe1, 0x9f, 0x42, + 0xe4, 0x92, 0xda, 0x42, 0x3b, 0x6b, 0x11, 0x43, 0x56, 0x8f, 0xff, 0x42, 0xff, 0x32, 0xf9, 0x42, + 0x08, 0x31, 0x10, 0x43, 0xdf, 0xe4, 0x1a, 0x43, 0x16, 0x29, 0x31, 0x43, 0x91, 0x73, 0x0e, 0x43, + 0x7f, 0x5d, 0x11, 0x43, 0x88, 0xf6, 0xee, 0x42, 0x2a, 0x71, 0x02, 0x43, 0x74, 0x04, 0xfe, 0x42, + 0x15, 0xe0, 0x0c, 0x43, 0x04, 0xb5, 0xc5, 0x42, 0x98, 0x8b, 0xd3, 0x42, 0xfd, 0xa6, 0x04, 0x43, + 0xbe, 0xdf, 0xdf, 0x42, 0xc1, 0xaf, 0x0b, 0x43, 0x98, 0xf1, 0x0a, 0x43, 0xbb, 0x4e, 0x13, 0x43, + 0x3f, 0x60, 0x2f, 0x43, 0x43, 0x2c, 0x19, 0x43, 0xb5, 0xa3, 0x05, 0x43, 0xaf, 0xc0, 0xe4, 0x42, + 0x78, 0x4b, 0xdc, 0x42, 0x02, 0x9b, 0xfb, 0x42, 0xf0, 0xe5, 0x0c, 0x43, 0x04, 0x1b, 0xc4, 0x42, + 0x8f, 0x2d, 0xd0, 0x42, 0xe2, 0x72, 0x0f, 0x43, 0xd7, 0x3c, 0x03, 0x43, 0x16, 0x85, 0x07, 0x43, + 0x24, 0x00, 0x19, 0x43, 0xa6, 0x01, 0x15, 0x43, 0xa7, 0x10, 0x1b, 0x43, 0x6b, 0x13, 0x0e, 0x43, + 0xcf, 0x1d, 0x03, 0x43, 0x85, 0x41, 0xe5, 0x42, 0x94, 0x53, 0xf0, 0x42, 0x3f, 0x5e, 0x05, 0x43, + 0xb7, 0xff, 0x0f, 0x43, 0xb2, 0x43, 0xbd, 0x42, 0xaa, 0x50, 0xd3, 0x42, 0x54, 0x9b, 0x14, 0x43, + 0x58, 0xc1, 0x1c, 0x43, 0x9d, 0xe0, 0x19, 0x43, 0xa4, 0x79, 0x12, 0x43, 0x3f, 0x71, 0x17, 0x43, + 0xf5, 0x90, 0x0b, 0x43, 0xb5, 0x3c, 0x24, 0x43, 0xa5, 0xbe, 0x18, 0x43, 0x34, 0xb1, 0xfa, 0x42, + 0x95, 0xd5, 0x06, 0x43, 0xc1, 0x17, 0x1a, 0x43, 0xbf, 0xf2, 0x20, 0x43, 0x09, 0xb8, 0xd1, 0x42, + 0x7c, 0xb9, 0xd1, 0x42, 0x15, 0x7c, 0x0d, 0x43, 0x38, 0x95, 0x1c, 0x43, 0x0e, 0xa1, 0x11, 0x43, + 0x31, 0x34, 0x09, 0x43, 0xd5, 0x82, 0x0b, 0x43, 0xca, 0xf4, 0x0e, 0x43, 0x5c, 0xa3, 0x1a, 0x43, + 0xbc, 0x2d, 0x11, 0x43, 0x49, 0x76, 0x10, 0x43, 0x70, 0xdf, 0x1f, 0x43, 0xce, 0x47, 0x1b, 0x43, + 0xf7, 0x49, 0x29, 0x43, 0xbc, 0x7f, 0xd8, 0x42, 0x8e, 0xc5, 0xbc, 0x42, 0xe8, 0x4e, 0xf7, 0x42, + 0x92, 0xa7, 0xf0, 0x42, 0x24, 0xc6, 0x05, 0x43, 0x85, 0x5c, 0xfa, 0x42, 0x75, 0x7d, 0xf8, 0x42, + 0x95, 0x28, 0x0d, 0x43, 0x74, 0x25, 0x1f, 0x43, 0x3d, 0x31, 0x1a, 0x43, 0xbe, 0xe4, 0x24, 0x43, + 0xa6, 0x3a, 0x2b, 
0x43, 0x3d, 0x67, 0x2a, 0x43, 0xbf, 0x5c, 0x10, 0x43, 0x56, 0x2b, 0xad, 0x42, + 0xdf, 0x90, 0xb1, 0x42, 0x35, 0x38, 0xdf, 0x42, 0x94, 0xa3, 0xd9, 0x42, 0x43, 0xf1, 0xee, 0x42, + 0x32, 0xbe, 0xe6, 0x42, 0xb5, 0xe3, 0xe2, 0x42, 0x8a, 0x26, 0xf9, 0x42, 0xae, 0xf9, 0x10, 0x43, + 0x04, 0x96, 0x1c, 0x43, 0xb4, 0xf5, 0x34, 0x43, 0x4d, 0x9f, 0x1c, 0x43, 0xe8, 0xcb, 0x0b, 0x43, + 0x7a, 0xe9, 0x05, 0x43, 0x73, 0xf3, 0xa3, 0x42, 0x55, 0x3f, 0x61, 0x42, 0x89, 0xee, 0x83, 0x42, + 0x91, 0x9f, 0x82, 0x42, 0xf6, 0xbf, 0x92, 0x42, 0x3f, 0x8f, 0xa0, 0x42, 0x9c, 0x06, 0xab, 0x42, + 0x02, 0x90, 0xae, 0x42, 0xec, 0x3c, 0xc3, 0x42, 0xb6, 0xaa, 0xd7, 0x42, 0xe7, 0xfc, 0xf4, 0x42, + 0x1f, 0xb0, 0xcd, 0x42, 0x3e, 0xfa, 0xb4, 0x42, 0x2f, 0x68, 0x62, 0x42, 0x45, 0x9f, 0x33, 0x42, + 0xdd, 0xd2, 0x4a, 0x42, 0x06, 0xbd, 0x77, 0x42, 0x8a, 0xdd, 0x72, 0x42, 0x75, 0x3a, 0x93, 0x42, + 0x4c, 0x5e, 0xb1, 0x42, 0x46, 0x09, 0xa2, 0x42, 0x22, 0x31, 0xcc, 0x42, 0x6e, 0xae, 0x9b, 0x42, + 0xde, 0x88, 0xc0, 0x42, 0x66, 0xf0, 0x8b, 0x42, 0xeb, 0xc9, 0xb4, 0x42, 0xf5, 0x8d, 0xb5, 0x42, + 0x8c, 0x1f, 0x9f, 0x42, 0x2e, 0x8b, 0xe3, 0x41, 0xc9, 0x9b, 0xa3, 0x42, 0xee, 0x59, 0xc5, 0x42, + 0x87, 0x9e, 0xc9, 0x42, 0x38, 0x93, 0xdc, 0x42, 0x60, 0x2b, 0xf5, 0x42, 0x88, 0x9e, 0xfa, 0x42, + 0x21, 0xb0, 0x15, 0x43, 0x5e, 0xb2, 0x11, 0x43, 0x9a, 0x24, 0x15, 0x43, 0x1f, 0x5d, 0x01, 0x43, + 0x5b, 0x45, 0x17, 0x43, 0x51, 0x3f, 0x09, 0x43, 0xff, 0xd5, 0x0d, 0x43, 0x93, 0x95, 0x9e, 0x42, + 0x0a, 0x99, 0xaf, 0x42, 0xaf, 0x0a, 0xc8, 0x42, 0x2a, 0x68, 0xd2, 0x42, 0x84, 0x88, 0x0b, 0x43, + 0x6a, 0xde, 0xf8, 0x42, 0x5b, 0xeb, 0x01, 0x43, 0x10, 0xbb, 0x27, 0x43, 0x82, 0x2b, 0x22, 0x43, + 0x62, 0x67, 0x0f, 0x43, 0x13, 0xc4, 0xeb, 0x42, 0x78, 0xd3, 0x08, 0x43, 0x20, 0x2a, 0x11, 0x43, + 0xcc, 0x61, 0x02, 0x43, 0x43, 0x30, 0xa2, 0x42, 0xf2, 0xd5, 0xa7, 0x42, 0xd7, 0x1d, 0xe5, 0x42, + 0x59, 0xc6, 0xe8, 0x42, 0x68, 0x99, 0xe8, 0x42, 0x18, 0x1a, 0xfe, 0x42, 0xdd, 0x52, 0x0a, 0x43, + 0x91, 0xcd, 0x2b, 0x43, 0xa0, 0xa7, 0x21, 0x43, 0xd1, 
0x2a, 0x28, 0x43, 0x7f, 0xb7, 0x01, 0x43, + 0x21, 0x1c, 0x13, 0x43, 0x2f, 0x43, 0x0a, 0x43, 0xb7, 0xda, 0x01, 0x43, 0x36, 0x7b, 0xa2, 0x42, + 0xf1, 0xe7, 0xa6, 0x42, 0x20, 0xec, 0xff, 0x42, 0xc2, 0x7c, 0xff, 0x42, 0x29, 0x9a, 0xf8, 0x42, + 0x17, 0xa9, 0x09, 0x43, 0xb0, 0xdc, 0x14, 0x43, 0x95, 0xfc, 0x34, 0x43, 0x0b, 0x40, 0x25, 0x43, + 0xc5, 0x6d, 0x23, 0x43, 0xb8, 0x09, 0x14, 0x43, 0x10, 0xea, 0xfe, 0x42, 0xf9, 0x97, 0x03, 0x43, + 0x2c, 0xc5, 0xe0, 0x42, 0x32, 0x5a, 0x8c, 0x42, 0x3a, 0xd3, 0xc3, 0x42, 0x92, 0xdf, 0x01, 0x43, + 0x8d, 0x11, 0xe9, 0x42, 0x36, 0x42, 0x19, 0x43, 0xb5, 0x01, 0xee, 0x42, 0xbd, 0x8f, 0x09, 0x43, + 0x60, 0x29, 0x3b, 0x43, 0x17, 0x93, 0x46, 0x43, 0xf2, 0x9b, 0x2f, 0x43, 0xfe, 0x9e, 0x09, 0x43, + 0xab, 0x43, 0xf8, 0x42, 0xaf, 0x19, 0xe1, 0x42, 0x16, 0x06, 0xe6, 0x42, 0x48, 0x21, 0x8c, 0x42, + 0x93, 0x0f, 0xd7, 0x42, 0x96, 0xaa, 0xfb, 0x42, 0x14, 0xed, 0xeb, 0x42, 0xde, 0x34, 0xef, 0x42, + 0xbc, 0xe5, 0x08, 0x43, 0x82, 0x47, 0x0d, 0x43, 0x6b, 0x34, 0x24, 0x43, 0x84, 0x0f, 0x28, 0x43, + 0xf3, 0xa2, 0x1a, 0x43, 0x0a, 0x20, 0xce, 0x42, 0x6c, 0x11, 0xdd, 0x42, 0xa0, 0xd5, 0xf5, 0x42, + 0xd9, 0xe1, 0x05, 0x43, 0x9c, 0x1c, 0xa8, 0x42, 0xfc, 0xd6, 0xc6, 0x42, 0x25, 0xaa, 0x13, 0x43, + 0xb7, 0x4d, 0xe6, 0x42, 0x30, 0x76, 0xe7, 0x42, 0xbf, 0x08, 0x11, 0x43, 0x87, 0x69, 0x15, 0x43, + 0x44, 0xd2, 0x14, 0x43, 0xf5, 0x04, 0x07, 0x43, 0x90, 0xf3, 0x02, 0x43, 0x04, 0xf7, 0xc0, 0x42, + 0x42, 0x9a, 0xd5, 0x42, 0x6a, 0x3e, 0x08, 0x43, 0x14, 0xde, 0x0f, 0x43, 0x2c, 0xd8, 0xc4, 0x42, + 0x29, 0xee, 0xb0, 0x42, 0x54, 0x07, 0x1d, 0x43, 0x47, 0x34, 0x03, 0x43, 0xe4, 0xc0, 0x04, 0x43, + 0xb0, 0x5c, 0x0f, 0x43, 0xb2, 0x46, 0x0a, 0x43, 0xe4, 0x39, 0x19, 0x43, 0x09, 0x52, 0x05, 0x43, + 0xde, 0x55, 0xdf, 0x42, 0x52, 0x08, 0xf6, 0x42, 0x1a, 0x45, 0xfb, 0x42, 0xbe, 0xc2, 0xe6, 0x42, + 0x0b, 0x48, 0x07, 0x43, 0x79, 0x3f, 0xb9, 0x42, 0x54, 0xfe, 0xd1, 0x42, 0x31, 0xfc, 0x0d, 0x43, + 0x6a, 0x5d, 0x09, 0x43, 0x72, 0x8a, 0x16, 0x43, 0x0c, 0x88, 0x19, 0x43, 0xf1, 0xe6, 0x0f, 
0x43, + 0x8a, 0x30, 0x08, 0x43, 0x7f, 0x11, 0x0e, 0x43, 0x47, 0x85, 0xfb, 0x42, 0x9e, 0xf1, 0x10, 0x43, + 0x2a, 0x3b, 0xf1, 0x42, 0x86, 0x5a, 0x0a, 0x43, 0x4b, 0xa1, 0x2c, 0x43, 0x6c, 0x79, 0xcc, 0x42, + 0xe0, 0x36, 0xcb, 0x42, 0xa5, 0xff, 0x20, 0x43, 0xa6, 0xd7, 0x0e, 0x43, 0x63, 0xf4, 0x06, 0x43, + 0x4e, 0xed, 0xed, 0x42, 0xd5, 0xb1, 0x0b, 0x43, 0x70, 0xb7, 0x19, 0x43, 0x85, 0xe2, 0x15, 0x43, + 0x70, 0x6c, 0x0c, 0x43, 0xb7, 0xe7, 0xef, 0x42, 0xb8, 0xe7, 0x1c, 0x43, 0xe7, 0x8d, 0x20, 0x43, + 0x19, 0x1b, 0x36, 0x43, 0x3c, 0x8e, 0xa7, 0x42, 0x58, 0x2f, 0xb4, 0x42, 0x99, 0x9d, 0xfe, 0x42, + 0x92, 0x54, 0xcd, 0x42, 0x78, 0xae, 0x07, 0x43, 0x7c, 0xb1, 0xe2, 0x42, 0x50, 0xfd, 0xf4, 0x42, + 0xdc, 0x2d, 0xea, 0x42, 0x09, 0xe8, 0x19, 0x43, 0xc8, 0xba, 0x08, 0x43, 0x9f, 0x3f, 0x24, 0x43, + 0xc5, 0x00, 0x22, 0x43, 0xcd, 0xc2, 0x1d, 0x43, 0xc6, 0xcc, 0xf9, 0x42, 0xd6, 0xf1, 0xb3, 0x42, + 0xd4, 0xe3, 0xa2, 0x42, 0x14, 0x3e, 0xd2, 0x42, 0x4c, 0x3b, 0xc7, 0x42, 0x8d, 0x73, 0xe3, 0x42, + 0x31, 0x64, 0xd4, 0x42, 0x41, 0x46, 0xfa, 0x42, 0xe9, 0x09, 0xf1, 0x42, 0xb8, 0x4a, 0x0a, 0x43, + 0x85, 0x85, 0x25, 0x43, 0x72, 0xc8, 0x25, 0x43, 0x30, 0xad, 0x19, 0x43, 0xa5, 0x26, 0x0b, 0x43, + 0x69, 0x7e, 0x07, 0x43, 0x6a, 0x5b, 0x87, 0x42, 0xfa, 0x4d, 0x42, 0x42, 0x69, 0x27, 0x8e, 0x42, + 0xa2, 0x41, 0x8e, 0x42, 0x93, 0xe2, 0x99, 0x42, 0x76, 0x0d, 0x9c, 0x42, 0xaa, 0x22, 0x71, 0x42, + 0x70, 0x35, 0xac, 0x42, 0x32, 0x72, 0xdb, 0x42, 0x51, 0x46, 0xc5, 0x42, 0x1c, 0xa6, 0xe3, 0x42, + 0x62, 0x7e, 0xb4, 0x42, 0x20, 0x49, 0x97, 0x42, 0x26, 0xc8, 0x85, 0x42, 0x70, 0xf0, 0x51, 0x42, + 0xf9, 0x0c, 0x28, 0x42, 0x71, 0xb7, 0x84, 0x42, 0x9b, 0xed, 0x7f, 0x42, 0x82, 0x61, 0x83, 0x42, + 0x2d, 0x0b, 0x9c, 0x42, 0xd2, 0xb0, 0x95, 0x42, 0xee, 0x4a, 0xb5, 0x42, 0x82, 0x8f, 0xa8, 0x42, + 0x8d, 0x76, 0xd1, 0x42, 0x33, 0x2f, 0x7b, 0x42, 0x1f, 0x4d, 0x92, 0x42, 0x29, 0x30, 0xbc, 0x42, + 0x1c, 0xa4, 0x8d, 0x42, 0x91, 0x0c, 0x2c, 0x42, 0x87, 0x35, 0xc9, 0x42, 0x0a, 0x01, 0xdf, 0x42, + 0x0e, 0x98, 0xa0, 0x42, 0x53, 
0xdb, 0xcb, 0x42, 0x91, 0x12, 0x0a, 0x43, 0xc0, 0x39, 0x06, 0x43, + 0x8b, 0xe9, 0x07, 0x43, 0x3d, 0x64, 0x00, 0x43, 0x06, 0xba, 0x11, 0x43, 0x40, 0xd4, 0x0e, 0x43, + 0xa1, 0xc9, 0x00, 0x43, 0xb2, 0xf3, 0x03, 0x43, 0x54, 0xaa, 0x0e, 0x43, 0x3b, 0x6f, 0xd1, 0x42, + 0xa1, 0x9a, 0x9f, 0x42, 0x00, 0xd3, 0xff, 0x42, 0x92, 0x6e, 0xd1, 0x42, 0x85, 0x6b, 0xfa, 0x42, + 0xe9, 0xaa, 0xfb, 0x42, 0x74, 0xd0, 0x09, 0x43, 0xc6, 0x3b, 0x1f, 0x43, 0xa2, 0xd1, 0x20, 0x43, + 0x92, 0xd2, 0x1b, 0x43, 0x29, 0x0a, 0x04, 0x43, 0xbb, 0x7f, 0x0e, 0x43, 0xdb, 0x50, 0x16, 0x43, + 0xb3, 0x0d, 0x15, 0x43, 0x79, 0xcc, 0xb2, 0x42, 0xb4, 0xdb, 0xbd, 0x42, 0xe2, 0xad, 0xfb, 0x42, + 0xab, 0xed, 0xdd, 0x42, 0x91, 0x1c, 0x00, 0x43, 0x6f, 0x47, 0x06, 0x43, 0xe5, 0x5f, 0xf2, 0x42, + 0x5e, 0xb6, 0x2d, 0x43, 0xd0, 0xd3, 0x2e, 0x43, 0x03, 0x5a, 0x39, 0x43, 0xe3, 0x42, 0xe7, 0x42, + 0xcc, 0xa5, 0x1e, 0x43, 0x1e, 0xd5, 0x15, 0x43, 0xbe, 0x72, 0x16, 0x43, 0x84, 0x09, 0xa7, 0x42, + 0x36, 0xcf, 0xb2, 0x42, 0x98, 0x87, 0xe7, 0x42, 0x63, 0xd3, 0xd8, 0x42, 0xca, 0x1a, 0xf8, 0x42, + 0xba, 0xf3, 0x04, 0x43, 0x4b, 0x0c, 0x08, 0x43, 0xb2, 0x6d, 0x3d, 0x43, 0xa3, 0x8c, 0x34, 0x43, + 0x7c, 0x80, 0x26, 0x43, 0x05, 0x15, 0xf7, 0x42, 0x63, 0xa1, 0x13, 0x43, 0xfe, 0x4d, 0x1a, 0x43, + 0xa8, 0x79, 0x02, 0x43, 0x2c, 0x88, 0x94, 0x42, 0x25, 0x7a, 0xc0, 0x42, 0xe8, 0x0d, 0x03, 0x43, + 0x6b, 0x0c, 0xcb, 0x42, 0x7f, 0x29, 0xfa, 0x42, 0xf6, 0x99, 0xf9, 0x42, 0x4c, 0xec, 0x08, 0x43, + 0x33, 0x44, 0x2f, 0x43, 0xe6, 0x9f, 0x2d, 0x43, 0xb8, 0xa9, 0x2b, 0x43, 0x16, 0x06, 0x05, 0x43, + 0x8f, 0x45, 0x0e, 0x43, 0x94, 0x41, 0x07, 0x43, 0x63, 0x85, 0xf9, 0x42, 0xe3, 0x46, 0xaf, 0x42, + 0x15, 0x1b, 0xcf, 0x42, 0x0e, 0x81, 0x0b, 0x43, 0xb1, 0x0c, 0xf2, 0x42, 0xbf, 0x90, 0xf7, 0x42, + 0x74, 0x1b, 0xf7, 0x42, 0x45, 0xf6, 0x21, 0x43, 0xd4, 0x1f, 0x36, 0x43, 0x75, 0xbb, 0x2d, 0x43, + 0xd8, 0x8d, 0x18, 0x43, 0xd9, 0x94, 0xe6, 0x42, 0xb4, 0x9c, 0xfd, 0x42, 0x73, 0x68, 0xef, 0x42, + 0x2a, 0xa1, 0x07, 0x43, 0x61, 0xff, 0xb3, 0x42, 0xb1, 0x27, 0xc7, 
0x42, 0xf3, 0x17, 0x04, 0x43, + 0x23, 0xf9, 0xd1, 0x42, 0xfc, 0x13, 0xde, 0x42, 0xed, 0x10, 0x1a, 0x43, 0x24, 0x1a, 0x0d, 0x43, + 0x5b, 0xe3, 0x1c, 0x43, 0x62, 0x8c, 0x1f, 0x43, 0x20, 0xc3, 0xfd, 0x42, 0x21, 0x8b, 0xc9, 0x42, + 0x6e, 0xd4, 0xfe, 0x42, 0x64, 0xba, 0x02, 0x43, 0x64, 0xd9, 0x04, 0x43, 0x51, 0x5e, 0xb9, 0x42, + 0x0d, 0xa3, 0xd7, 0x42, 0xf9, 0x50, 0x08, 0x43, 0x09, 0x9c, 0x0c, 0x43, 0xcf, 0x1e, 0x02, 0x43, + 0x87, 0xfa, 0x05, 0x43, 0x45, 0xb9, 0xf1, 0x42, 0x34, 0x9b, 0x0c, 0x43, 0xa2, 0x3b, 0x13, 0x43, + 0x30, 0x44, 0xec, 0x42, 0xd0, 0xd2, 0xc9, 0x42, 0xd0, 0xb9, 0xd6, 0x42, 0x58, 0x42, 0x08, 0x43, + 0x86, 0xc7, 0x08, 0x43, 0x59, 0x14, 0xb4, 0x42, 0x36, 0x6c, 0xd1, 0x42, 0xd6, 0xed, 0x0a, 0x43, + 0x73, 0xb5, 0x1c, 0x43, 0x04, 0x9e, 0x2b, 0x43, 0x0a, 0xd6, 0x00, 0x43, 0x94, 0xd0, 0x11, 0x43, + 0x62, 0xd9, 0x03, 0x43, 0xa8, 0x01, 0x12, 0x43, 0x5c, 0x9c, 0x0f, 0x43, 0x29, 0xac, 0x13, 0x43, + 0x9e, 0x06, 0xed, 0x42, 0x9e, 0xe6, 0xf3, 0x42, 0x8c, 0x5d, 0x22, 0x43, 0x56, 0x3a, 0xdd, 0x42, + 0x63, 0x97, 0xa0, 0x42, 0x63, 0xa8, 0x16, 0x43, 0x62, 0xac, 0x19, 0x43, 0x58, 0x5b, 0x25, 0x43, + 0xf4, 0x25, 0xff, 0x42, 0x32, 0x04, 0x17, 0x43, 0x5a, 0x67, 0x1a, 0x43, 0x02, 0x75, 0x17, 0x43, + 0xd5, 0x6a, 0x14, 0x43, 0x60, 0x44, 0x06, 0x43, 0x81, 0xf5, 0x25, 0x43, 0x96, 0x17, 0x25, 0x43, + 0x70, 0x61, 0x2c, 0x43, 0xdf, 0xcb, 0xd1, 0x42, 0xf9, 0x9c, 0xb0, 0x42, 0xf4, 0x2e, 0x0a, 0x43, + 0xaf, 0x0e, 0xd0, 0x42, 0x3a, 0x38, 0x01, 0x43, 0x10, 0xb6, 0xea, 0x42, 0x3e, 0x69, 0x05, 0x43, + 0x37, 0x9f, 0xf8, 0x42, 0x2b, 0x84, 0x16, 0x43, 0x5a, 0x22, 0x06, 0x43, 0x2f, 0xae, 0x1c, 0x43, + 0x32, 0x7e, 0x1f, 0x43, 0x6e, 0x54, 0x29, 0x43, 0x99, 0xf0, 0x18, 0x43, 0xb0, 0xd4, 0xe7, 0x42, + 0x74, 0x96, 0xa1, 0x42, 0x92, 0x06, 0xe8, 0x42, 0x3d, 0xc4, 0xd5, 0x42, 0x81, 0x8c, 0xda, 0x42, + 0x0a, 0x31, 0xcf, 0x42, 0xfd, 0x1b, 0xee, 0x42, 0x96, 0xdd, 0xec, 0x42, 0x70, 0xcc, 0x11, 0x43, + 0x5f, 0x09, 0x17, 0x43, 0xea, 0xdf, 0x2b, 0x43, 0xeb, 0x0e, 0x1e, 0x43, 0xea, 0xab, 0x1f, 0x43, + 0x59, 
0xf1, 0xf9, 0x42, 0xf3, 0x5f, 0xbe, 0x42, 0x3f, 0xb9, 0x4f, 0x42, 0x7e, 0x74, 0xae, 0x42, + 0x8f, 0x9e, 0xa0, 0x42, 0xa4, 0x7e, 0xac, 0x42, 0xe5, 0x59, 0xa4, 0x42, 0x99, 0xe1, 0x8d, 0x42, + 0x1c, 0x35, 0xbb, 0x42, 0x1c, 0x02, 0xe1, 0x42, 0xe1, 0xcc, 0xe9, 0x42, 0xd1, 0xcb, 0x00, 0x43, + 0xe4, 0xe0, 0xcb, 0x42, 0xcd, 0xc2, 0xc5, 0x42, 0x73, 0x0d, 0x88, 0x42, 0x46, 0xdc, 0x24, 0x42, + 0xcb, 0xe2, 0x50, 0x42, 0x89, 0x2e, 0xa3, 0x42, 0xb7, 0x8a, 0x94, 0x42, 0x4d, 0x4e, 0xa8, 0x42, + 0x6d, 0x30, 0xbd, 0x42, 0xe3, 0x45, 0xca, 0x42, 0xef, 0xf9, 0xdf, 0x42, 0xd2, 0x71, 0xd3, 0x42, + 0x47, 0x08, 0xd2, 0x42, 0xef, 0xdc, 0xb4, 0x42, 0xe1, 0x3b, 0xd6, 0x42, 0xcb, 0x03, 0xc4, 0x42, + 0x6b, 0x20, 0xc6, 0x42, 0xa1, 0xd5, 0x60, 0x42, 0xd5, 0x5f, 0x9d, 0x42, 0xf2, 0x11, 0x05, 0x43, + 0xb5, 0xc1, 0xeb, 0x42, 0xa2, 0x87, 0x02, 0x43, 0x49, 0x2e, 0x0f, 0x43, 0x7e, 0x2a, 0x12, 0x43, + 0xa1, 0x35, 0x25, 0x43, 0xf2, 0x36, 0x1a, 0x43, 0xfc, 0xb0, 0x36, 0x43, 0x0c, 0x54, 0xfa, 0x42, + 0xd2, 0x74, 0x1f, 0x43, 0x55, 0xdb, 0x18, 0x43, 0xa9, 0x01, 0x28, 0x43, 0x3e, 0xa5, 0xc6, 0x42, + 0xdf, 0x25, 0xd5, 0x42, 0x09, 0x24, 0x05, 0x43, 0x1a, 0xd2, 0xbe, 0x42, 0xd8, 0xe1, 0x01, 0x43, + 0xfa, 0x7d, 0x19, 0x43, 0x4d, 0x0d, 0x1c, 0x43, 0xf8, 0x44, 0x38, 0x43, 0xe1, 0xa1, 0x30, 0x43, + 0x85, 0x73, 0x32, 0x43, 0x2a, 0x53, 0x1d, 0x43, 0xb3, 0x09, 0x32, 0x43, 0xa2, 0x2f, 0x1a, 0x43, + 0xd3, 0x67, 0x28, 0x43, 0xc9, 0xcf, 0xd2, 0x42, 0x42, 0xe2, 0xca, 0x42, 0x2b, 0xcf, 0x08, 0x43, + 0x6d, 0x71, 0xea, 0x42, 0xb2, 0xd6, 0x19, 0x43, 0x33, 0x65, 0x13, 0x43, 0x9f, 0xab, 0x11, 0x43, + 0xc5, 0x0b, 0x32, 0x43, 0xbd, 0x93, 0x3f, 0x43, 0x5f, 0x2e, 0x32, 0x43, 0xd8, 0x30, 0x26, 0x43, + 0xf2, 0xd3, 0x2e, 0x43, 0xfe, 0x6d, 0x1f, 0x43, 0x99, 0xb9, 0x21, 0x43, 0xde, 0x4f, 0xdb, 0x42, + 0xfb, 0x46, 0xd9, 0x42, 0xed, 0xc1, 0x0a, 0x43, 0xe6, 0xbd, 0xfb, 0x42, 0xa2, 0xf0, 0x10, 0x43, + 0x97, 0xa9, 0x0c, 0x43, 0x9e, 0x3d, 0x1c, 0x43, 0x3b, 0xb2, 0x3c, 0x43, 0xf3, 0x04, 0x4e, 0x43, + 0xd7, 0x24, 0x40, 0x43, 0x79, 0x1c, 0x24, 
0x43, 0x24, 0x3b, 0x27, 0x43, 0x68, 0xaf, 0x07, 0x43, + 0x03, 0x44, 0x11, 0x43, 0x4b, 0x14, 0xc6, 0x42, 0x39, 0xcd, 0xd2, 0x42, 0x05, 0x7c, 0x15, 0x43, + 0x98, 0xe0, 0x00, 0x43, 0x55, 0xa8, 0x1c, 0x43, 0x15, 0xe6, 0x09, 0x43, 0xcf, 0x2e, 0x16, 0x43, + 0x16, 0xb4, 0x48, 0x43, 0x0e, 0x33, 0x4f, 0x43, 0xb7, 0x9b, 0x47, 0x43, 0xf3, 0x4d, 0x24, 0x43, + 0x80, 0x97, 0x12, 0x43, 0x11, 0x30, 0x0f, 0x43, 0x55, 0x78, 0x11, 0x43, 0xcb, 0xb4, 0xdd, 0x42, + 0xd2, 0xd8, 0xfa, 0x42, 0x75, 0xe7, 0x1d, 0x43, 0x95, 0xfa, 0x0b, 0x43, 0xe6, 0x7d, 0x17, 0x43, + 0xe5, 0x54, 0x18, 0x43, 0xba, 0xc6, 0x1d, 0x43, 0x76, 0x6a, 0x44, 0x43, 0x85, 0xf0, 0x41, 0x43, + 0x3b, 0xee, 0x20, 0x43, 0x6d, 0x49, 0x0d, 0x43, 0x55, 0x9d, 0x05, 0x43, 0x62, 0x36, 0x06, 0x43, + 0x05, 0x0b, 0x1a, 0x43, 0xb9, 0x06, 0xca, 0x42, 0x7a, 0x0a, 0xdf, 0x42, 0x7a, 0x01, 0x13, 0x43, + 0xba, 0x30, 0x06, 0x43, 0x0e, 0xfa, 0x16, 0x43, 0x4c, 0x14, 0x1f, 0x43, 0x05, 0xa5, 0x10, 0x43, + 0x94, 0x27, 0x2a, 0x43, 0x81, 0x83, 0x30, 0x43, 0x3c, 0xfd, 0x0c, 0x43, 0xcb, 0x09, 0x08, 0x43, + 0xf6, 0x56, 0xf6, 0x42, 0x73, 0x90, 0x11, 0x43, 0xf3, 0xab, 0x30, 0x43, 0xd9, 0x89, 0xee, 0x42, + 0x1d, 0xbf, 0xce, 0x42, 0xc5, 0x12, 0x13, 0x43, 0xed, 0x7f, 0x19, 0x43, 0xfb, 0xda, 0x0f, 0x43, + 0x18, 0xfd, 0x11, 0x43, 0xc8, 0xbf, 0x26, 0x43, 0x5b, 0xa8, 0x27, 0x43, 0xf2, 0xbf, 0x1c, 0x43, + 0xf5, 0xa2, 0x0d, 0x43, 0x73, 0xa5, 0x08, 0x43, 0x80, 0x39, 0x05, 0x43, 0x05, 0x12, 0x12, 0x43, + 0xcb, 0x6b, 0x23, 0x43, 0x46, 0x10, 0xd4, 0x42, 0x35, 0x30, 0xce, 0x42, 0x93, 0x17, 0x3d, 0x43, + 0x6b, 0xac, 0x2b, 0x43, 0x1d, 0xa9, 0x32, 0x43, 0x71, 0x82, 0x14, 0x43, 0x84, 0x93, 0x29, 0x43, + 0xe3, 0x91, 0x21, 0x43, 0x35, 0x12, 0x29, 0x43, 0x1b, 0xaf, 0x21, 0x43, 0xd9, 0xb9, 0x18, 0x43, + 0xa0, 0x54, 0x0d, 0x43, 0x9e, 0xe4, 0x10, 0x43, 0x67, 0x1f, 0x2e, 0x43, 0x73, 0xe2, 0xf4, 0x42, + 0xcd, 0xe6, 0xd0, 0x42, 0xa7, 0xd5, 0x26, 0x43, 0xf3, 0xd9, 0x28, 0x43, 0x22, 0x97, 0x25, 0x43, + 0xfb, 0x22, 0x11, 0x43, 0x57, 0x03, 0x2b, 0x43, 0x07, 0x57, 0x18, 0x43, 0x5a, 
0xf6, 0x2a, 0x43, + 0xcb, 0xc6, 0x21, 0x43, 0xcd, 0xd5, 0x21, 0x43, 0xbd, 0x9c, 0x27, 0x43, 0x73, 0x85, 0x31, 0x43, + 0x11, 0xa6, 0x3f, 0x43, 0xa6, 0x67, 0xf4, 0x42, 0x75, 0x46, 0xb9, 0x42, 0x28, 0x3c, 0x0b, 0x43, + 0x45, 0x9b, 0x0d, 0x43, 0x80, 0x23, 0x07, 0x43, 0x7a, 0x05, 0x11, 0x43, 0x44, 0x96, 0x1b, 0x43, + 0x15, 0x7d, 0x14, 0x43, 0x8b, 0x6c, 0x23, 0x43, 0xa3, 0xa5, 0x23, 0x43, 0x1b, 0x40, 0x2c, 0x43, + 0x91, 0x0a, 0x41, 0x43, 0xca, 0xa0, 0x41, 0x43, 0x75, 0x1a, 0x2a, 0x43, 0xb5, 0xd4, 0xe1, 0x42, + 0xba, 0x35, 0xb6, 0x42, 0x47, 0xc1, 0xf1, 0x42, 0xb0, 0x87, 0x06, 0x43, 0x6b, 0xd8, 0xdb, 0x42, + 0x39, 0x4a, 0xf9, 0x42, 0xad, 0x71, 0x00, 0x43, 0x5c, 0x4a, 0x0c, 0x43, 0xc3, 0xfb, 0x2c, 0x43, + 0xce, 0x20, 0x2b, 0x43, 0x7b, 0xd9, 0x3e, 0x43, 0xa3, 0x84, 0x29, 0x43, 0xa3, 0x7e, 0x33, 0x43, + 0xb5, 0x19, 0xf9, 0x42, 0x78, 0xfe, 0xbd, 0x42, 0x1f, 0x05, 0x88, 0x42, 0xc7, 0xea, 0x9f, 0x42, + 0xb8, 0xd3, 0xa1, 0x42, 0x63, 0xfe, 0xb6, 0x42, 0xb8, 0xe3, 0xba, 0x42, 0x3d, 0x8c, 0xc1, 0x42, + 0xfd, 0x7c, 0xc3, 0x42, 0xf0, 0xbd, 0xee, 0x42, 0xf2, 0x24, 0xeb, 0x42, 0xac, 0xe5, 0x0b, 0x43, + 0x79, 0xd6, 0xf6, 0x42, 0x9f, 0x33, 0xd6, 0x42, 0x85, 0x8c, 0xae, 0x42, 0x05, 0x1f, 0x56, 0x42, + 0xfc, 0xf8, 0x45, 0x42, 0x2d, 0x44, 0x80, 0x42, 0xb6, 0x40, 0x81, 0x42, 0x15, 0xf5, 0xab, 0x42, + 0x7a, 0x10, 0xb7, 0x42, 0x64, 0x7c, 0xc9, 0x42, 0x7f, 0x59, 0xcc, 0x42, 0xfe, 0x04, 0xd3, 0x42, + 0x6f, 0x8e, 0xd8, 0x42, 0xf8, 0x43, 0x97, 0x42, 0x5d, 0x88, 0xdb, 0x42, 0x23, 0x6d, 0xa4, 0x42, + 0x0d, 0x82, 0xa0, 0x42, 0xa1, 0x11, 0x73, 0x42, 0x1d, 0x1d, 0xbc, 0x42, 0x55, 0x0f, 0xd6, 0x42, + 0xbb, 0x1d, 0xbc, 0x42, 0x05, 0xcd, 0xf9, 0x42, 0xe9, 0xd3, 0x0c, 0x43, 0x32, 0xaf, 0xf1, 0x42, + 0xd6, 0xe5, 0x0f, 0x43, 0x70, 0x58, 0x20, 0x43, 0xb2, 0xea, 0x1c, 0x43, 0xcc, 0x61, 0xf1, 0x42, + 0x82, 0x89, 0x13, 0x43, 0x1a, 0x58, 0x1d, 0x43, 0xc8, 0xa4, 0x14, 0x43, 0xa2, 0xbb, 0xaa, 0x42, + 0x4d, 0x92, 0xd0, 0x42, 0xa1, 0xf8, 0xdc, 0x42, 0x19, 0x3e, 0xe0, 0x42, 0x81, 0xc7, 0xfb, 0x42, + 0x06, 0xf0, 0x15, 
0x43, 0x3a, 0x91, 0x23, 0x43, 0x84, 0x89, 0x27, 0x43, 0xf5, 0x80, 0x0a, 0x43, + 0xf4, 0xdb, 0x15, 0x43, 0x85, 0x53, 0xfa, 0x42, 0x44, 0xf5, 0x18, 0x43, 0x96, 0xc6, 0x13, 0x43, + 0x0a, 0xac, 0x1a, 0x43, 0x80, 0xc8, 0xe1, 0x42, 0xf3, 0x5e, 0xc9, 0x42, 0x3a, 0x03, 0x07, 0x43, + 0x66, 0x58, 0x04, 0x43, 0xe7, 0xde, 0xfc, 0x42, 0x7e, 0x1f, 0x09, 0x43, 0x4e, 0x3e, 0x06, 0x43, + 0x24, 0xf3, 0x3a, 0x43, 0xe8, 0x34, 0x3b, 0x43, 0xa6, 0x57, 0x27, 0x43, 0xda, 0x29, 0x17, 0x43, + 0x1e, 0x05, 0x1a, 0x43, 0xfc, 0x6c, 0x1d, 0x43, 0x5a, 0x36, 0x0d, 0x43, 0x5d, 0x21, 0xad, 0x42, + 0x1b, 0xbc, 0xc5, 0x42, 0x3a, 0xf2, 0x06, 0x43, 0xe3, 0xa1, 0xe5, 0x42, 0x26, 0x4d, 0x0e, 0x43, + 0x87, 0xf9, 0x09, 0x43, 0x06, 0x17, 0x22, 0x43, 0x32, 0xb5, 0x16, 0x43, 0x8e, 0xfb, 0x3a, 0x43, + 0xac, 0x56, 0x2d, 0x43, 0x6a, 0xa4, 0x21, 0x43, 0xb8, 0xce, 0x17, 0x43, 0xfc, 0xb6, 0x16, 0x43, + 0x21, 0x43, 0xfa, 0x42, 0xf2, 0x0e, 0xc1, 0x42, 0xb7, 0x78, 0xd5, 0x42, 0xbc, 0x63, 0x18, 0x43, + 0x24, 0x7f, 0xf8, 0x42, 0x4c, 0xe5, 0xfa, 0x42, 0xcb, 0xea, 0xf9, 0x42, 0x10, 0x9b, 0x1d, 0x43, + 0xae, 0xab, 0x3b, 0x43, 0xf6, 0x37, 0x48, 0x43, 0x5c, 0x32, 0x4a, 0x43, 0xd8, 0x00, 0x1b, 0x43, + 0xb2, 0x6a, 0x0e, 0x43, 0xba, 0x72, 0x10, 0x43, 0xe4, 0x44, 0x0f, 0x43, 0x7b, 0x01, 0xbb, 0x42, + 0xae, 0x87, 0xc8, 0x42, 0x8a, 0x44, 0x0e, 0x43, 0x72, 0x14, 0x0b, 0x43, 0x81, 0xd5, 0xf5, 0x42, + 0xda, 0xa7, 0x0f, 0x43, 0xa2, 0xd3, 0x18, 0x43, 0x12, 0x9d, 0x38, 0x43, 0x02, 0xec, 0x1a, 0x43, + 0xe0, 0x18, 0x0f, 0x43, 0xd6, 0xf2, 0xfd, 0x42, 0x80, 0x18, 0x0d, 0x43, 0xd8, 0xb7, 0x03, 0x43, + 0x0a, 0xb9, 0x16, 0x43, 0x21, 0xe3, 0xd6, 0x42, 0x1a, 0xb3, 0xbe, 0x42, 0x92, 0x98, 0x1d, 0x43, + 0xbd, 0x89, 0x0b, 0x43, 0x28, 0x2e, 0x07, 0x43, 0x92, 0x68, 0x0e, 0x43, 0x76, 0x9d, 0x2b, 0x43, + 0xe0, 0xaa, 0x2f, 0x43, 0xa4, 0xde, 0x20, 0x43, 0x56, 0x2c, 0x1c, 0x43, 0x93, 0xff, 0xe9, 0x42, + 0x93, 0x4f, 0xf3, 0x42, 0x96, 0x8f, 0x02, 0x43, 0xe4, 0xe2, 0x0f, 0x43, 0xa9, 0xac, 0xdb, 0x42, + 0x95, 0x97, 0xbf, 0x42, 0xc4, 0x2c, 0x25, 0x43, 0x92, 
0x06, 0x17, 0x43, 0x40, 0x91, 0x08, 0x43, + 0x54, 0x83, 0x1d, 0x43, 0x84, 0x6d, 0x1c, 0x43, 0xa6, 0xc6, 0x1e, 0x43, 0x4a, 0xc9, 0x09, 0x43, + 0x88, 0x73, 0xfb, 0x42, 0xe4, 0x34, 0x12, 0x43, 0x36, 0xba, 0x16, 0x43, 0x12, 0xd1, 0x06, 0x43, + 0x42, 0xa3, 0x10, 0x43, 0xef, 0x33, 0xd8, 0x42, 0x88, 0x37, 0xd4, 0x42, 0xf6, 0x01, 0x28, 0x43, + 0x98, 0xe0, 0x0e, 0x43, 0xfa, 0xd4, 0x20, 0x43, 0x7a, 0xc9, 0x10, 0x43, 0xd4, 0x22, 0x29, 0x43, + 0x08, 0x45, 0x21, 0x43, 0x14, 0x40, 0x30, 0x43, 0xa6, 0x71, 0x22, 0x43, 0xea, 0x06, 0x10, 0x43, + 0xe4, 0xfc, 0x08, 0x43, 0x50, 0xb9, 0x14, 0x43, 0xba, 0x24, 0x2e, 0x43, 0x8f, 0xa3, 0xf1, 0x42, + 0xe9, 0x0f, 0xb3, 0x42, 0x8c, 0x78, 0x1a, 0x43, 0x5e, 0x49, 0x2e, 0x43, 0x0c, 0x1f, 0x30, 0x43, + 0x7c, 0x12, 0x09, 0x43, 0x4a, 0x21, 0x18, 0x43, 0x6a, 0x02, 0x1c, 0x43, 0xde, 0x87, 0x1a, 0x43, + 0xae, 0x69, 0x20, 0x43, 0xd2, 0xf4, 0x06, 0x43, 0xd2, 0x50, 0x22, 0x43, 0xfe, 0x1e, 0x2f, 0x43, + 0xac, 0x57, 0x28, 0x43, 0x55, 0xb9, 0xce, 0x42, 0x9a, 0x05, 0xc5, 0x42, 0xa1, 0x81, 0xf7, 0x42, + 0xf6, 0x4e, 0xeb, 0x42, 0xbc, 0xf8, 0x18, 0x43, 0xe2, 0x01, 0x02, 0x43, 0xe6, 0xb1, 0x19, 0x43, + 0x92, 0x84, 0x16, 0x43, 0xa4, 0x0d, 0x24, 0x43, 0x72, 0xa6, 0x1a, 0x43, 0x4c, 0x4b, 0x26, 0x43, + 0x40, 0x68, 0x34, 0x43, 0xb0, 0x77, 0x45, 0x43, 0xc2, 0xaa, 0x16, 0x43, 0x2c, 0x45, 0xc2, 0x42, + 0xc7, 0x6d, 0xc5, 0x42, 0x02, 0x48, 0xdd, 0x42, 0xcb, 0xa9, 0xf2, 0x42, 0xc3, 0xc1, 0xef, 0x42, + 0x3e, 0x4e, 0xff, 0x42, 0x87, 0x27, 0xde, 0x42, 0xb6, 0x7f, 0x00, 0x43, 0x36, 0x5b, 0x2a, 0x43, + 0xd8, 0x7b, 0x20, 0x43, 0x64, 0xa4, 0x2e, 0x43, 0xfe, 0xcf, 0x20, 0x43, 0xfe, 0x62, 0x16, 0x43, + 0x06, 0x1d, 0x20, 0x43, 0x87, 0xce, 0xa6, 0x42, 0x9c, 0x57, 0x7c, 0x42, 0x65, 0xa3, 0x9a, 0x42, + 0xe5, 0x96, 0xa5, 0x42, 0xf1, 0x25, 0xbc, 0x42, 0x6b, 0x38, 0xc8, 0x42, 0x3b, 0x7c, 0xaa, 0x42, + 0x99, 0x9e, 0xc9, 0x42, 0xd9, 0x41, 0xee, 0x42, 0xc6, 0x2c, 0x01, 0x43, 0xd3, 0x25, 0x0d, 0x43, + 0xcc, 0x93, 0xdd, 0x42, 0xf9, 0xa5, 0xa9, 0x42, 0x6d, 0x3b, 0x8b, 0x42, 0xff, 0xb0, 0x80, 
0x42, + 0x17, 0x80, 0x36, 0x42, 0x79, 0x25, 0x87, 0x42, 0x12, 0xc8, 0x64, 0x42, 0x21, 0x02, 0x9a, 0x42, + 0x68, 0xc2, 0xba, 0x42, 0x36, 0x67, 0xb2, 0x42, 0x86, 0xd6, 0xb8, 0x42, 0xbf, 0xcc, 0xab, 0x42, + 0xba, 0xad, 0xb7, 0x42, 0x25, 0x9f, 0x87, 0x42, 0xf6, 0xe1, 0x95, 0x42, 0xc6, 0x1a, 0xbd, 0x42, + 0xa6, 0xce, 0x9f, 0x42, 0x4a, 0xa0, 0x4d, 0x42, 0x4f, 0xf0, 0x93, 0x42, 0xcf, 0x5b, 0xc6, 0x42, + 0xae, 0x87, 0xc7, 0x42, 0x99, 0xb9, 0xd9, 0x42, 0xda, 0xbf, 0xfd, 0x42, 0x58, 0x8a, 0xe9, 0x42, + 0x2e, 0x11, 0x0d, 0x43, 0x89, 0xbe, 0x13, 0x43, 0xbb, 0x88, 0x15, 0x43, 0x7b, 0x9e, 0xea, 0x42, + 0x0b, 0xf5, 0x0d, 0x43, 0xed, 0x16, 0x10, 0x43, 0x3a, 0x7b, 0x10, 0x43, 0x62, 0xdb, 0xbb, 0x42, + 0xdc, 0x1b, 0xaa, 0x42, 0x36, 0x29, 0xe1, 0x42, 0x8a, 0xaf, 0x9b, 0x42, 0xe0, 0x69, 0xe3, 0x42, + 0x38, 0xe8, 0xf7, 0x42, 0xc1, 0x3e, 0x09, 0x43, 0x98, 0xa9, 0x1f, 0x43, 0x41, 0x1d, 0x1e, 0x43, + 0x40, 0x7d, 0x0f, 0x43, 0x90, 0x94, 0x08, 0x43, 0x1e, 0xf8, 0x01, 0x43, 0x16, 0x53, 0x16, 0x43, + 0x3e, 0xc2, 0x15, 0x43, 0x10, 0x86, 0xb0, 0x42, 0x4b, 0x74, 0xb3, 0x42, 0x40, 0x30, 0xea, 0x42, + 0x30, 0x20, 0xc0, 0x42, 0xce, 0xe8, 0xfa, 0x42, 0xf2, 0xbc, 0xe7, 0x42, 0xa0, 0xf9, 0x02, 0x43, + 0x9c, 0xb5, 0x2a, 0x43, 0x56, 0xa6, 0x2f, 0x43, 0xf4, 0xf8, 0x35, 0x43, 0x42, 0x97, 0x0c, 0x43, + 0x61, 0x64, 0x05, 0x43, 0xa9, 0x61, 0x18, 0x43, 0xf1, 0x9e, 0x04, 0x43, 0x9f, 0xfe, 0xa1, 0x42, + 0x8f, 0xb6, 0x8a, 0x42, 0x3c, 0x0d, 0xde, 0x42, 0xff, 0x42, 0xde, 0x42, 0x72, 0x2a, 0xf4, 0x42, + 0x45, 0xea, 0x0b, 0x43, 0x9c, 0xc5, 0x04, 0x43, 0xa6, 0x39, 0x21, 0x43, 0x01, 0x34, 0x2e, 0x43, + 0xbd, 0x9d, 0x29, 0x43, 0x19, 0xed, 0x10, 0x43, 0x64, 0x2a, 0x11, 0x43, 0xcc, 0xbe, 0x06, 0x43, + 0xa2, 0x46, 0xeb, 0x42, 0xc8, 0xbc, 0x9a, 0x42, 0x7e, 0x67, 0xb1, 0x42, 0x8b, 0xcf, 0x0a, 0x43, + 0xe7, 0x1c, 0xe4, 0x42, 0x58, 0xc5, 0xfb, 0x42, 0xea, 0xac, 0xee, 0x42, 0x8b, 0x84, 0x17, 0x43, + 0xdd, 0xf4, 0x2e, 0x43, 0xfb, 0xe5, 0x29, 0x43, 0x3e, 0xb2, 0x3c, 0x43, 0x3e, 0x98, 0x0b, 0x43, + 0xd6, 0x37, 0x04, 0x43, 0x79, 
0x5b, 0xc5, 0x42, 0xb6, 0xcb, 0x00, 0x43, 0x10, 0x06, 0xae, 0x42, + 0x69, 0xdc, 0xbe, 0x42, 0x77, 0x58, 0x13, 0x43, 0x78, 0x2d, 0x00, 0x43, 0xc2, 0x60, 0xdc, 0x42, + 0x66, 0xd8, 0x03, 0x43, 0xc2, 0xc5, 0x04, 0x43, 0xa7, 0x16, 0x25, 0x43, 0x57, 0x57, 0x11, 0x43, + 0x9e, 0x08, 0x1a, 0x43, 0x82, 0x7f, 0xe4, 0x42, 0x94, 0x6f, 0xe5, 0x42, 0x7b, 0x52, 0x02, 0x43, + 0x70, 0xeb, 0x08, 0x43, 0x89, 0x11, 0xb7, 0x42, 0xd4, 0xe4, 0xba, 0x42, 0x6b, 0x95, 0x0d, 0x43, + 0x4e, 0x94, 0xea, 0x42, 0x53, 0x8b, 0xf3, 0x42, 0x9a, 0x28, 0x06, 0x43, 0xb2, 0x4f, 0x0f, 0x43, + 0x6d, 0x68, 0x25, 0x43, 0x15, 0x43, 0xf5, 0x42, 0x6e, 0xe4, 0xf9, 0x42, 0x8e, 0x17, 0xdc, 0x42, + 0x59, 0x7c, 0xb3, 0x42, 0xb9, 0xa7, 0xe4, 0x42, 0xe8, 0x6a, 0xf5, 0x42, 0xf4, 0x10, 0xc2, 0x42, + 0xb3, 0x62, 0xa1, 0x42, 0xa7, 0xba, 0x08, 0x43, 0xc6, 0xa0, 0x03, 0x43, 0x8f, 0x90, 0x1c, 0x43, + 0xa9, 0x37, 0x23, 0x43, 0x64, 0x8f, 0x14, 0x43, 0x76, 0xd0, 0x0a, 0x43, 0xf2, 0x51, 0xfd, 0x42, + 0x6c, 0x57, 0xe2, 0x42, 0xdf, 0x0a, 0xe3, 0x42, 0x9c, 0xe8, 0xed, 0x42, 0x8e, 0xdf, 0xea, 0x42, + 0x0c, 0x31, 0x0e, 0x43, 0x26, 0xa4, 0xc6, 0x42, 0x97, 0x38, 0xab, 0x42, 0xe4, 0x88, 0x0a, 0x43, + 0x47, 0xda, 0x0c, 0x43, 0x7a, 0x9f, 0x10, 0x43, 0xb6, 0x4b, 0x09, 0x43, 0x38, 0x22, 0x16, 0x43, + 0x9b, 0x5a, 0x1d, 0x43, 0x38, 0x48, 0x1b, 0x43, 0x2d, 0x96, 0x16, 0x43, 0xa8, 0x66, 0xf8, 0x42, + 0x43, 0xbd, 0x03, 0x43, 0xa7, 0xbd, 0x17, 0x43, 0xba, 0x24, 0x18, 0x43, 0xa3, 0x1c, 0xce, 0x42, + 0xea, 0x34, 0xbe, 0x42, 0x35, 0x42, 0x16, 0x43, 0xff, 0xbd, 0x0b, 0x43, 0x35, 0x47, 0x14, 0x43, + 0x5e, 0xd8, 0x06, 0x43, 0xc2, 0xf2, 0x02, 0x43, 0xfe, 0x70, 0x0e, 0x43, 0x22, 0x89, 0x1a, 0x43, + 0x92, 0x81, 0x07, 0x43, 0x82, 0xd0, 0x01, 0x43, 0xf7, 0x5c, 0x1b, 0x43, 0x7b, 0x8f, 0x11, 0x43, + 0xc0, 0xc5, 0x29, 0x43, 0xd0, 0x5c, 0xe9, 0x42, 0x05, 0x59, 0x92, 0x42, 0x16, 0x05, 0x03, 0x43, + 0x64, 0xc1, 0xd2, 0x42, 0xc0, 0x81, 0x05, 0x43, 0xc8, 0x5d, 0xf5, 0x42, 0xa4, 0x46, 0xf0, 0x42, + 0x29, 0x7d, 0xe9, 0x42, 0x51, 0x7d, 0x14, 0x43, 0xbc, 0xcd, 0x10, 
0x43, 0x04, 0x53, 0x13, 0x43, + 0x92, 0x86, 0x1d, 0x43, 0x46, 0x7f, 0x33, 0x43, 0x30, 0xd8, 0x09, 0x43, 0xf4, 0x71, 0xb4, 0x42, + 0x28, 0x02, 0x8c, 0x42, 0xd9, 0x85, 0xf5, 0x42, 0xae, 0x08, 0xc8, 0x42, 0xe7, 0x09, 0xc2, 0x42, + 0x9a, 0x44, 0xc9, 0x42, 0x54, 0x82, 0xea, 0x42, 0x9b, 0x2e, 0xef, 0x42, 0x60, 0xf8, 0x13, 0x43, + 0x0b, 0x08, 0x0e, 0x43, 0x80, 0x73, 0x1f, 0x43, 0x45, 0x7f, 0x30, 0x43, 0xcc, 0xab, 0x14, 0x43, + 0xc0, 0xd6, 0xf3, 0x42, 0x58, 0x7d, 0xa7, 0x42, 0x13, 0x6f, 0x39, 0x42, 0x0a, 0x75, 0x82, 0x42, + 0x7d, 0x01, 0x89, 0x42, 0xc0, 0xdf, 0x89, 0x42, 0x26, 0xf9, 0x9b, 0x42, 0x29, 0x72, 0xa4, 0x42, + 0xce, 0xab, 0xa5, 0x42, 0x74, 0xc7, 0xc5, 0x42, 0x11, 0xf7, 0xcd, 0x42, 0xc2, 0x37, 0xf1, 0x42, + 0x0b, 0xcf, 0xaf, 0x42, 0xb1, 0x5d, 0xa2, 0x42, 0xc7, 0xa3, 0x24, 0x42, 0x51, 0x2e, 0x2e, 0x42, + 0x71, 0xa7, 0x5f, 0x42, 0x3e, 0x43, 0x96, 0x42, 0xfe, 0x56, 0x8e, 0x42, 0x9e, 0xc3, 0xa9, 0x42, + 0x9d, 0x94, 0xd4, 0x42, 0xed, 0x4e, 0xb8, 0x42, 0xda, 0x74, 0xd7, 0x42, 0xeb, 0xca, 0xc0, 0x42, + 0xaf, 0xc7, 0xec, 0x42, 0xd9, 0x2c, 0x8e, 0x42, 0x32, 0x60, 0xab, 0x42, 0xba, 0xfd, 0xce, 0x42, + 0xbc, 0x9a, 0xb7, 0x42, 0x45, 0x35, 0x49, 0x42, 0x6b, 0xb2, 0xbb, 0x42, 0xc8, 0xae, 0x02, 0x43, + 0x77, 0x74, 0xac, 0x42, 0x03, 0x77, 0xdc, 0x42, 0x5f, 0xa8, 0x01, 0x43, 0xef, 0x79, 0xde, 0x42, + 0x71, 0xee, 0x1b, 0x43, 0x69, 0xcf, 0x20, 0x43, 0xf4, 0xbf, 0x30, 0x43, 0x1f, 0x66, 0xfb, 0x42, + 0xf1, 0xae, 0x1c, 0x43, 0x66, 0x6e, 0x0f, 0x43, 0x00, 0x98, 0x13, 0x43, 0xd1, 0xfa, 0xc1, 0x42, + 0xd7, 0x67, 0xc3, 0x42, 0xc7, 0x1a, 0xe0, 0x42, 0xf1, 0xfe, 0xbd, 0x42, 0xd7, 0xdc, 0x08, 0x43, + 0x58, 0x72, 0x15, 0x43, 0x58, 0xd5, 0x11, 0x43, 0x92, 0x57, 0x23, 0x43, 0xc2, 0x9f, 0x27, 0x43, + 0x1e, 0xca, 0x29, 0x43, 0xe2, 0xbf, 0x07, 0x43, 0x05, 0x82, 0x1a, 0x43, 0x0c, 0x67, 0x1c, 0x43, + 0xae, 0xa2, 0x1a, 0x43, 0x8c, 0xb9, 0xbf, 0x42, 0x73, 0xf9, 0xcf, 0x42, 0x0c, 0x0b, 0x02, 0x43, + 0x46, 0xb0, 0xe3, 0x42, 0xbd, 0xdc, 0xde, 0x42, 0xf5, 0x1e, 0x03, 0x43, 0x3c, 0xf4, 0x09, 0x43, + 0x7e, 
0x74, 0x47, 0x43, 0x02, 0x44, 0x37, 0x43, 0x56, 0x50, 0x33, 0x43, 0xbf, 0x77, 0x16, 0x43, + 0xeb, 0x9a, 0x1f, 0x43, 0x8a, 0x9f, 0x1f, 0x43, 0x8d, 0xbb, 0x0f, 0x43, 0x98, 0x19, 0xb4, 0x42, + 0x0b, 0x1c, 0xb0, 0x42, 0x3b, 0xf9, 0xf0, 0x42, 0x70, 0xbc, 0xe4, 0x42, 0xfc, 0x5f, 0x06, 0x43, + 0xb7, 0x5f, 0x03, 0x43, 0x8a, 0xf0, 0x15, 0x43, 0x58, 0xc6, 0x43, 0x43, 0x06, 0x20, 0x3a, 0x43, + 0x23, 0xe3, 0x1b, 0x43, 0x21, 0xba, 0x21, 0x43, 0x00, 0xbd, 0x22, 0x43, 0x41, 0x5e, 0x12, 0x43, + 0x0b, 0x07, 0x05, 0x43, 0x25, 0xa7, 0xa0, 0x42, 0xb5, 0xd0, 0xce, 0x42, 0xf2, 0x04, 0x0a, 0x43, + 0x88, 0xe8, 0xfd, 0x42, 0xf0, 0xab, 0x10, 0x43, 0x4e, 0x2e, 0x05, 0x43, 0x20, 0xfa, 0x23, 0x43, + 0x75, 0x3b, 0x3b, 0x43, 0x5a, 0x30, 0x4e, 0x43, 0x5a, 0xd4, 0x3a, 0x43, 0xdb, 0x30, 0x11, 0x43, + 0xa7, 0x31, 0x11, 0x43, 0x5f, 0xdf, 0x04, 0x43, 0x3b, 0xcb, 0xe7, 0x42, 0xdb, 0x76, 0xaa, 0x42, + 0x82, 0xbd, 0xe0, 0x42, 0xc1, 0xfc, 0x10, 0x43, 0x13, 0x5d, 0xfd, 0x42, 0xcd, 0x26, 0x02, 0x43, + 0x2e, 0x8b, 0x15, 0x43, 0xc3, 0x45, 0x20, 0x43, 0x51, 0x07, 0x30, 0x43, 0x5a, 0xb6, 0x40, 0x43, + 0x02, 0xca, 0x19, 0x43, 0x40, 0xfc, 0xf1, 0x42, 0x57, 0xcd, 0xee, 0x42, 0x5e, 0x1f, 0x0d, 0x43, + 0x2a, 0x26, 0x0e, 0x43, 0x1b, 0x02, 0xcf, 0x42, 0x43, 0xfc, 0xd3, 0x42, 0xc8, 0xca, 0x0d, 0x43, + 0x33, 0xb2, 0xf6, 0x42, 0x23, 0xc6, 0xfe, 0x42, 0x56, 0x6f, 0x04, 0x43, 0x24, 0xdf, 0x2d, 0x43, + 0x8d, 0xf3, 0x27, 0x43, 0x6b, 0xec, 0x15, 0x43, 0x9a, 0x97, 0xfe, 0x42, 0x89, 0x20, 0xe2, 0x42, + 0x0a, 0x93, 0xdd, 0x42, 0xcf, 0xb1, 0xfe, 0x42, 0x16, 0xa4, 0x10, 0x43, 0x4c, 0x28, 0xcf, 0x42, + 0x5c, 0x01, 0xbe, 0x42, 0xed, 0xc5, 0x07, 0x43, 0x55, 0x13, 0x1c, 0x43, 0x75, 0xca, 0x18, 0x43, + 0x3e, 0x35, 0x0f, 0x43, 0x4d, 0xab, 0x14, 0x43, 0xf5, 0xaa, 0x15, 0x43, 0x36, 0x75, 0x14, 0x43, + 0x4b, 0xeb, 0x0a, 0x43, 0x46, 0x27, 0x0e, 0x43, 0xee, 0xfe, 0x00, 0x43, 0xc0, 0x58, 0x01, 0x43, + 0xe4, 0xcd, 0x0d, 0x43, 0x46, 0x63, 0xc1, 0x42, 0x85, 0xc6, 0xd2, 0x42, 0x8e, 0x4b, 0x14, 0x43, + 0xa1, 0x69, 0x18, 0x43, 0x45, 0xbd, 0x22, 
0x43, 0xa0, 0x62, 0x15, 0x43, 0x7e, 0x3c, 0x22, 0x43, + 0x5e, 0xd7, 0x1b, 0x43, 0xe0, 0x18, 0x2c, 0x43, 0x6a, 0x9b, 0x22, 0x43, 0xc0, 0xbf, 0x12, 0x43, + 0xf4, 0xbd, 0x0d, 0x43, 0x98, 0x54, 0x1b, 0x43, 0xdc, 0x3a, 0x23, 0x43, 0x86, 0xbb, 0xe2, 0x42, + 0x6f, 0x8e, 0xc7, 0x42, 0x71, 0x56, 0x1f, 0x43, 0xba, 0xe9, 0x13, 0x43, 0x62, 0xb3, 0x1f, 0x43, + 0xee, 0xae, 0x1b, 0x43, 0xe6, 0x36, 0x1e, 0x43, 0xfa, 0x59, 0x15, 0x43, 0x44, 0xe1, 0x1f, 0x43, + 0x96, 0x33, 0x18, 0x43, 0xc0, 0x35, 0x18, 0x43, 0x81, 0x48, 0x20, 0x43, 0xc0, 0xd3, 0x1b, 0x43, + 0xfe, 0x3f, 0x42, 0x43, 0x8f, 0xf9, 0xf7, 0x42, 0x16, 0xd7, 0xa6, 0x42, 0xca, 0x49, 0x07, 0x43, + 0x6d, 0x59, 0xde, 0x42, 0x4b, 0x50, 0x0d, 0x43, 0xa6, 0x80, 0xf4, 0x42, 0x34, 0xac, 0xe7, 0x42, + 0x50, 0x0b, 0x08, 0x43, 0x22, 0x74, 0x1b, 0x43, 0x9a, 0xee, 0x1f, 0x43, 0x3a, 0x1f, 0x2b, 0x43, + 0x2f, 0x6f, 0x27, 0x43, 0x48, 0x7b, 0x3d, 0x43, 0x73, 0x5c, 0x18, 0x43, 0xe3, 0xd0, 0xc1, 0x42, + 0xa9, 0x29, 0xc3, 0x42, 0x31, 0x61, 0xe6, 0x42, 0xc1, 0x8d, 0xa6, 0x42, 0xb4, 0x30, 0xf4, 0x42, + 0xe3, 0x90, 0x02, 0x43, 0x18, 0x53, 0x04, 0x43, 0xc5, 0x3f, 0xfe, 0x42, 0x78, 0x89, 0x16, 0x43, + 0x9d, 0x49, 0x25, 0x43, 0x49, 0xe9, 0x39, 0x43, 0xea, 0x85, 0x40, 0x43, 0xaa, 0x0e, 0x22, 0x43, + 0xf3, 0x35, 0xe8, 0x42, 0x89, 0x36, 0xa6, 0x42, 0xf3, 0x0a, 0x72, 0x42, 0xc9, 0x7e, 0x8b, 0x42, + 0x89, 0x25, 0x99, 0x42, 0xa2, 0xd7, 0x9a, 0x42, 0x3f, 0x01, 0xb6, 0x42, 0x0d, 0x75, 0xb9, 0x42, + 0x41, 0xe7, 0xb4, 0x42, 0x95, 0xf9, 0xd2, 0x42, 0xf1, 0x91, 0xe3, 0x42, 0xb6, 0x0d, 0x06, 0x43, + 0x99, 0xc3, 0xcd, 0x42, 0x93, 0x43, 0xa1, 0x42, 0xeb, 0x50, 0x76, 0x42, 0xe3, 0x82, 0x6d, 0x42, + 0x92, 0x15, 0x36, 0x42, 0x70, 0x82, 0x8a, 0x42, 0x9f, 0x24, 0x7f, 0x42, 0xda, 0x5f, 0x9f, 0x42, + 0xd0, 0x1c, 0xc9, 0x42, 0x92, 0x36, 0xc4, 0x42, 0x86, 0x27, 0xc1, 0x42, 0x2a, 0xac, 0xbc, 0x42, + 0x58, 0xc1, 0xc3, 0x42, 0x62, 0x7d, 0x88, 0x42, 0x3c, 0x6a, 0xd6, 0x42, 0xdc, 0xda, 0xa9, 0x42, + 0x52, 0xbb, 0xab, 0x42, 0x09, 0x51, 0x34, 0x42, 0x06, 0x65, 0x9f, 0x42, 0xda, 
0x70, 0xcd, 0x42, + 0x40, 0x31, 0xd5, 0x42, 0x48, 0x53, 0xfc, 0x42, 0xc2, 0x32, 0x0b, 0x43, 0x52, 0x85, 0xfb, 0x42, + 0x4b, 0xc0, 0x17, 0x43, 0x1b, 0xfc, 0x11, 0x43, 0x64, 0xe7, 0x19, 0x43, 0xc4, 0xd5, 0xd7, 0x42, + 0xba, 0x06, 0x19, 0x43, 0x63, 0xa7, 0x05, 0x43, 0xa7, 0xf8, 0x18, 0x43, 0xf8, 0x9e, 0xaa, 0x42, + 0x32, 0xbf, 0xba, 0x42, 0x50, 0x7d, 0xb7, 0x42, 0x16, 0xd3, 0xbd, 0x42, 0xcc, 0xcc, 0x00, 0x43, + 0xd3, 0xd6, 0x09, 0x43, 0x71, 0xca, 0x06, 0x43, 0x87, 0x8c, 0x20, 0x43, 0xf3, 0x21, 0x23, 0x43, + 0xa7, 0x0c, 0x13, 0x43, 0xa0, 0xd4, 0x01, 0x43, 0x97, 0x68, 0x0d, 0x43, 0x66, 0xdd, 0x07, 0x43, + 0xca, 0x1d, 0x0f, 0x43, 0xc0, 0xdd, 0xc4, 0x42, 0xb8, 0xf1, 0xa0, 0x42, 0x1e, 0x48, 0xf6, 0x42, + 0x3e, 0x9f, 0xd9, 0x42, 0x32, 0xfe, 0x06, 0x43, 0x38, 0x3e, 0xfa, 0x42, 0x49, 0x11, 0x15, 0x43, + 0xab, 0x3f, 0x1b, 0x43, 0xc7, 0xfd, 0x27, 0x43, 0x21, 0xfc, 0x1f, 0x43, 0x50, 0xaf, 0x1d, 0x43, + 0x29, 0xad, 0x02, 0x43, 0x49, 0xe3, 0x16, 0x43, 0xe0, 0x1a, 0xfb, 0x42, 0xa6, 0x32, 0xbd, 0x42, + 0x90, 0xd9, 0xcd, 0x42, 0xce, 0x5a, 0xea, 0x42, 0xe4, 0xbb, 0xd2, 0x42, 0xf4, 0x73, 0x01, 0x43, + 0x26, 0x9a, 0xda, 0x42, 0x7a, 0x81, 0x17, 0x43, 0x7b, 0x8d, 0x28, 0x43, 0xf1, 0x59, 0x23, 0x43, + 0x51, 0xf3, 0x28, 0x43, 0xdf, 0x50, 0x19, 0x43, 0x73, 0xae, 0x09, 0x43, 0x9a, 0x7c, 0xf8, 0x42, + 0x66, 0x04, 0xf2, 0x42, 0x20, 0x5b, 0x9f, 0x42, 0xec, 0x3c, 0xdb, 0x42, 0x0d, 0xc4, 0x04, 0x43, + 0x8c, 0xac, 0xeb, 0x42, 0x72, 0x47, 0x0b, 0x43, 0x2c, 0xba, 0xf5, 0x42, 0x73, 0xd7, 0x06, 0x43, + 0x15, 0x6a, 0x36, 0x43, 0xdd, 0xb7, 0x35, 0x43, 0x57, 0x89, 0x33, 0x43, 0x6f, 0xf0, 0x0c, 0x43, + 0xd1, 0x77, 0x16, 0x43, 0x3c, 0x21, 0x00, 0x43, 0xe3, 0x6a, 0x09, 0x43, 0xaa, 0xb1, 0xa8, 0x42, + 0x18, 0x9c, 0xd8, 0x42, 0x9f, 0xe6, 0x0b, 0x43, 0xea, 0x77, 0xe7, 0x42, 0xa8, 0xc4, 0xfb, 0x42, + 0x35, 0xb3, 0x0f, 0x43, 0xe8, 0xc9, 0x12, 0x43, 0x5b, 0x2d, 0x33, 0x43, 0x51, 0xfc, 0x1e, 0x43, + 0xeb, 0x43, 0x03, 0x43, 0x06, 0x11, 0xcf, 0x42, 0x62, 0x1a, 0xed, 0x42, 0xa2, 0xe5, 0x02, 0x43, + 0xa0, 0x6b, 0x0d, 
0x43, 0x32, 0x25, 0xa3, 0x42, 0x58, 0x7b, 0xcd, 0x42, 0x3b, 0x7e, 0x12, 0x43, + 0xb4, 0x6a, 0xdc, 0x42, 0x20, 0x02, 0xf6, 0x42, 0x9e, 0x4d, 0xfc, 0x42, 0x94, 0xab, 0x20, 0x43, + 0xcb, 0xdb, 0x1d, 0x43, 0x0c, 0x19, 0x13, 0x43, 0xc7, 0xd8, 0x00, 0x43, 0xe6, 0xc5, 0xd9, 0x42, + 0xe2, 0xae, 0xc9, 0x42, 0x28, 0x70, 0x01, 0x43, 0x93, 0x22, 0x0e, 0x43, 0xf2, 0xbc, 0xb7, 0x42, + 0xba, 0x29, 0xaa, 0x42, 0xe1, 0x49, 0x1a, 0x43, 0xa0, 0xde, 0x00, 0x43, 0xac, 0x00, 0x02, 0x43, + 0x59, 0x3f, 0x01, 0x43, 0x25, 0x1f, 0x20, 0x43, 0x38, 0x32, 0x1c, 0x43, 0x55, 0x7b, 0x05, 0x43, + 0x6a, 0x15, 0x06, 0x43, 0x9b, 0xa0, 0x05, 0x43, 0x5c, 0x86, 0xf0, 0x42, 0xaa, 0xa6, 0xfa, 0x42, + 0x69, 0x51, 0x16, 0x43, 0x54, 0xb6, 0xc9, 0x42, 0x94, 0x73, 0xc5, 0x42, 0x31, 0x68, 0x19, 0x43, + 0x4c, 0xf1, 0x20, 0x43, 0xd8, 0xda, 0x16, 0x43, 0x19, 0x29, 0x0b, 0x43, 0xf1, 0x45, 0x21, 0x43, + 0x38, 0x2f, 0x0c, 0x43, 0xcd, 0xa2, 0x20, 0x43, 0xab, 0xb1, 0x0f, 0x43, 0x02, 0xf4, 0x01, 0x43, + 0x27, 0x9e, 0x02, 0x43, 0x2b, 0x67, 0x12, 0x43, 0x7b, 0x2d, 0x1f, 0x43, 0xfc, 0x3a, 0xde, 0x42, + 0xdc, 0xca, 0xd8, 0x42, 0x52, 0x88, 0x00, 0x43, 0x42, 0x53, 0x22, 0x43, 0x5f, 0xd1, 0x09, 0x43, + 0x9c, 0x0b, 0x07, 0x43, 0x54, 0x98, 0x0c, 0x43, 0xa1, 0xe0, 0x07, 0x43, 0x23, 0x25, 0x26, 0x43, + 0x33, 0x1c, 0x0b, 0x43, 0x3b, 0x39, 0x04, 0x43, 0xd1, 0xcc, 0x11, 0x43, 0x70, 0xae, 0x17, 0x43, + 0x09, 0x5e, 0x2c, 0x43, 0x4a, 0x81, 0xbf, 0x42, 0x52, 0x5f, 0xad, 0x42, 0xc0, 0x89, 0xe5, 0x42, + 0xea, 0xf0, 0x0a, 0x43, 0x9e, 0x70, 0xfc, 0x42, 0xc8, 0x95, 0xe3, 0x42, 0xf8, 0x98, 0xf5, 0x42, + 0xb1, 0xcc, 0x09, 0x43, 0x47, 0x10, 0x11, 0x43, 0x64, 0xd6, 0x0d, 0x43, 0x18, 0x19, 0x19, 0x43, + 0x80, 0xb2, 0x2a, 0x43, 0x2f, 0x18, 0x2b, 0x43, 0xe6, 0xcd, 0x13, 0x43, 0xd0, 0x9f, 0xa5, 0x42, + 0xd4, 0x99, 0xaa, 0x42, 0x7a, 0x76, 0xc2, 0x42, 0xd6, 0xe5, 0xe2, 0x42, 0x5c, 0x4a, 0x03, 0x43, + 0x14, 0x51, 0xc9, 0x42, 0x0c, 0xf1, 0xce, 0x42, 0xa9, 0x85, 0x09, 0x43, 0x12, 0xd6, 0x1d, 0x43, + 0xa2, 0x30, 0x15, 0x43, 0xdd, 0xe0, 0x2e, 0x43, 0x5f, 
0x78, 0x13, 0x43, 0x35, 0x50, 0x08, 0x43, + 0xa4, 0x61, 0xfc, 0x42, 0x8c, 0x96, 0x97, 0x42, 0x79, 0x23, 0x61, 0x42, 0xfe, 0x55, 0x87, 0x42, + 0x94, 0xa3, 0x8b, 0x42, 0x06, 0xf9, 0xb2, 0x42, 0xba, 0xb3, 0xb1, 0x42, 0xde, 0x1a, 0x8c, 0x42, + 0xba, 0x0b, 0xa1, 0x42, 0x5c, 0xab, 0xd3, 0x42, 0x64, 0x98, 0xed, 0x42, 0x10, 0x97, 0xfd, 0x42, + 0x66, 0xfd, 0xc9, 0x42, 0x9c, 0xbc, 0x8a, 0x42, 0xea, 0xed, 0x97, 0x42, 0x17, 0xcd, 0x4c, 0x42, + 0x32, 0xcb, 0xb6, 0x41, 0xb5, 0x7d, 0x60, 0x42, 0x23, 0xc4, 0x86, 0x42, 0x4c, 0xb5, 0x92, 0x42, + 0xd3, 0xf7, 0xab, 0x42, 0x90, 0x26, 0x9e, 0x42, 0x82, 0x0f, 0xbd, 0x42, 0x0a, 0x00, 0xa7, 0x42, + 0x08, 0x96, 0xc0, 0x42, 0xc5, 0x33, 0x8c, 0x42, 0x04, 0xcc, 0xa6, 0x42, 0xf6, 0x85, 0x92, 0x42, + 0xae, 0x54, 0xb9, 0x42, 0xb5, 0x5c, 0x37, 0x42, 0xc3, 0x69, 0xb1, 0x42, 0x73, 0x78, 0xd0, 0x42, + 0x16, 0xc4, 0xa6, 0x42, 0x8c, 0x65, 0xd0, 0x42, 0x3c, 0x2d, 0x0f, 0x43, 0x42, 0x7c, 0xf1, 0x42, + 0x63, 0x70, 0x1c, 0x43, 0xb5, 0xec, 0x10, 0x43, 0x9f, 0x30, 0x19, 0x43, 0x53, 0xf2, 0xed, 0x42, + 0x0b, 0xc2, 0x0d, 0x43, 0x9b, 0x83, 0x1b, 0x43, 0xf6, 0xc6, 0x0a, 0x43, 0x68, 0xc9, 0x97, 0x42, + 0x31, 0xc0, 0xb8, 0x42, 0x3a, 0xd1, 0xd1, 0x42, 0x57, 0x5f, 0xe1, 0x42, 0x44, 0x6e, 0xf5, 0x42, + 0x32, 0x3b, 0x1a, 0x43, 0xee, 0x35, 0x19, 0x43, 0x4d, 0x67, 0x1e, 0x43, 0x87, 0xd1, 0x23, 0x43, + 0x5f, 0x47, 0x14, 0x43, 0x22, 0xff, 0x0a, 0x43, 0x87, 0x46, 0x18, 0x43, 0x2f, 0xbb, 0x0f, 0x43, + 0xdf, 0xa4, 0x12, 0x43, 0xaf, 0xf7, 0xbc, 0x42, 0xb2, 0x53, 0xdb, 0x42, 0x59, 0xd2, 0xe8, 0x42, + 0x38, 0xdd, 0xc4, 0x42, 0x00, 0xdb, 0xe4, 0x42, 0x7b, 0x9f, 0x01, 0x43, 0x02, 0x67, 0x01, 0x43, + 0x90, 0x79, 0x3f, 0x43, 0xa4, 0x6e, 0x33, 0x43, 0x3f, 0x2f, 0x34, 0x43, 0x7e, 0x67, 0x11, 0x43, + 0x69, 0x0b, 0x1e, 0x43, 0x15, 0x70, 0x20, 0x43, 0x4f, 0xc7, 0x06, 0x43, 0x7c, 0x5c, 0xaa, 0x42, + 0x6c, 0x80, 0xad, 0x42, 0x00, 0x1f, 0xe4, 0x42, 0x56, 0x69, 0xf4, 0x42, 0xcb, 0xbb, 0xf6, 0x42, + 0x61, 0x45, 0x06, 0x43, 0x40, 0x83, 0x1b, 0x43, 0x8a, 0xbe, 0x1d, 0x43, 0x23, 0xd9, 0x40, 
0x43, + 0xca, 0xbd, 0x29, 0x43, 0x53, 0x64, 0x10, 0x43, 0x7d, 0x59, 0x14, 0x43, 0x2f, 0x9e, 0x19, 0x43, + 0x7e, 0xb4, 0xfc, 0x42, 0x96, 0x91, 0x96, 0x42, 0x6f, 0xf6, 0xcf, 0x42, 0xf5, 0x17, 0x13, 0x43, + 0x65, 0x53, 0xe8, 0x42, 0x40, 0xf5, 0xfc, 0x42, 0x67, 0xc2, 0x08, 0x43, 0xc9, 0x39, 0x0a, 0x43, + 0x5d, 0x71, 0x36, 0x43, 0xe3, 0xd0, 0x4b, 0x43, 0x45, 0x41, 0x3c, 0x43, 0xee, 0xfd, 0x12, 0x43, + 0x67, 0xaf, 0x0d, 0x43, 0xe7, 0xfe, 0x05, 0x43, 0x6d, 0xfe, 0x00, 0x43, 0x6c, 0xf7, 0xa4, 0x42, + 0xc9, 0x10, 0xd0, 0x42, 0x2b, 0xf1, 0x0f, 0x43, 0xfe, 0x3d, 0xfd, 0x42, 0xdc, 0xc8, 0xfa, 0x42, + 0xdf, 0xa4, 0x0f, 0x43, 0x54, 0x08, 0x16, 0x43, 0x2f, 0x0a, 0x2a, 0x43, 0x3e, 0x13, 0x2c, 0x43, + 0xd8, 0x7f, 0x19, 0x43, 0x25, 0x04, 0xf3, 0x42, 0x27, 0x86, 0xe1, 0x42, 0x51, 0xb9, 0xf3, 0x42, + 0xf5, 0x35, 0x18, 0x43, 0x74, 0xb9, 0xb0, 0x42, 0x34, 0x2e, 0xc8, 0x42, 0xdc, 0x39, 0x05, 0x43, + 0x50, 0x0b, 0xf5, 0x42, 0x5c, 0x63, 0x0b, 0x43, 0x1c, 0x45, 0xf9, 0x42, 0x03, 0x4b, 0x1c, 0x43, + 0x8c, 0xf5, 0x2c, 0x43, 0xfc, 0x67, 0x29, 0x43, 0xff, 0x60, 0x21, 0x43, 0xe6, 0x4b, 0xcb, 0x42, + 0x1f, 0x99, 0xcb, 0x42, 0xb0, 0x24, 0x0f, 0x43, 0x7b, 0x9b, 0x1c, 0x43, 0x83, 0x6f, 0xb7, 0x42, + 0x51, 0xd7, 0xc8, 0x42, 0x79, 0xd8, 0x23, 0x43, 0x3e, 0x5c, 0x0e, 0x43, 0x3b, 0x82, 0xf0, 0x42, + 0x77, 0x13, 0x03, 0x43, 0x7f, 0x8e, 0x12, 0x43, 0xe7, 0x62, 0x11, 0x43, 0x72, 0xa1, 0x07, 0x43, + 0x11, 0xdd, 0x16, 0x43, 0x8f, 0x6f, 0xef, 0x42, 0x19, 0x29, 0x05, 0x43, 0x4e, 0x2f, 0xe8, 0x42, + 0x9b, 0x32, 0x16, 0x43, 0x33, 0x9c, 0xd7, 0x42, 0xee, 0x05, 0xb7, 0x42, 0x83, 0x9b, 0x20, 0x43, + 0x34, 0xe0, 0x12, 0x43, 0xb4, 0xc2, 0x23, 0x43, 0xe3, 0x37, 0x1e, 0x43, 0xa3, 0xc0, 0x09, 0x43, + 0x39, 0xf4, 0x17, 0x43, 0x05, 0xf9, 0x1f, 0x43, 0xf5, 0xad, 0x17, 0x43, 0xf4, 0xed, 0x15, 0x43, + 0x78, 0x60, 0xfa, 0x42, 0xb5, 0x9c, 0x07, 0x43, 0x49, 0xa8, 0x26, 0x43, 0x59, 0xa4, 0xe6, 0x42, + 0xb4, 0x29, 0xa6, 0x42, 0xca, 0x81, 0x1c, 0x43, 0x50, 0x63, 0x18, 0x43, 0xef, 0x23, 0x1b, 0x43, + 0x47, 0x01, 0x1b, 0x43, 0x11, 
0x17, 0x19, 0x43, 0x2d, 0xfc, 0x18, 0x43, 0x33, 0x66, 0x10, 0x43, + 0x81, 0x5e, 0x0e, 0x43, 0xbc, 0xb7, 0x09, 0x43, 0xac, 0x63, 0x25, 0x43, 0xec, 0xf6, 0x20, 0x43, + 0xbf, 0xb5, 0x1f, 0x43, 0x56, 0xcf, 0xd7, 0x42, 0x80, 0xb3, 0x98, 0x42, 0x66, 0x90, 0x0d, 0x43, + 0xf8, 0x0f, 0xf9, 0x42, 0x9f, 0x7a, 0x05, 0x43, 0x34, 0x07, 0xed, 0x42, 0xb3, 0x1f, 0x05, 0x43, + 0xc6, 0x38, 0x17, 0x43, 0x5c, 0x1c, 0x2d, 0x43, 0xe1, 0xf8, 0x0b, 0x43, 0x9f, 0xfe, 0x25, 0x43, + 0xb6, 0xb7, 0x1d, 0x43, 0x1b, 0xb5, 0x39, 0x43, 0xdf, 0xde, 0x1c, 0x43, 0x1b, 0x7f, 0xc4, 0x42, + 0xaf, 0x61, 0xa9, 0x42, 0xd2, 0x23, 0xdd, 0x42, 0x06, 0x1a, 0xe6, 0x42, 0x72, 0xd4, 0xf6, 0x42, + 0x01, 0x1f, 0xcb, 0x42, 0xd8, 0x79, 0xdd, 0x42, 0x3d, 0x05, 0xdc, 0x42, 0xac, 0xdb, 0x28, 0x43, + 0x55, 0x02, 0x24, 0x43, 0xb9, 0xdd, 0x2c, 0x43, 0x51, 0xbc, 0x1c, 0x43, 0x99, 0xc3, 0x1c, 0x43, + 0x70, 0x4d, 0x05, 0x43, 0xf2, 0xd9, 0xac, 0x42, 0xfd, 0xac, 0x2a, 0x42, 0x19, 0x32, 0x9c, 0x42, + 0xa4, 0x19, 0x85, 0x42, 0xc3, 0xe3, 0x98, 0x42, 0xb2, 0xa7, 0xb1, 0x42, 0x36, 0xac, 0x8c, 0x42, + 0x15, 0x0b, 0xa6, 0x42, 0xdd, 0xdf, 0xcd, 0x42, 0xcc, 0x82, 0xed, 0x42, 0x08, 0x66, 0x05, 0x43, + 0x21, 0xf0, 0xd2, 0x42, 0xa3, 0x24, 0xa7, 0x42, 0xb5, 0xf1, 0x45, 0x42, 0xdc, 0x76, 0x52, 0x42, + 0x66, 0x8a, 0x49, 0x42, 0x56, 0x70, 0x9b, 0x42, 0x66, 0x61, 0x60, 0x42, 0xb6, 0xa1, 0xa5, 0x42, + 0x5b, 0x5f, 0xbe, 0x42, 0xc9, 0x3a, 0xc3, 0x42, 0xc4, 0x26, 0xc9, 0x42, 0x5e, 0x81, 0xb2, 0x42, + 0x0b, 0x47, 0xd4, 0x42, 0x6b, 0xd2, 0xae, 0x42, 0x4f, 0x8a, 0xb5, 0x42, 0x22, 0x7a, 0xa8, 0x42, + 0x97, 0xc9, 0xa2, 0x42, 0x85, 0xb0, 0x23, 0x42, 0xea, 0xe8, 0xb0, 0x42, 0xe8, 0xa0, 0xcc, 0x42, + 0x49, 0x0f, 0xd2, 0x42, 0x5c, 0xd2, 0xfd, 0x42, 0xb2, 0xc0, 0xef, 0x42, 0xe8, 0x3a, 0xf4, 0x42, + 0xf7, 0x51, 0x0d, 0x43, 0x76, 0x03, 0x0f, 0x43, 0xae, 0xfc, 0x18, 0x43, 0xba, 0x21, 0xdc, 0x42, + 0x2f, 0x93, 0x08, 0x43, 0x90, 0x30, 0x18, 0x43, 0xce, 0x79, 0x15, 0x43, 0x86, 0x70, 0xb2, 0x42, + 0x04, 0xa4, 0x99, 0x42, 0xfe, 0xf0, 0xe0, 0x42, 0x20, 0xbc, 0xe0, 
0x42, 0x5e, 0x23, 0xdc, 0x42, + 0x22, 0xd9, 0x08, 0x43, 0xb2, 0x79, 0x08, 0x43, 0x89, 0xc7, 0x1d, 0x43, 0x94, 0x98, 0x1d, 0x43, + 0xd8, 0xc3, 0x1a, 0x43, 0x04, 0x0a, 0xf2, 0x42, 0x5c, 0xcf, 0x15, 0x43, 0x92, 0x8e, 0x11, 0x43, + 0x22, 0xd0, 0x1b, 0x43, 0x24, 0x30, 0xbe, 0x42, 0x3a, 0x9b, 0xbb, 0x42, 0xf9, 0xaa, 0x04, 0x43, + 0xdb, 0x74, 0xf4, 0x42, 0x43, 0xc3, 0x01, 0x43, 0x71, 0xfe, 0x00, 0x43, 0xfe, 0x2b, 0x0e, 0x43, + 0x56, 0xf6, 0x1b, 0x43, 0xc3, 0xf5, 0x3a, 0x43, 0xe7, 0xa6, 0x31, 0x43, 0x24, 0xd0, 0x24, 0x43, + 0x21, 0x67, 0x17, 0x43, 0x49, 0x04, 0x17, 0x43, 0x1f, 0xb0, 0x0b, 0x43, 0x1c, 0x32, 0x9f, 0x42, + 0x56, 0x49, 0xb4, 0x42, 0xa8, 0x62, 0xe6, 0x42, 0x14, 0xb4, 0xd8, 0x42, 0x2c, 0xa1, 0xe9, 0x42, + 0x6f, 0x3e, 0x01, 0x43, 0x91, 0x47, 0x14, 0x43, 0xbb, 0x17, 0x21, 0x43, 0x6a, 0x13, 0x3d, 0x43, + 0x4b, 0x56, 0x2e, 0x43, 0x34, 0x5a, 0x1d, 0x43, 0x2c, 0xed, 0x0b, 0x43, 0xa2, 0xf6, 0x0d, 0x43, + 0xa0, 0xb7, 0xfb, 0x42, 0xbe, 0x88, 0xb2, 0x42, 0x24, 0x91, 0xba, 0x42, 0x16, 0xc2, 0xf8, 0x42, + 0xe0, 0xf1, 0xfb, 0x42, 0x6f, 0x7c, 0x0b, 0x43, 0x18, 0xcb, 0xea, 0x42, 0xad, 0xf4, 0x14, 0x43, + 0x3a, 0xeb, 0x3e, 0x43, 0xf5, 0x76, 0x40, 0x43, 0x6c, 0xf9, 0x42, 0x43, 0x15, 0x36, 0x17, 0x43, + 0x92, 0x62, 0x02, 0x43, 0x47, 0xc6, 0xf7, 0x42, 0xc9, 0xcc, 0x03, 0x43, 0x7a, 0x56, 0xa8, 0x42, + 0x9e, 0x52, 0xd5, 0x42, 0x75, 0x8a, 0x09, 0x43, 0x75, 0x17, 0xfc, 0x42, 0x57, 0x17, 0xfe, 0x42, + 0x98, 0x84, 0x05, 0x43, 0xf0, 0x43, 0x19, 0x43, 0xe4, 0xc1, 0x27, 0x43, 0x40, 0xd8, 0x11, 0x43, + 0x47, 0x72, 0x18, 0x43, 0x86, 0xcb, 0xea, 0x42, 0x55, 0x31, 0x05, 0x43, 0xac, 0xf4, 0xfa, 0x42, + 0xa0, 0x09, 0x06, 0x43, 0x6d, 0x81, 0xc6, 0x42, 0x98, 0x56, 0xca, 0x42, 0xdb, 0x4b, 0x10, 0x43, + 0x0e, 0xa3, 0xf4, 0x42, 0x1c, 0x0d, 0x00, 0x43, 0x68, 0xb6, 0x05, 0x43, 0x71, 0xc2, 0x08, 0x43, + 0x09, 0xf1, 0x2b, 0x43, 0x0d, 0x1f, 0x10, 0x43, 0x46, 0x21, 0x0a, 0x43, 0x08, 0x5c, 0xea, 0x42, + 0xe3, 0x2b, 0xf8, 0x42, 0x3c, 0x26, 0x04, 0x43, 0xd4, 0x43, 0x04, 0x43, 0xba, 0x6a, 0xce, 0x42, + 0x64, 
0xd2, 0xc2, 0x42, 0x96, 0xde, 0x14, 0x43, 0x81, 0xee, 0x01, 0x43, 0x48, 0xe2, 0xf2, 0x42, + 0xd6, 0x50, 0x12, 0x43, 0xc1, 0x08, 0x0a, 0x43, 0xc1, 0x63, 0x1e, 0x43, 0x98, 0xe2, 0x06, 0x43, + 0x03, 0x86, 0xee, 0x42, 0xf6, 0x4e, 0xff, 0x42, 0x84, 0x5e, 0xf7, 0x42, 0xc6, 0x54, 0xfe, 0x42, + 0x16, 0xde, 0x19, 0x43, 0x00, 0x73, 0xc5, 0x42, 0x58, 0xab, 0xb0, 0x42, 0x19, 0x32, 0x20, 0x43, + 0x64, 0xa9, 0x1c, 0x43, 0xd8, 0xcb, 0x1e, 0x43, 0x58, 0x6e, 0x1c, 0x43, 0x1e, 0x82, 0x21, 0x43, + 0xdf, 0x4e, 0x1e, 0x43, 0xea, 0x0d, 0x1e, 0x43, 0x48, 0x71, 0x13, 0x43, 0x02, 0xb8, 0xfb, 0x42, + 0xa8, 0xaa, 0xfd, 0x42, 0x25, 0x6d, 0x1a, 0x43, 0xc0, 0xb9, 0x28, 0x43, 0x27, 0xd9, 0xc6, 0x42, + 0xca, 0x69, 0xb3, 0x42, 0x1a, 0xa5, 0x19, 0x43, 0x64, 0xa7, 0x17, 0x43, 0xe0, 0xcf, 0x0c, 0x43, + 0x45, 0xb3, 0xfc, 0x42, 0xbe, 0x6c, 0x0d, 0x43, 0x24, 0xcf, 0x11, 0x43, 0xfe, 0x89, 0x1a, 0x43, + 0xf6, 0x27, 0x13, 0x43, 0xbb, 0xd7, 0x06, 0x43, 0x3c, 0xc5, 0x1c, 0x43, 0xa4, 0x8c, 0x1a, 0x43, + 0x60, 0x6c, 0x2e, 0x43, 0x5a, 0x77, 0xdd, 0x42, 0x8d, 0x46, 0x9e, 0x42, 0xe8, 0xd5, 0xfa, 0x42, + 0x81, 0x60, 0xe8, 0x42, 0x25, 0xa3, 0x04, 0x43, 0xbc, 0x0f, 0xf9, 0x42, 0x74, 0x4f, 0x04, 0x43, + 0xf1, 0x3c, 0x03, 0x43, 0x56, 0xe8, 0x16, 0x43, 0xcc, 0x1c, 0x10, 0x43, 0xb5, 0xb0, 0x1c, 0x43, + 0x8e, 0x8e, 0x19, 0x43, 0x28, 0xd0, 0x32, 0x43, 0x30, 0x71, 0x19, 0x43, 0xb7, 0xf4, 0xbe, 0x42, + 0x67, 0x0f, 0x99, 0x42, 0x23, 0x3b, 0xeb, 0x42, 0xd8, 0x80, 0xec, 0x42, 0x85, 0xb6, 0xdf, 0x42, + 0x4b, 0x7d, 0xf9, 0x42, 0x21, 0x00, 0xde, 0x42, 0xe4, 0x7f, 0xfb, 0x42, 0x01, 0xc9, 0x17, 0x43, + 0x5c, 0x6f, 0x1d, 0x43, 0xfc, 0x28, 0x32, 0x43, 0x47, 0xc3, 0x1d, 0x43, 0xc4, 0xdb, 0x0f, 0x43, + 0x16, 0x01, 0x06, 0x43, 0xfa, 0x3f, 0xa3, 0x42, 0xe2, 0x2d, 0x6d, 0x42, 0x83, 0x79, 0x94, 0x42, + 0xc2, 0x7f, 0x96, 0x42, 0xf1, 0x10, 0xa1, 0x42, 0x9b, 0xea, 0xa0, 0x42, 0xb4, 0x79, 0x97, 0x42, + 0x2c, 0xf8, 0xa1, 0x42, 0xac, 0x97, 0xd0, 0x42, 0x2e, 0xba, 0xdb, 0x42, 0xb6, 0x0b, 0xfc, 0x42, + 0xd6, 0x52, 0xd2, 0x42, 0x0c, 0xfd, 0xb2, 
0x42, 0x6c, 0xa5, 0x83, 0x42, 0x65, 0x4b, 0x69, 0x42, + 0xe1, 0x3f, 0x7a, 0x42, 0x59, 0x6c, 0xbf, 0x42, 0x1c, 0xd6, 0x9c, 0x42, 0x13, 0x33, 0xb5, 0x42, + 0xbc, 0x23, 0xe1, 0x42, 0x31, 0x9f, 0xbf, 0x42, 0x7a, 0x37, 0x03, 0x43, 0xd6, 0xb9, 0xd1, 0x42, + 0xfb, 0x0f, 0xed, 0x42, 0x43, 0x14, 0xc0, 0x42, 0x8d, 0xb0, 0xde, 0x42, 0xdf, 0x7f, 0xc9, 0x42, + 0x6f, 0x4e, 0xf5, 0x42, 0x10, 0xb4, 0x68, 0x42, 0xb5, 0x8f, 0xe9, 0x42, 0x0f, 0x35, 0xf9, 0x42, + 0xf0, 0xd9, 0xbc, 0x42, 0xd3, 0x00, 0x03, 0x43, 0xf8, 0x67, 0x0a, 0x43, 0x2e, 0xa5, 0x07, 0x43, + 0x20, 0x2c, 0x2c, 0x43, 0x9c, 0x88, 0x20, 0x43, 0xf2, 0xfb, 0x27, 0x43, 0x9c, 0x95, 0x0a, 0x43, + 0xaa, 0xbb, 0x1f, 0x43, 0x5a, 0xe4, 0x17, 0x43, 0x9a, 0x18, 0x13, 0x43, 0x29, 0xd3, 0xb6, 0x42, + 0xb8, 0xed, 0xbe, 0x42, 0xb0, 0x31, 0xff, 0x42, 0xcb, 0x76, 0xf5, 0x42, 0x82, 0x45, 0x15, 0x43, + 0x6a, 0xd2, 0x18, 0x43, 0x6a, 0xe0, 0x14, 0x43, 0xb6, 0xe4, 0x3a, 0x43, 0x3a, 0x8b, 0x28, 0x43, + 0x5c, 0x85, 0x33, 0x43, 0x6c, 0x5d, 0x2a, 0x43, 0x6c, 0x7a, 0x1e, 0x43, 0x7a, 0x63, 0x22, 0x43, + 0x10, 0x9d, 0x22, 0x43, 0x1b, 0x21, 0xe5, 0x42, 0xe8, 0xfd, 0xde, 0x42, 0xb5, 0xec, 0xfb, 0x42, + 0x31, 0x8a, 0xdc, 0x42, 0xe4, 0x1a, 0x05, 0x43, 0xbe, 0x56, 0x01, 0x43, 0xbe, 0x10, 0x13, 0x43, + 0x14, 0xef, 0x31, 0x43, 0x48, 0xf0, 0x26, 0x43, 0xac, 0x62, 0x43, 0x43, 0xd2, 0x8f, 0x23, 0x43, + 0x8a, 0x5e, 0x1a, 0x43, 0xa0, 0x5d, 0x1d, 0x43, 0xa0, 0x9b, 0x0f, 0x43, 0x20, 0x4a, 0xd9, 0x42, + 0x19, 0x1c, 0xbb, 0x42, 0x02, 0xc3, 0x05, 0x43, 0x96, 0xe1, 0x12, 0x43, 0x4a, 0x5e, 0x06, 0x43, + 0x8e, 0x0b, 0x17, 0x43, 0x4c, 0xb0, 0x27, 0x43, 0xd0, 0x6e, 0x3f, 0x43, 0xb0, 0x07, 0x3c, 0x43, + 0x36, 0xfe, 0x45, 0x43, 0x5a, 0x42, 0x2e, 0x43, 0xea, 0x02, 0x25, 0x43, 0xaa, 0x46, 0x10, 0x43, + 0x52, 0xa2, 0x15, 0x43, 0x2e, 0xd2, 0xab, 0x42, 0xed, 0xa2, 0xcd, 0x42, 0x58, 0x5d, 0x14, 0x43, + 0xa2, 0x6c, 0x07, 0x43, 0x68, 0xfd, 0x18, 0x43, 0x42, 0x0b, 0x15, 0x43, 0xc0, 0x6f, 0x26, 0x43, + 0x94, 0xb5, 0x4a, 0x43, 0x4e, 0xd8, 0x4f, 0x43, 0xc8, 0x9b, 0x3c, 0x43, 0x96, 
0x73, 0x2a, 0x43, + 0xe4, 0xab, 0x0c, 0x43, 0x3b, 0x9e, 0xf5, 0x42, 0xb0, 0x32, 0x0c, 0x43, 0x2d, 0x40, 0xcf, 0x42, + 0xdf, 0x27, 0xd2, 0x42, 0x2e, 0x88, 0x1c, 0x43, 0xb0, 0xeb, 0x12, 0x43, 0x32, 0xa2, 0x0d, 0x43, + 0x0a, 0xdf, 0x02, 0x43, 0x6e, 0x9c, 0x2c, 0x43, 0x84, 0xf5, 0x40, 0x43, 0xf0, 0x02, 0x30, 0x43, + 0x10, 0x90, 0x28, 0x43, 0xe0, 0xc6, 0x03, 0x43, 0x9a, 0x4a, 0xfd, 0x42, 0x57, 0x6b, 0x0e, 0x43, + 0x4a, 0xb9, 0x14, 0x43, 0x8a, 0x3b, 0xcc, 0x42, 0xc1, 0x8e, 0xc6, 0x42, 0x20, 0xa5, 0x23, 0x43, + 0xf8, 0x72, 0x11, 0x43, 0x2a, 0x55, 0x0a, 0x43, 0xda, 0xfa, 0x1a, 0x43, 0xf8, 0xfa, 0x1f, 0x43, + 0x98, 0x66, 0x2c, 0x43, 0x94, 0xf9, 0x14, 0x43, 0xde, 0x7e, 0x12, 0x43, 0x2c, 0x09, 0x00, 0x43, + 0x9d, 0x8b, 0xfc, 0x42, 0xa8, 0x33, 0x21, 0x43, 0xbc, 0x1e, 0x18, 0x43, 0x39, 0xe4, 0xe2, 0x42, + 0xf1, 0xa2, 0xdb, 0x42, 0xb6, 0x59, 0x25, 0x43, 0xce, 0x1a, 0x19, 0x43, 0x98, 0xa5, 0x0d, 0x43, + 0x46, 0x00, 0x15, 0x43, 0xfe, 0x60, 0x29, 0x43, 0xca, 0xe4, 0x20, 0x43, 0x9a, 0x55, 0x1f, 0x43, + 0xc0, 0x08, 0x17, 0x43, 0xfc, 0xdf, 0x0e, 0x43, 0x1b, 0x68, 0x05, 0x43, 0xb2, 0xa4, 0x05, 0x43, + 0xa8, 0x1a, 0x17, 0x43, 0x7b, 0x8d, 0xdb, 0x42, 0xff, 0xd6, 0xe0, 0x42, 0xde, 0x18, 0x1b, 0x43, + 0xae, 0xa5, 0x24, 0x43, 0x84, 0x65, 0x2b, 0x43, 0x9c, 0xa0, 0x2b, 0x43, 0x8c, 0x2f, 0x34, 0x43, + 0x96, 0xe9, 0x24, 0x43, 0x14, 0xbb, 0x3a, 0x43, 0x16, 0x17, 0x1a, 0x43, 0x10, 0xea, 0x06, 0x43, + 0x48, 0xe0, 0x0c, 0x43, 0xe2, 0xd6, 0x1d, 0x43, 0xc4, 0x66, 0x3a, 0x43, 0x37, 0xe4, 0xe4, 0x42, + 0x6a, 0xda, 0xc7, 0x42, 0x02, 0x0e, 0x27, 0x43, 0x40, 0x04, 0x18, 0x43, 0xb8, 0x61, 0x29, 0x43, + 0x9c, 0x9c, 0x0b, 0x43, 0x98, 0xb9, 0x12, 0x43, 0x76, 0x90, 0x22, 0x43, 0xe6, 0x16, 0x27, 0x43, + 0xaa, 0x13, 0x1c, 0x43, 0xf0, 0x33, 0x23, 0x43, 0xd0, 0x45, 0x31, 0x43, 0x18, 0xe3, 0x38, 0x43, + 0x20, 0x7b, 0x3f, 0x43, 0xe9, 0xb7, 0xe6, 0x42, 0x97, 0x1c, 0xc0, 0x42, 0x7f, 0x5b, 0x11, 0x43, + 0x24, 0x17, 0xff, 0x42, 0xf4, 0x04, 0x1b, 0x43, 0xfa, 0xc2, 0x0b, 0x43, 0x02, 0xf7, 0x0a, 0x43, + 0xb8, 0x9a, 0x17, 
0x43, 0x8e, 0x15, 0x28, 0x43, 0xd0, 0x45, 0x2e, 0x43, 0xac, 0x1d, 0x2a, 0x43, + 0x80, 0x82, 0x2d, 0x43, 0x0e, 0x65, 0x42, 0x43, 0xbe, 0x63, 0x1c, 0x43, 0x78, 0x4c, 0xdd, 0x42, + 0xea, 0x8f, 0xa9, 0x42, 0xfd, 0x2b, 0xfb, 0x42, 0x73, 0x23, 0xf5, 0x42, 0xc0, 0xbd, 0x06, 0x43, + 0x30, 0x12, 0xfe, 0x42, 0x04, 0x8c, 0x09, 0x43, 0x1a, 0x72, 0x09, 0x43, 0x30, 0x6d, 0x26, 0x43, + 0xec, 0x79, 0x33, 0x43, 0x1c, 0x9e, 0x4b, 0x43, 0xac, 0xcf, 0x25, 0x43, 0xa4, 0x4b, 0x1a, 0x43, + 0xf0, 0x0d, 0x03, 0x43, 0xd1, 0x08, 0xbe, 0x42, 0x05, 0x5e, 0x85, 0x42, 0x7b, 0xe3, 0xb3, 0x42, + 0x95, 0xdc, 0xb0, 0x42, 0x03, 0x35, 0xbb, 0x42, 0x8e, 0x2b, 0xcc, 0x42, 0x0a, 0xdc, 0xd2, 0x42, + 0x3b, 0xd8, 0xc2, 0x42, 0x62, 0xef, 0xf1, 0x42, 0x9f, 0x54, 0xea, 0x42, 0x58, 0x1e, 0x0c, 0x43, + 0xba, 0x43, 0xd6, 0x42, 0x9e, 0xa3, 0xd4, 0x42, 0x8d, 0xb0, 0xa8, 0x42, 0x6b, 0xd7, 0x84, 0x42, + 0xde, 0xe2, 0x4b, 0x42, 0x1e, 0x3e, 0x99, 0x42, 0xa7, 0x7e, 0x93, 0x42, 0x28, 0x5f, 0xd2, 0x42, + 0x98, 0x53, 0xdf, 0x42, 0x52, 0x91, 0xd4, 0x42, 0xb6, 0x76, 0xd9, 0x42, 0x82, 0x53, 0xe4, 0x42, + 0x5a, 0xf1, 0xca, 0x42, 0x6a, 0x8d, 0xa7, 0x42, 0x86, 0x4d, 0xc1, 0x42, 0x50, 0x34, 0xd2, 0x42, + 0xe2, 0x53, 0xaa, 0x42, 0x3e, 0xa7, 0x6d, 0x42, 0x36, 0xc4, 0xcd, 0x42, 0x58, 0x28, 0xce, 0x42, + 0x12, 0xb9, 0xca, 0x42, 0xdf, 0xb4, 0x00, 0x43, 0x57, 0xa2, 0x12, 0x43, 0x4f, 0xa9, 0x13, 0x43, + 0x1a, 0x74, 0x25, 0x43, 0xe5, 0xa9, 0x3d, 0x43, 0x66, 0x7b, 0x44, 0x43, 0x1e, 0xbd, 0x07, 0x43, + 0x97, 0xfc, 0x20, 0x43, 0x27, 0xd6, 0x24, 0x43, 0xbc, 0xc5, 0x23, 0x43, 0x82, 0x03, 0xc2, 0x42, + 0x28, 0x4e, 0xe9, 0x42, 0xf4, 0xab, 0xea, 0x42, 0x58, 0xb6, 0xbf, 0x42, 0xfc, 0xa4, 0xf5, 0x42, + 0x26, 0x8a, 0x25, 0x43, 0x0d, 0xd5, 0x0e, 0x43, 0xc0, 0xd6, 0x3b, 0x43, 0xed, 0x5a, 0x39, 0x43, + 0x86, 0x54, 0x39, 0x43, 0x82, 0x6a, 0x12, 0x43, 0x2a, 0xb5, 0x22, 0x43, 0x4a, 0x7e, 0x23, 0x43, + 0xc0, 0x1b, 0x29, 0x43, 0xb8, 0x23, 0xe0, 0x42, 0x7a, 0x0e, 0xcc, 0x42, 0x36, 0xcf, 0x13, 0x43, + 0xf0, 0x80, 0x04, 0x43, 0x58, 0xd9, 0xfc, 0x42, 0xf6, 
0xfe, 0x0e, 0x43, 0x23, 0x9f, 0x1d, 0x43, + 0x55, 0x6d, 0x27, 0x43, 0xcc, 0xa1, 0x46, 0x43, 0x60, 0x15, 0x3a, 0x43, 0x3c, 0x48, 0x28, 0x43, + 0xd2, 0xc9, 0x23, 0x43, 0xce, 0x45, 0x2f, 0x43, 0xe2, 0x4c, 0x26, 0x43, 0x2a, 0xce, 0xd9, 0x42, + 0x58, 0x8b, 0xe3, 0x42, 0x58, 0x5f, 0xfe, 0x42, 0x10, 0x99, 0x0a, 0x43, 0xf7, 0x2a, 0x08, 0x43, + 0xd1, 0x73, 0x1e, 0x43, 0x60, 0xf6, 0x33, 0x43, 0xf1, 0x15, 0x30, 0x43, 0x43, 0x73, 0x47, 0x43, + 0x1b, 0x43, 0x38, 0x43, 0x1f, 0x86, 0x20, 0x43, 0xaf, 0x93, 0x15, 0x43, 0x58, 0xc0, 0x22, 0x43, + 0x06, 0x8b, 0x08, 0x43, 0xda, 0x45, 0xc3, 0x42, 0x72, 0x8c, 0xf3, 0x42, 0x3f, 0x76, 0x2e, 0x43, + 0x2f, 0x7f, 0x10, 0x43, 0x7d, 0xbf, 0x19, 0x43, 0x7c, 0x17, 0x17, 0x43, 0xb4, 0x29, 0x47, 0x43, + 0xe0, 0x5e, 0x55, 0x43, 0xd6, 0xa5, 0x4f, 0x43, 0xce, 0x52, 0x58, 0x43, 0x11, 0xb4, 0x1d, 0x43, + 0x88, 0x41, 0x12, 0x43, 0x9e, 0x67, 0x0b, 0x43, 0xd5, 0xee, 0x11, 0x43, 0x78, 0xea, 0xd2, 0x42, + 0xac, 0x5d, 0xc6, 0x42, 0xc6, 0x1e, 0x24, 0x43, 0x1e, 0xad, 0x17, 0x43, 0x46, 0x47, 0x06, 0x43, + 0x09, 0x0a, 0x18, 0x43, 0x43, 0x85, 0x3a, 0x43, 0x7c, 0xfe, 0x3f, 0x43, 0xc6, 0x58, 0x36, 0x43, + 0x70, 0x11, 0x30, 0x43, 0x00, 0x37, 0xf7, 0x42, 0xec, 0x34, 0x06, 0x43, 0x81, 0xc5, 0x0a, 0x43, + 0x56, 0x86, 0x1f, 0x43, 0x02, 0xf3, 0xee, 0x42, 0x1a, 0xf9, 0xee, 0x42, 0xd0, 0x32, 0x1c, 0x43, + 0xd2, 0xa8, 0x02, 0x43, 0xb7, 0x09, 0x09, 0x43, 0x54, 0x5e, 0x1f, 0x43, 0x02, 0x66, 0x2b, 0x43, + 0x5e, 0xb6, 0x42, 0x43, 0x76, 0x34, 0x23, 0x43, 0x2c, 0x69, 0x1b, 0x43, 0xae, 0xce, 0x0b, 0x43, + 0x36, 0xfd, 0xe9, 0x42, 0x9b, 0x59, 0x07, 0x43, 0x7e, 0x19, 0x1c, 0x43, 0x08, 0xea, 0xfc, 0x42, + 0x5e, 0x3f, 0xdd, 0x42, 0x1d, 0x9b, 0x22, 0x43, 0xe8, 0xfc, 0x20, 0x43, 0xeb, 0xaf, 0x19, 0x43, + 0xfb, 0x23, 0x28, 0x43, 0x79, 0x8b, 0x2f, 0x43, 0x5a, 0xd6, 0x22, 0x43, 0xb8, 0x21, 0x29, 0x43, + 0x13, 0x94, 0x15, 0x43, 0x15, 0x5c, 0x04, 0x43, 0x97, 0x2e, 0x11, 0x43, 0x2e, 0xe1, 0x11, 0x43, + 0x72, 0x05, 0x2c, 0x43, 0x12, 0xde, 0xf4, 0x42, 0xca, 0x5a, 0xcf, 0x42, 0x94, 0x19, 0x3b, 
0x43, + 0x67, 0x2e, 0x1d, 0x43, 0xa1, 0x30, 0x1b, 0x43, 0xb7, 0xc9, 0x22, 0x43, 0xca, 0x8b, 0x35, 0x43, + 0x3d, 0x4f, 0x2b, 0x43, 0x72, 0x5f, 0x34, 0x43, 0x72, 0x71, 0x2d, 0x43, 0x05, 0xec, 0x18, 0x43, + 0x1c, 0x64, 0x1d, 0x43, 0x17, 0x42, 0x17, 0x43, 0x72, 0x3f, 0x2b, 0x43, 0xc6, 0x09, 0x0d, 0x43, + 0x78, 0xf5, 0xe1, 0x42, 0xe0, 0xae, 0x20, 0x43, 0x12, 0x35, 0x2a, 0x43, 0xa0, 0x21, 0x41, 0x43, + 0x0b, 0x8a, 0x1c, 0x43, 0xdf, 0xd8, 0x13, 0x43, 0x2a, 0x9d, 0x20, 0x43, 0x04, 0xa8, 0x2e, 0x43, + 0xe1, 0x5f, 0x28, 0x43, 0x4a, 0xf3, 0x16, 0x43, 0x31, 0x5d, 0x2c, 0x43, 0xe6, 0x4d, 0x3b, 0x43, + 0x06, 0x91, 0x2c, 0x43, 0x04, 0xd7, 0xfe, 0x42, 0xba, 0xf8, 0xa7, 0x42, 0xe4, 0x72, 0x0d, 0x43, + 0x21, 0x8d, 0x0f, 0x43, 0xa4, 0x09, 0x21, 0x43, 0x9f, 0x6e, 0x0f, 0x43, 0xbc, 0xac, 0x0e, 0x43, + 0xbe, 0x5d, 0x1b, 0x43, 0xf5, 0xc6, 0x1e, 0x43, 0xca, 0x01, 0x2e, 0x43, 0xe7, 0x60, 0x2c, 0x43, + 0xd2, 0x74, 0x36, 0x43, 0x74, 0xca, 0x41, 0x43, 0x4e, 0x0a, 0x2c, 0x43, 0x28, 0x39, 0xb1, 0x42, + 0x46, 0x1f, 0xaa, 0x42, 0x1a, 0xc1, 0xed, 0x42, 0x4a, 0x9c, 0x00, 0x43, 0xb0, 0x02, 0x0e, 0x43, + 0x08, 0x4e, 0xf3, 0x42, 0x42, 0xb7, 0xfc, 0x42, 0xc7, 0x6f, 0x1c, 0x43, 0x5d, 0xda, 0x31, 0x43, + 0xc6, 0xe6, 0x27, 0x43, 0x0a, 0x88, 0x41, 0x43, 0x52, 0x92, 0x37, 0x43, 0x74, 0xf5, 0x30, 0x43, + 0x52, 0xba, 0x0f, 0x43, 0xcc, 0x93, 0xd8, 0x42, 0x4c, 0xd6, 0x94, 0x42, 0xc4, 0x73, 0x89, 0x42, + 0xe2, 0x7c, 0xad, 0x42, 0xf8, 0x99, 0xc9, 0x42, 0x96, 0xe8, 0xdc, 0x42, 0xc6, 0xaf, 0xb9, 0x42, + 0xf6, 0x6f, 0x95, 0x42, 0x4e, 0xda, 0xf0, 0x42, 0x1b, 0x91, 0x0b, 0x43, 0x79, 0x6b, 0x0c, 0x43, + 0x5c, 0xc4, 0xea, 0x42, 0x4c, 0x44, 0xbe, 0x42, 0x48, 0x19, 0xa9, 0x42, 0xdd, 0x92, 0x51, 0x42, + 0xb2, 0x13, 0x6d, 0x42, 0xd6, 0x6a, 0x98, 0x42, 0x65, 0x83, 0x8e, 0x42, 0x31, 0x08, 0x93, 0x42, + 0x7c, 0x98, 0xbc, 0x42, 0x88, 0x63, 0xbc, 0x42, 0x65, 0x26, 0xd5, 0x42, 0x90, 0xb9, 0xcd, 0x42, + 0x08, 0x86, 0xaf, 0x42, 0x05, 0x15, 0x93, 0x42, 0x86, 0xc6, 0xc7, 0x42, 0x96, 0x1b, 0xac, 0x42, + 0x8c, 0xaa, 0xc5, 0x42, 0xa8, 
0xb0, 0x5b, 0x42, 0xc7, 0x70, 0xac, 0x42, 0xac, 0x19, 0xef, 0x42, + 0xac, 0xd8, 0xd2, 0x42, 0x03, 0x6d, 0x07, 0x43, 0x1a, 0x11, 0x16, 0x43, 0xe2, 0x8b, 0x14, 0x43, + 0xa0, 0x84, 0x30, 0x43, 0xac, 0xec, 0x22, 0x43, 0xbf, 0x23, 0x27, 0x43, 0x40, 0xb5, 0xf4, 0x42, + 0x62, 0x2c, 0x15, 0x43, 0x26, 0x41, 0x17, 0x43, 0x2e, 0x1d, 0x1f, 0x43, 0x34, 0x7d, 0x9b, 0x42, + 0x5e, 0x56, 0xd9, 0x42, 0x1e, 0xca, 0xd7, 0x42, 0x9d, 0xab, 0xd7, 0x42, 0x19, 0xaa, 0x06, 0x43, + 0xf1, 0xca, 0x07, 0x43, 0xb1, 0x86, 0x11, 0x43, 0xd5, 0xf5, 0x35, 0x43, 0x90, 0xae, 0x30, 0x43, + 0x8c, 0x4a, 0x2a, 0x43, 0x50, 0xa3, 0x0f, 0x43, 0x7c, 0x6e, 0x17, 0x43, 0xd2, 0xfe, 0x24, 0x43, + 0x74, 0x80, 0x1d, 0x43, 0x74, 0x30, 0xd1, 0x42, 0xda, 0x22, 0xc9, 0x42, 0x58, 0x48, 0xfa, 0x42, + 0x4d, 0x77, 0xc6, 0x42, 0x64, 0xce, 0x0c, 0x43, 0xaf, 0x03, 0x17, 0x43, 0x5b, 0x88, 0x0b, 0x43, + 0xaf, 0x6d, 0x3c, 0x43, 0x55, 0xb1, 0x27, 0x43, 0x62, 0x4f, 0x31, 0x43, 0xdc, 0x4e, 0x22, 0x43, + 0x1a, 0x95, 0x1a, 0x43, 0x1c, 0x9e, 0x23, 0x43, 0xda, 0x91, 0x12, 0x43, 0x0a, 0x8e, 0xdc, 0x42, + 0x42, 0xfc, 0xb5, 0x42, 0xf9, 0x91, 0xf7, 0x42, 0xf9, 0x19, 0xf7, 0x42, 0xf3, 0x07, 0x09, 0x43, + 0x09, 0x88, 0x0f, 0x43, 0xea, 0xa2, 0x22, 0x43, 0xb8, 0x65, 0x1f, 0x43, 0xdb, 0xbb, 0x3f, 0x43, + 0xf3, 0x0f, 0x2d, 0x43, 0xf2, 0x99, 0x1c, 0x43, 0xd0, 0xc8, 0x1c, 0x43, 0x8b, 0xd3, 0x04, 0x43, + 0x38, 0x8b, 0x07, 0x43, 0x9e, 0x73, 0x9a, 0x42, 0x97, 0xe3, 0xd0, 0x42, 0xf8, 0xe2, 0x0e, 0x43, + 0x33, 0xeb, 0x04, 0x43, 0x61, 0x16, 0x0b, 0x43, 0x86, 0x59, 0x05, 0x43, 0x85, 0xd0, 0x1b, 0x43, + 0x9b, 0x56, 0x3f, 0x43, 0x34, 0x66, 0x43, 0x43, 0xaa, 0xf8, 0x49, 0x43, 0xe9, 0xa0, 0x1c, 0x43, + 0xed, 0xa6, 0x02, 0x43, 0x38, 0x92, 0xfd, 0x42, 0xc2, 0x98, 0x13, 0x43, 0x55, 0x05, 0xc7, 0x42, + 0x10, 0x44, 0xe0, 0x42, 0x0c, 0xa2, 0x1f, 0x43, 0x3e, 0x2d, 0x07, 0x43, 0x24, 0xae, 0x10, 0x43, + 0x22, 0x02, 0x1b, 0x43, 0x01, 0xaf, 0x24, 0x43, 0x50, 0x77, 0x4c, 0x43, 0x3f, 0x08, 0x33, 0x43, + 0x83, 0xd2, 0x11, 0x43, 0x5e, 0xc0, 0x01, 0x43, 0xfa, 0x51, 0xe8, 
0x42, 0x28, 0xcc, 0x01, 0x43, + 0xbc, 0x87, 0x17, 0x43, 0x98, 0x72, 0xb9, 0x42, 0x30, 0xda, 0xd7, 0x42, 0x50, 0x31, 0x16, 0x43, + 0x8e, 0xb6, 0x09, 0x43, 0xc9, 0xba, 0x12, 0x43, 0x37, 0x7b, 0x1a, 0x43, 0x07, 0xe9, 0x24, 0x43, + 0xae, 0x60, 0x1f, 0x43, 0x54, 0xd8, 0x1f, 0x43, 0x9c, 0xf8, 0x0b, 0x43, 0xd1, 0xc1, 0xe7, 0x42, + 0xce, 0xa8, 0xe8, 0x42, 0x3c, 0x87, 0x08, 0x43, 0x24, 0xce, 0x17, 0x43, 0xc9, 0xfb, 0xdc, 0x42, + 0x48, 0xb2, 0xdb, 0x42, 0xad, 0x32, 0x1d, 0x43, 0x66, 0x5c, 0x11, 0x43, 0xfd, 0x61, 0x02, 0x43, + 0xac, 0x2b, 0x15, 0x43, 0x19, 0x8a, 0x1d, 0x43, 0x97, 0x4e, 0x23, 0x43, 0xb0, 0x0d, 0x20, 0x43, + 0xa4, 0x22, 0x07, 0x43, 0x56, 0x9c, 0xfe, 0x42, 0xeb, 0x67, 0x03, 0x43, 0x24, 0xa6, 0x0a, 0x43, + 0x18, 0x8c, 0x1f, 0x43, 0x6c, 0x6b, 0xcd, 0x42, 0xd4, 0x5d, 0xd1, 0x42, 0x38, 0x8a, 0x2e, 0x43, + 0xa4, 0xf0, 0x25, 0x43, 0xa8, 0x11, 0x21, 0x43, 0x23, 0x07, 0x29, 0x43, 0x42, 0xd7, 0x2f, 0x43, + 0xd1, 0x58, 0x20, 0x43, 0xb9, 0x00, 0x26, 0x43, 0x1d, 0xe4, 0x18, 0x43, 0x79, 0x6a, 0x0b, 0x43, + 0xf6, 0x6e, 0x0c, 0x43, 0x65, 0x9a, 0x12, 0x43, 0x3e, 0xe5, 0x2c, 0x43, 0x42, 0x17, 0xf9, 0x42, + 0x31, 0xc0, 0xd4, 0x42, 0x86, 0xeb, 0x27, 0x43, 0x60, 0x37, 0x28, 0x43, 0xfc, 0xae, 0x28, 0x43, + 0x66, 0xbb, 0x07, 0x43, 0x76, 0x2f, 0x1f, 0x43, 0xcd, 0x3b, 0x11, 0x43, 0xfe, 0xaa, 0x2f, 0x43, + 0xad, 0xf9, 0x08, 0x43, 0x1f, 0x6c, 0x13, 0x43, 0xd1, 0x14, 0x25, 0x43, 0x0e, 0x63, 0x33, 0x43, + 0x06, 0xa7, 0x33, 0x43, 0xa2, 0x74, 0xf7, 0x42, 0x80, 0xd2, 0xaf, 0x42, 0xa2, 0x42, 0x0e, 0x43, + 0xf1, 0x57, 0x0c, 0x43, 0x70, 0x43, 0x0f, 0x43, 0x7f, 0xe2, 0xef, 0x42, 0xcc, 0x11, 0x05, 0x43, + 0x67, 0xaa, 0x15, 0x43, 0x20, 0xfd, 0x1d, 0x43, 0x89, 0xfd, 0x25, 0x43, 0x14, 0xa5, 0x22, 0x43, + 0xea, 0x28, 0x30, 0x43, 0x78, 0xec, 0x40, 0x43, 0x34, 0xc3, 0x21, 0x43, 0x88, 0xd9, 0xcd, 0x42, + 0xda, 0xb0, 0xa9, 0x42, 0x16, 0x3b, 0xe1, 0x42, 0xf8, 0x5c, 0x05, 0x43, 0x2f, 0x39, 0xf7, 0x42, + 0xae, 0x31, 0xf0, 0x42, 0x9a, 0xbd, 0xf2, 0x42, 0x04, 0xb2, 0x0a, 0x43, 0x69, 0xb0, 0x1e, 0x43, + 0xdf, 
0xc4, 0x30, 0x43, 0x8c, 0x7f, 0x35, 0x43, 0x79, 0x5a, 0x2c, 0x43, 0x40, 0x43, 0x1b, 0x43, + 0x12, 0xf9, 0xed, 0x42, 0xcb, 0xde, 0xa6, 0x42, 0xa4, 0x2c, 0x82, 0x42, 0xfc, 0xfe, 0x99, 0x42, + 0xd0, 0x83, 0xaa, 0x42, 0xf4, 0xc4, 0xb7, 0x42, 0x8f, 0xb3, 0xb1, 0x42, 0xd6, 0x0c, 0xb9, 0x42, + 0x6a, 0x1a, 0xc4, 0x42, 0x56, 0x75, 0xe0, 0x42, 0x94, 0x2b, 0xf7, 0x42, 0xe0, 0xeb, 0x08, 0x43, + 0xf3, 0xf5, 0xd0, 0x42, 0xc6, 0x78, 0xc6, 0x42, 0x2c, 0xf4, 0xa0, 0x42, 0x7a, 0x33, 0x5d, 0x42, + 0xee, 0xf4, 0x13, 0x42, 0x30, 0xb3, 0x66, 0x42, 0x3e, 0x45, 0x61, 0x42, 0xf4, 0x84, 0x7f, 0x42, + 0xe1, 0x9a, 0x8c, 0x42, 0x8d, 0x34, 0x99, 0x42, 0x5e, 0x82, 0xa5, 0x42, 0x3c, 0x22, 0xbf, 0x42, + 0x1b, 0xaf, 0x9f, 0x42, 0xd2, 0xc8, 0x9b, 0x42, 0x63, 0x54, 0x90, 0x42, 0x52, 0x0c, 0x9b, 0x42, + 0x56, 0x22, 0xb4, 0x42, 0x66, 0x13, 0x1b, 0x42, 0xf8, 0xde, 0x9c, 0x42, 0x68, 0x3a, 0xc9, 0x42, + 0xba, 0x72, 0xb4, 0x42, 0xb5, 0x35, 0xb9, 0x42, 0xd5, 0x9a, 0xe9, 0x42, 0x19, 0xe7, 0xd2, 0x42, + 0x11, 0xd2, 0x11, 0x43, 0x29, 0xd3, 0xef, 0x42, 0xb4, 0x54, 0x10, 0x43, 0xdc, 0x52, 0xc2, 0x42, + 0x76, 0xcd, 0xdc, 0x42, 0xcb, 0x23, 0x0e, 0x43, 0xc6, 0x9f, 0xfb, 0x42, 0x42, 0xce, 0x96, 0x42, + 0x8c, 0xaa, 0xa0, 0x42, 0x2a, 0x2b, 0xed, 0x42, 0xfb, 0x73, 0xdf, 0x42, 0x26, 0x9a, 0xde, 0x42, + 0x57, 0xee, 0x0e, 0x43, 0xcb, 0xf6, 0x0c, 0x43, 0xa1, 0x8e, 0x11, 0x43, 0xe6, 0x30, 0x0c, 0x43, + 0x6b, 0x76, 0x18, 0x43, 0x28, 0xb9, 0xfe, 0x42, 0x69, 0xb6, 0x13, 0x43, 0xa4, 0xa7, 0x10, 0x43, + 0xc3, 0x30, 0x10, 0x43, 0x89, 0xc7, 0xde, 0x42, 0x3a, 0x2d, 0xc4, 0x42, 0xef, 0x50, 0xce, 0x42, + 0x66, 0xc9, 0x9c, 0x42, 0xd5, 0x94, 0xe3, 0x42, 0x60, 0xd3, 0x08, 0x43, 0x59, 0x9c, 0xe8, 0x42, + 0x0f, 0x4a, 0x1c, 0x43, 0x68, 0x81, 0x25, 0x43, 0x72, 0x47, 0x2f, 0x43, 0x6d, 0x1b, 0x0a, 0x43, + 0xf5, 0x62, 0x09, 0x43, 0xb3, 0x11, 0x08, 0x43, 0x21, 0x7f, 0x02, 0x43, 0x86, 0xd0, 0x8b, 0x42, + 0x9c, 0xe1, 0x83, 0x42, 0x5c, 0x77, 0xc4, 0x42, 0xaa, 0xb4, 0xcd, 0x42, 0x12, 0xcf, 0xe0, 0x42, + 0x96, 0x16, 0xf9, 0x42, 0xbc, 0xe0, 0x07, 
0x43, 0x3d, 0xb8, 0x19, 0x43, 0x5c, 0x3f, 0x35, 0x43, + 0x05, 0xab, 0x22, 0x43, 0x37, 0x42, 0x06, 0x43, 0x82, 0x68, 0x04, 0x43, 0xdd, 0x20, 0x01, 0x43, + 0xaa, 0x28, 0xd8, 0x42, 0xd1, 0x67, 0x94, 0x42, 0x84, 0xe7, 0xa9, 0x42, 0xde, 0x15, 0xdd, 0x42, + 0x21, 0x0f, 0xd0, 0x42, 0x2e, 0x8f, 0xc6, 0x42, 0x37, 0x33, 0xe6, 0x42, 0x46, 0x04, 0xf6, 0x42, + 0xac, 0x0e, 0x33, 0x43, 0xe5, 0x7a, 0x3d, 0x43, 0x5f, 0x95, 0x1d, 0x43, 0xa5, 0xb1, 0xf0, 0x42, + 0xd7, 0xc1, 0x05, 0x43, 0xd0, 0xc9, 0xe8, 0x42, 0xce, 0x14, 0xea, 0x42, 0xea, 0xe0, 0x8c, 0x42, + 0xe4, 0x08, 0xb9, 0x42, 0xa8, 0xf4, 0x07, 0x43, 0xbb, 0x58, 0xc8, 0x42, 0x7b, 0x74, 0xf0, 0x42, + 0xd7, 0x37, 0x04, 0x43, 0x76, 0xd3, 0x0b, 0x43, 0x37, 0x43, 0x21, 0x43, 0x96, 0x7e, 0x06, 0x43, + 0x46, 0xf6, 0xf5, 0x42, 0x5c, 0xca, 0xe0, 0x42, 0xce, 0xf2, 0xfa, 0x42, 0xa4, 0x95, 0x07, 0x43, + 0x5a, 0x7d, 0xfb, 0x42, 0x46, 0x4d, 0xa6, 0x42, 0x73, 0xbd, 0xd3, 0x42, 0x52, 0x21, 0x01, 0x43, + 0xf7, 0x35, 0xcc, 0x42, 0x18, 0xa8, 0xe8, 0x42, 0x39, 0x93, 0x07, 0x43, 0x83, 0x4c, 0x16, 0x43, + 0x01, 0xf1, 0x12, 0x43, 0x88, 0x2c, 0x15, 0x43, 0x5e, 0x23, 0xf2, 0x42, 0xa8, 0x52, 0xbf, 0x42, + 0x6b, 0xc7, 0xbf, 0x42, 0x2e, 0x86, 0xfb, 0x42, 0xf9, 0x63, 0x08, 0x43, 0xfd, 0xbc, 0xb8, 0x42, + 0x82, 0x25, 0xc1, 0x42, 0xaf, 0xd3, 0x0b, 0x43, 0x15, 0x3a, 0xe9, 0x42, 0x60, 0x46, 0xeb, 0x42, + 0xcb, 0xe0, 0xec, 0x42, 0x12, 0x9a, 0x0e, 0x43, 0x2f, 0xb5, 0x0d, 0x43, 0x1b, 0x7d, 0x12, 0x43, + 0xde, 0x97, 0xe3, 0x42, 0x79, 0xf5, 0xc7, 0x42, 0x79, 0xb0, 0xe4, 0x42, 0xa2, 0xd2, 0xcf, 0x42, + 0xfa, 0x3c, 0xf3, 0x42, 0xef, 0x01, 0x9e, 0x42, 0x0e, 0x25, 0xb0, 0x42, 0xd9, 0xbe, 0x05, 0x43, + 0x00, 0x72, 0x0f, 0x43, 0xf8, 0x72, 0x29, 0x43, 0xfe, 0x3c, 0x0e, 0x43, 0xd3, 0x8a, 0x08, 0x43, + 0x17, 0xd0, 0x08, 0x43, 0xc7, 0xe0, 0x15, 0x43, 0x74, 0xb8, 0x0a, 0x43, 0x90, 0xf5, 0xda, 0x42, + 0xfb, 0xd2, 0xf1, 0x42, 0x1d, 0x9a, 0x10, 0x43, 0xef, 0x9c, 0x1e, 0x43, 0x42, 0x6e, 0xbd, 0x42, + 0xb9, 0xa0, 0x85, 0x42, 0xdf, 0x9c, 0x10, 0x43, 0xad, 0x00, 0x0d, 0x43, 0xcd, 
0x01, 0x12, 0x43, + 0xf0, 0x9e, 0xc2, 0x42, 0x34, 0x3f, 0x06, 0x43, 0x8f, 0x46, 0x0c, 0x43, 0xe7, 0x58, 0x07, 0x43, + 0x82, 0x24, 0x00, 0x43, 0xc0, 0xa3, 0x04, 0x43, 0xef, 0x84, 0x1a, 0x43, 0x94, 0xf3, 0x1e, 0x43, + 0x39, 0xc6, 0x16, 0x43, 0x0b, 0x1c, 0xe3, 0x42, 0x13, 0xc2, 0x9f, 0x42, 0x46, 0x36, 0xe7, 0x42, + 0xb2, 0xe7, 0xe3, 0x42, 0x49, 0xd1, 0xea, 0x42, 0x57, 0x47, 0xd8, 0x42, 0xde, 0xdc, 0xf3, 0x42, + 0xaa, 0x16, 0xf5, 0x42, 0x03, 0x47, 0x19, 0x43, 0xa9, 0xb3, 0x16, 0x43, 0x02, 0x3a, 0x1e, 0x43, + 0xa6, 0x2d, 0x1c, 0x43, 0x9b, 0xdf, 0x21, 0x43, 0x7e, 0xc3, 0x15, 0x43, 0x78, 0x93, 0xb7, 0x42, + 0xb0, 0xf2, 0x9b, 0x42, 0xad, 0xdd, 0xdc, 0x42, 0xe2, 0x68, 0xdd, 0x42, 0xc2, 0x61, 0xc7, 0x42, + 0x24, 0xb6, 0xc8, 0x42, 0x56, 0xf7, 0xc9, 0x42, 0x96, 0xc0, 0xd4, 0x42, 0x78, 0x58, 0x04, 0x43, + 0x33, 0x0e, 0x0f, 0x43, 0x81, 0x82, 0x21, 0x43, 0x1f, 0x59, 0x0c, 0x43, 0xf4, 0xdd, 0x01, 0x43, + 0x52, 0xe7, 0xee, 0x42, 0x04, 0xc8, 0x86, 0x42, 0xa1, 0x7e, 0x54, 0x42, 0x68, 0x63, 0x6f, 0x42, + 0x3c, 0xf8, 0x63, 0x42, 0xf8, 0xd5, 0x7b, 0x42, 0xf2, 0x8e, 0x84, 0x42, 0x4a, 0x7b, 0x96, 0x42, + 0x5d, 0x49, 0xac, 0x42, 0xb6, 0x7c, 0xc0, 0x42, 0xa9, 0x8f, 0xbe, 0x42, 0xae, 0x9e, 0xcf, 0x42, + 0x44, 0x57, 0xb2, 0x42, 0x39, 0xef, 0xaf, 0x42, 0xec, 0xa4, 0x4a, 0x42, 0x96, 0x71, 0x46, 0x42, + 0x38, 0xf8, 0x70, 0x42, 0xb1, 0x2c, 0x86, 0x42, 0x9a, 0xde, 0xa0, 0x42, 0x19, 0x05, 0xae, 0x42, + 0x70, 0x85, 0xc3, 0x42, 0x1a, 0xa9, 0xc7, 0x42, 0x8e, 0x52, 0xda, 0x42, 0x6d, 0x50, 0xda, 0x42, + 0x49, 0x6d, 0xd4, 0x42, 0xc0, 0x4f, 0xaa, 0x42, 0x99, 0x3e, 0xcd, 0x42, 0x23, 0x8b, 0xd6, 0x42, + 0x12, 0x8e, 0xbf, 0x42, 0x7c, 0x70, 0x6b, 0x42, 0x9f, 0xe3, 0xc5, 0x42, 0xdf, 0xdb, 0xf8, 0x42, + 0xcf, 0xce, 0xe3, 0x42, 0x1b, 0x12, 0xf3, 0x42, 0xad, 0xd0, 0x14, 0x43, 0x37, 0xea, 0x0c, 0x43, + 0x23, 0x92, 0x2a, 0x43, 0x5e, 0x19, 0x1d, 0x43, 0xdd, 0x1b, 0x2a, 0x43, 0xf6, 0x06, 0x0b, 0x43, + 0xa7, 0xfc, 0x26, 0x43, 0x55, 0xf6, 0x11, 0x43, 0x63, 0x49, 0x36, 0x43, 0xf6, 0xca, 0xc8, 0x42, + 0xeb, 0x08, 0xc8, 
0x42, 0x1e, 0x9f, 0x03, 0x43, 0xf0, 0xbf, 0xd9, 0x42, 0x88, 0x0c, 0x0d, 0x43, + 0xac, 0x0d, 0x1f, 0x43, 0x6f, 0xa2, 0x1f, 0x43, 0xdb, 0xa2, 0x47, 0x43, 0x6f, 0x62, 0x37, 0x43, + 0x2c, 0x63, 0x2b, 0x43, 0x59, 0x79, 0x0b, 0x43, 0x17, 0xa5, 0x22, 0x43, 0x20, 0xc9, 0x24, 0x43, + 0xc5, 0x1b, 0x20, 0x43, 0x12, 0x48, 0xdd, 0x42, 0x24, 0x5d, 0xd0, 0x42, 0xec, 0x10, 0x04, 0x43, + 0xdb, 0xa9, 0xda, 0x42, 0x92, 0xd8, 0x06, 0x43, 0xc3, 0x22, 0x19, 0x43, 0xa7, 0xe5, 0x11, 0x43, + 0xdc, 0xd1, 0x2f, 0x43, 0x17, 0x6f, 0x51, 0x43, 0xe9, 0xa6, 0x4e, 0x43, 0x80, 0x3b, 0x1d, 0x43, + 0x13, 0xa0, 0x1f, 0x43, 0xf3, 0xb5, 0x1c, 0x43, 0xb6, 0x5a, 0x0f, 0x43, 0xbd, 0xbc, 0xb8, 0x42, + 0x3d, 0x79, 0xc9, 0x42, 0x56, 0xfd, 0x07, 0x43, 0x24, 0x9e, 0x02, 0x43, 0x64, 0xed, 0x12, 0x43, + 0xfa, 0xb7, 0x1d, 0x43, 0x2c, 0x40, 0x1a, 0x43, 0xa5, 0x37, 0x42, 0x43, 0x1e, 0xed, 0x3f, 0x43, + 0x3b, 0x4a, 0x45, 0x43, 0x4d, 0x09, 0x1f, 0x43, 0x73, 0x3d, 0x1c, 0x43, 0x8c, 0xaa, 0x14, 0x43, + 0x29, 0xe6, 0xf6, 0x42, 0x57, 0x51, 0xc9, 0x42, 0x4b, 0x59, 0xcd, 0x42, 0x41, 0x39, 0x1f, 0x43, + 0x75, 0x0b, 0x0b, 0x43, 0xd5, 0x1c, 0x17, 0x43, 0xad, 0x94, 0x11, 0x43, 0xb8, 0x07, 0x24, 0x43, + 0xe5, 0xe9, 0x49, 0x43, 0x3b, 0xdf, 0x5e, 0x43, 0x7b, 0x7f, 0x42, 0x43, 0xd8, 0x40, 0x1b, 0x43, + 0xea, 0x7a, 0x1d, 0x43, 0x93, 0xf5, 0x0a, 0x43, 0x41, 0x91, 0x15, 0x43, 0x35, 0xe8, 0xb2, 0x42, + 0x4f, 0x39, 0xe8, 0x42, 0xff, 0xcb, 0x1c, 0x43, 0xc9, 0x3d, 0x01, 0x43, 0xb1, 0x85, 0x10, 0x43, + 0xde, 0x62, 0x26, 0x43, 0xe1, 0x97, 0x23, 0x43, 0x51, 0x37, 0x3a, 0x43, 0xf7, 0xac, 0x31, 0x43, + 0x68, 0x02, 0x11, 0x43, 0xf1, 0xcf, 0xec, 0x42, 0x9a, 0xc5, 0x00, 0x43, 0xc5, 0x20, 0x06, 0x43, + 0x9b, 0x91, 0x21, 0x43, 0x3f, 0xbc, 0xd4, 0x42, 0x7d, 0x29, 0xe0, 0x42, 0xf9, 0x72, 0x22, 0x43, + 0x15, 0xe9, 0xfd, 0x42, 0x8c, 0x7f, 0x11, 0x43, 0x76, 0x23, 0x23, 0x43, 0xdd, 0x70, 0x29, 0x43, + 0x4f, 0x92, 0x2c, 0x43, 0x8f, 0x2e, 0x2a, 0x43, 0x27, 0xcf, 0x1b, 0x43, 0xa3, 0x60, 0xfe, 0x42, + 0x3e, 0xee, 0xe1, 0x42, 0xd9, 0x41, 0x08, 0x43, 0x2f, 
0xb5, 0x1b, 0x43, 0xaa, 0x6e, 0xee, 0x42, + 0x10, 0x4b, 0xc5, 0x42, 0x93, 0x46, 0x22, 0x43, 0xb8, 0xa2, 0x14, 0x43, 0x14, 0xe8, 0x22, 0x43, + 0x83, 0x2e, 0x19, 0x43, 0x41, 0x0d, 0x2a, 0x43, 0x3d, 0x94, 0x28, 0x43, 0x7f, 0x7a, 0x26, 0x43, + 0xcd, 0x1c, 0x07, 0x43, 0xdf, 0x39, 0x05, 0x43, 0x57, 0xda, 0x04, 0x43, 0xa3, 0x98, 0x0a, 0x43, + 0xdb, 0x40, 0x1a, 0x43, 0xdd, 0x43, 0xd7, 0x42, 0x9a, 0xd0, 0xce, 0x42, 0x2d, 0x1f, 0x23, 0x43, + 0x0a, 0x7e, 0x23, 0x43, 0x86, 0x54, 0x37, 0x43, 0x0b, 0x35, 0x2b, 0x43, 0x68, 0xf0, 0x2b, 0x43, + 0x6b, 0xdf, 0x1e, 0x43, 0x27, 0x4e, 0x1f, 0x43, 0x06, 0x74, 0x19, 0x43, 0x74, 0x45, 0x0e, 0x43, + 0x5d, 0x68, 0x13, 0x43, 0x8d, 0xf2, 0x16, 0x43, 0x41, 0x7d, 0x3c, 0x43, 0x8f, 0xa1, 0x0a, 0x43, + 0xab, 0xd3, 0xc5, 0x42, 0x6c, 0x88, 0x23, 0x43, 0xed, 0xed, 0x2a, 0x43, 0x94, 0x0c, 0x18, 0x43, + 0x24, 0x68, 0x08, 0x43, 0xd7, 0x70, 0x1b, 0x43, 0xed, 0x30, 0x20, 0x43, 0x30, 0x0f, 0x34, 0x43, + 0xf8, 0x3a, 0x14, 0x43, 0x77, 0x0f, 0x14, 0x43, 0x9a, 0xf1, 0x30, 0x43, 0x1d, 0xd3, 0x33, 0x43, + 0x45, 0x35, 0x3b, 0x43, 0x4f, 0xe5, 0xe6, 0x42, 0x72, 0x58, 0xc6, 0x42, 0x21, 0xff, 0x13, 0x43, + 0xd0, 0xe1, 0x04, 0x43, 0x32, 0x02, 0x0e, 0x43, 0x65, 0x72, 0xf6, 0x42, 0x09, 0xe2, 0x0e, 0x43, + 0xf1, 0xe4, 0x14, 0x43, 0xc5, 0x4b, 0x33, 0x43, 0x99, 0xde, 0x29, 0x43, 0xf7, 0x6c, 0x37, 0x43, + 0x9f, 0xde, 0x31, 0x43, 0xbc, 0xf7, 0x40, 0x43, 0x5e, 0x4a, 0x29, 0x43, 0x6b, 0x14, 0xe5, 0x42, + 0xb3, 0x32, 0xb9, 0x42, 0x50, 0xd7, 0x03, 0x43, 0x95, 0xca, 0xf0, 0x42, 0xbe, 0xf0, 0x00, 0x43, + 0xf3, 0x62, 0xfe, 0x42, 0x82, 0xdd, 0x00, 0x43, 0xf3, 0x07, 0x08, 0x43, 0xa3, 0x5e, 0x28, 0x43, + 0xc3, 0xfd, 0x32, 0x43, 0x20, 0xff, 0x39, 0x43, 0xc0, 0xc6, 0x28, 0x43, 0xec, 0x59, 0x1c, 0x43, + 0xde, 0xfa, 0x12, 0x43, 0x0e, 0x75, 0xbe, 0x42, 0x1a, 0xe3, 0x64, 0x42, 0x3d, 0x9c, 0x9d, 0x42, + 0xc9, 0xd9, 0x98, 0x42, 0x3b, 0x1a, 0xa0, 0x42, 0xd6, 0x79, 0xaf, 0x42, 0xd0, 0xfa, 0xa1, 0x42, + 0xb9, 0x9c, 0xc7, 0x42, 0xf9, 0xea, 0xe3, 0x42, 0x96, 0xd9, 0xf2, 0x42, 0x13, 0x88, 0x07, 
0x43, + 0xc5, 0x59, 0xc8, 0x42, 0x70, 0xd9, 0xc1, 0x42, 0xaf, 0xd3, 0x98, 0x42, 0xe0, 0xae, 0x85, 0x42}; + +unsigned char conv2d_winograd_fp16_in[] = { + 0x3a, 0xb9, 0xc0, 0x30, 0x28, 0xbc, 0x72, 0xc1, 0x3c, 0xbe, 0xee, 0xc0, 0x1b, 0x3d, 0xf5, 0xbf, + 0x77, 0xbd, 0x05, 0xbd, 0x12, 0x2b, 0x5f, 0xb8, 0x73, 0xa2, 0xac, 0xbc, 0x19, 0xbf, 0x62, 0xc2, + 0xc5, 0xb7, 0x84, 0x3a, 0x70, 0xb4, 0xe9, 0xbd, 0xcf, 0xb9, 0x9b, 0xbe, 0xad, 0xb8, 0x4c, 0x39, + 0xaa, 0xc1, 0x50, 0xad, 0x4c, 0xbf, 0x8b, 0xb9, 0x9e, 0xbe, 0xbe, 0xb8, 0x05, 0xbf, 0x1c, 0xbc, + 0x7c, 0xbb, 0xce, 0xb3, 0x8a, 0x2c, 0xe7, 0xc1, 0xca, 0xb4, 0xde, 0x38, 0xe0, 0xbc, 0x46, 0xb9, + 0x37, 0xbf, 0xe0, 0x36, 0xef, 0xbd, 0xe9, 0xc0, 0x97, 0xc0, 0x5e, 0xbd, 0x5b, 0xbb, 0xf9, 0x2a, + 0x23, 0xb8, 0x6c, 0xbe, 0x09, 0xba, 0xd4, 0xbc, 0x39, 0xc0, 0x9d, 0xbd, 0xf8, 0xba, 0x7c, 0xb2, + 0x05, 0xc0, 0x14, 0xb5, 0xd0, 0x2e, 0x67, 0xb5, 0x20, 0xb9, 0x91, 0xb9, 0x3e, 0xa6, 0x78, 0xc0, + 0xcc, 0xbc, 0x10, 0xc1, 0x2f, 0xbd, 0x4a, 0xc1, 0x38, 0xbe, 0x2f, 0xb3, 0x01, 0xbc, 0x8d, 0x3b, + 0xcb, 0xc0, 0xa2, 0xbc, 0xb4, 0x22, 0x7c, 0xbe, 0x82, 0xbf, 0xa7, 0xbb, 0xf6, 0xbd, 0xd8, 0xbf, + 0x30, 0xb2, 0xb4, 0xb8, 0xe2, 0xbb, 0x5a, 0xbc, 0x93, 0xab, 0xb1, 0x3a, 0x08, 0xb8, 0x92, 0xbd, + 0xa7, 0xbc, 0x1a, 0xb8, 0x6f, 0xbe, 0xc8, 0xc1, 0xac, 0xbd, 0x32, 0xc0, 0x42, 0xbb, 0x60, 0x3c, + 0x3f, 0x34, 0x04, 0xbe, 0xed, 0xbe, 0x3e, 0x33, 0xbb, 0xbc, 0x4e, 0xbf, 0x48, 0xba, 0xaf, 0xbd, + 0x89, 0xb9, 0x06, 0x2b, 0x49, 0x38, 0x2d, 0xb9, 0x4f, 0xc0, 0xc7, 0xbd, 0xeb, 0x30, 0x47, 0x34, + 0x03, 0xbe, 0x47, 0xbe, 0x6d, 0xbf, 0x9a, 0xbe, 0x33, 0xbe, 0x89, 0xbf, 0x3b, 0x3a, 0xbc, 0x37, + 0xfb, 0xbd, 0xe4, 0xb9, 0x80, 0xb9, 0xd4, 0xbc, 0xe4, 0xc1, 0x63, 0xbb, 0xe6, 0x39, 0x0c, 0xc1, + 0x16, 0xbd, 0xdc, 0xaa, 0x06, 0xb5, 0x3b, 0xc0, 0xd4, 0xc4, 0x85, 0x28, 0x5c, 0xbf, 0x36, 0xbb, + 0x10, 0xbc, 0x3b, 0xbc, 0x28, 0x35, 0xe0, 0xb6, 0x99, 0xc0, 0x6f, 0xbe, 0xae, 0xbc, 0xe2, 0xac, + 0x21, 0xc0, 0x52, 0xc0, 0x7e, 0xb6, 0x0f, 0xc0, 0x9c, 0xb7, 0x44, 0xba, 0xb0, 
0xb9, 0xd9, 0xc0, + 0xb9, 0xc0, 0x9f, 0xb9, 0x99, 0xaf, 0x71, 0xbd, 0x32, 0xc0, 0x53, 0x3b, 0x19, 0xc0, 0x78, 0x3a, + 0x6f, 0xb9, 0x43, 0xb9, 0x67, 0xbb, 0x20, 0xba, 0xf3, 0xb8, 0x1a, 0xb0, 0x45, 0xc2, 0x38, 0xaf, + 0x03, 0xbe, 0xbf, 0xb9, 0xae, 0xba, 0xc9, 0xb2, 0xb3, 0xbc, 0x1f, 0xbc, 0x35, 0xbc, 0x39, 0xc0, + 0x2a, 0xbe, 0x2f, 0xbd, 0x8c, 0xc0, 0xd4, 0xc1, 0x4e, 0x38, 0x13, 0xc1, 0x4c, 0xba, 0x31, 0xb9, + 0xa7, 0xbe, 0x7e, 0xc0, 0x1e, 0xb8, 0x86, 0xb4, 0xce, 0xbc, 0x51, 0xb7, 0x9d, 0xb0, 0xd7, 0xc1, + 0x89, 0xb4, 0xc4, 0x39, 0x55, 0xbc, 0x44, 0x33, 0x84, 0x3a, 0x29, 0xb9, 0x61, 0xb5, 0x8e, 0xbd, + 0xe2, 0xb2, 0x54, 0xa1, 0x46, 0xb5, 0xb5, 0x34, 0x4b, 0xc0, 0x84, 0xb8, 0x0d, 0x38, 0x31, 0xc4, + 0xe1, 0xbe, 0x40, 0x34, 0x47, 0xc0, 0xf4, 0xba, 0x4a, 0x39, 0x92, 0x2d, 0x62, 0x38, 0x44, 0xbd, + 0x72, 0xbc, 0xf1, 0xbc, 0x01, 0xbf, 0xed, 0xbb, 0xbd, 0x40, 0xa6, 0xc1, 0x2c, 0x40, 0xec, 0x2f, + 0x5f, 0xc1, 0x96, 0xbc, 0xfc, 0xba, 0xef, 0xbc, 0x3f, 0xbd, 0x0f, 0xbc, 0x9d, 0xba, 0x2b, 0xc2, + 0xda, 0xbd, 0x9c, 0xc2, 0x39, 0xb1, 0xd3, 0xbf, 0x59, 0xc1, 0xac, 0xc0, 0x01, 0xb4, 0x32, 0xb8, + 0xac, 0xb4, 0xfa, 0xbb, 0x44, 0xbd, 0xa8, 0xb5, 0x8a, 0xbd, 0x10, 0xbb, 0x34, 0xb8, 0x0c, 0x3d, + 0xfd, 0xac, 0x69, 0xbc, 0xd8, 0xc0, 0x60, 0xbc, 0x1c, 0x33, 0x16, 0xb7, 0x58, 0xc0, 0xad, 0xb8, + 0x35, 0xc3, 0xba, 0xbe, 0xec, 0xb5, 0x95, 0xc2, 0xeb, 0xbd, 0x72, 0xb5, 0x97, 0x38, 0x24, 0x30, + 0xc8, 0xba, 0xab, 0x3a, 0x4c, 0xbf, 0xef, 0xba, 0xe9, 0xb6, 0xa2, 0xb8, 0x64, 0xbe, 0x0e, 0xc0, + 0xfb, 0xbd, 0x06, 0x32, 0xd2, 0xbe, 0x65, 0xb8, 0xd4, 0x3a, 0xa4, 0xbb, 0x0d, 0x39, 0x7a, 0xbc, + 0x9d, 0x2a, 0x92, 0xb3, 0x02, 0xc0, 0x54, 0xbe, 0x12, 0x2e, 0x84, 0xc0, 0x44, 0xc3, 0x8a, 0xbc, + 0xfb, 0xbc, 0x8b, 0xba, 0x91, 0xbc, 0x74, 0xba, 0x25, 0xab, 0xb3, 0xba, 0xd0, 0xbc, 0x8e, 0x3a, + 0xb9, 0xb8, 0x6f, 0x22, 0x92, 0xbc, 0xdc, 0xc1, 0x58, 0xc1, 0xea, 0xba, 0xbf, 0xa4, 0xaf, 0x40, + 0x10, 0xbb, 0x93, 0xbf, 0x33, 0xb5, 0x8b, 0xbe, 0xbe, 0xc1, 0x3b, 0xb9, 0x1e, 0xbe, 0xb0, 0x37, + 0x7e, 0xc1, 0x5c, 
0xb9, 0x26, 0xc0, 0x0c, 0xbd, 0x18, 0xbe, 0x37, 0x3c, 0xdb, 0x2d, 0xea, 0xb4, + 0x18, 0xbc, 0x09, 0xba, 0xee, 0xb2, 0xc0, 0xc0, 0xae, 0xbd, 0x73, 0xbc, 0x12, 0xc0, 0x69, 0x3b, + 0x14, 0xbc, 0x46, 0xc0, 0x8d, 0x38, 0xd8, 0xbb, 0x31, 0xbb, 0x88, 0xbc, 0x2e, 0x39, 0x22, 0xc0, + 0x67, 0xba, 0x14, 0x32, 0x24, 0xb7, 0x20, 0xc1, 0x72, 0xc0, 0xc8, 0x33, 0x0e, 0xbe, 0xab, 0x3a, + 0x95, 0xbd, 0x93, 0xb4, 0xf1, 0xb8, 0x72, 0xc0, 0x13, 0xc0, 0x2e, 0xc0, 0x2c, 0xbd, 0x4b, 0xc1, + 0x0a, 0x31, 0x34, 0xb3, 0x13, 0xb5, 0x4c, 0xb9, 0x45, 0xbe, 0x5d, 0xba, 0x4d, 0xbe, 0x15, 0x36, + 0xcb, 0xbe, 0x55, 0xc0, 0x53, 0xbd, 0x48, 0xb4, 0x39, 0xbc, 0xbd, 0xbc, 0x9a, 0x2d, 0x2c, 0xbc, + 0x84, 0x3b, 0xb4, 0xba, 0x32, 0xb2, 0x9b, 0xba, 0xba, 0xbc, 0x9f, 0xbc, 0xca, 0xb6, 0x32, 0xbe, + 0x36, 0x37, 0x3f, 0xbe, 0xe9, 0xbb, 0x51, 0xbc, 0x96, 0xb8, 0xb0, 0xbc, 0x4c, 0xbf, 0xad, 0xbc, + 0x03, 0xb6, 0x9d, 0xbe, 0xcc, 0xbf, 0x62, 0x29, 0x59, 0xbe, 0xaa, 0xb6, 0xcb, 0xbf, 0x1c, 0xb8, + 0x59, 0x3c, 0x8e, 0xb4, 0x2d, 0xb6, 0xb7, 0xac, 0x0b, 0xba, 0x91, 0xbe, 0x3a, 0xb5, 0xd7, 0xbe, + 0xea, 0xbe, 0x92, 0xb5, 0x40, 0xaf, 0x90, 0xb9, 0xa2, 0xbe, 0xab, 0x35, 0x22, 0xbc, 0xa0, 0xb8, + 0x10, 0x2e, 0xce, 0xbb, 0xd6, 0xbe, 0x2e, 0x32, 0x64, 0x32, 0x52, 0xb4, 0xe2, 0xc0, 0x95, 0xbd, + 0xb5, 0xc0, 0x33, 0xbe, 0x52, 0xb4, 0x5b, 0xbd, 0x77, 0x38, 0xe1, 0xbf, 0x2f, 0xbd, 0x94, 0xb9, + 0xd0, 0xb8, 0x47, 0xbc, 0xc2, 0xb5, 0xa0, 0x39, 0x0b, 0x42, 0xb1, 0xbc, 0x35, 0xbb, 0xd7, 0xb3, + 0xc1, 0xbe, 0xe7, 0xc0, 0x27, 0xb7, 0x7c, 0xb6, 0x57, 0x35, 0x93, 0xbd, 0x23, 0xb6, 0x5f, 0xbe, + 0xa7, 0xbc, 0x49, 0xb9, 0x5b, 0xb8, 0x36, 0xb6, 0xb8, 0xba, 0xc3, 0x33, 0x24, 0xb3, 0xef, 0xb8, + 0xba, 0xc0, 0x57, 0x39, 0x9c, 0xb6, 0xcf, 0xbe, 0x4c, 0xba, 0x4e, 0x34, 0x55, 0xbc, 0xaa, 0xb9, + 0xd8, 0xbe, 0xfc, 0x3a, 0xb9, 0xc1, 0x7b, 0x30, 0xb2, 0xbc, 0x0e, 0xa9, 0xb0, 0xb7, 0x31, 0xbc, + 0x13, 0xb1, 0x15, 0x3a, 0xbf, 0x32, 0x2f, 0x39, 0xb9, 0xc2, 0xb9, 0xbf, 0x04, 0xba, 0xf7, 0xbd, + 0x61, 0x37, 0x99, 0xbe, 0x8d, 0xb8, 0x5c, 0xb5, 0xc3, 
0xc2, 0xb8, 0x32, 0xc5, 0xb4, 0xb1, 0xb6, + 0xe2, 0x2e, 0xb9, 0xbb, 0x95, 0x39, 0xc9, 0xbf, 0x58, 0xb4, 0xa3, 0xb9, 0xeb, 0xb5, 0x09, 0xc0, + 0x9f, 0xc1, 0x10, 0xba, 0x28, 0xbf, 0x09, 0xc0, 0x64, 0xb9, 0xd7, 0x3d, 0xad, 0xbc, 0xf6, 0xb8, + 0xa5, 0xba, 0x16, 0xbe, 0xec, 0x3c, 0xf8, 0xbb, 0x42, 0xbe, 0x90, 0xb8, 0x89, 0xb8, 0x91, 0xb8, + 0xa5, 0xbd, 0x63, 0xbb, 0xe8, 0xb3, 0x22, 0xb8, 0x8c, 0xba, 0x17, 0xbd, 0xc4, 0xba, 0x84, 0xbc, + 0x2f, 0xbf, 0xb2, 0xbc, 0x2c, 0xb6, 0xfe, 0xbc, 0x0b, 0xb9, 0xb7, 0xb3, 0x8f, 0xbe, 0xe9, 0xbd, + 0xe7, 0xbe, 0x78, 0xb8, 0x3c, 0x3d, 0xf8, 0xba, 0x7c, 0xb0, 0x3d, 0xbd, 0x62, 0xc0, 0xdf, 0xbc, + 0xc7, 0xb8, 0x5c, 0xc1, 0x3b, 0xbe, 0x9d, 0xb8, 0x63, 0xba, 0x26, 0xbb, 0x3c, 0xbf, 0x24, 0xbf, + 0x83, 0xbd, 0xb3, 0xc0, 0x89, 0x34, 0xf5, 0xb0, 0xf1, 0x32, 0xa0, 0xbb, 0xaf, 0xbf, 0x31, 0xbe, + 0xe3, 0x2f, 0x56, 0x36, 0x3d, 0xb4, 0x7a, 0x9b, 0x77, 0xbd, 0x9f, 0x31, 0xf1, 0xb8, 0xb3, 0x34, + 0xc4, 0xbe, 0xbd, 0x2d, 0xfc, 0xbb, 0xbb, 0xba, 0xc5, 0xbc, 0xa4, 0xb5, 0xd7, 0xb9, 0x1b, 0xbc, + 0x8b, 0xbd, 0x0e, 0xb8, 0x18, 0xbe, 0x6b, 0xb6, 0xee, 0x2d, 0xd2, 0xb1, 0xbf, 0xba, 0x36, 0xbf, + 0xc3, 0xba, 0xa7, 0x3b, 0x9f, 0xbd, 0x91, 0xbf, 0x3e, 0x2f, 0x55, 0xb9, 0x24, 0xbe, 0xb4, 0xbe, + 0x2d, 0x32, 0x42, 0xbe, 0x7a, 0x3d, 0x5b, 0xbf, 0x97, 0xc0, 0x69, 0xbc, 0xf9, 0xb2, 0xd5, 0xbf, + 0xe8, 0x39, 0xb4, 0xb3, 0xbb, 0xbe, 0xc9, 0xb7, 0x62, 0xbc, 0xd2, 0xbc, 0x1c, 0x38, 0xac, 0x3b, + 0xd2, 0x34, 0x58, 0xaf, 0x8c, 0xbc, 0xda, 0xbf, 0xb6, 0xb1, 0x21, 0xbf, 0x77, 0xb9, 0x70, 0xbe, + 0xbe, 0x38, 0xc3, 0x35, 0xe2, 0xbc, 0xa4, 0xb8, 0x7c, 0xb9, 0xad, 0xbc, 0x50, 0xc0, 0xcd, 0xba, + 0x3c, 0x35, 0x4e, 0xbf, 0x3f, 0xc0, 0xd2, 0xbe, 0xaa, 0xbc, 0x2e, 0xb9, 0x57, 0xb9, 0x04, 0xb3, + 0x47, 0xc0, 0x46, 0x30, 0xa6, 0x3e, 0x52, 0x39, 0x13, 0x3e, 0x4f, 0x36, 0x99, 0xbd, 0xf9, 0xbc, + 0x61, 0x38, 0x8a, 0xbc, 0xf6, 0xbb, 0x07, 0xaa, 0x27, 0xb3, 0x26, 0xbe, 0xfa, 0xbd, 0x8a, 0xbb, + 0xb1, 0xb0, 0x44, 0xc3, 0x71, 0xb6, 0x34, 0xc0, 0xfe, 0xbd, 0x23, 0xc0, 0xde, 0x2e, 0x68, 
0xc0, + 0x74, 0xbd, 0xeb, 0xb2, 0x9e, 0xbb, 0xd7, 0xb3, 0x44, 0xbe, 0x8b, 0xc1, 0x35, 0xba, 0xfd, 0x30, + 0xc0, 0xbd, 0x7f, 0xc0, 0xb7, 0xc1, 0xb7, 0xbe, 0x25, 0xb9, 0xd0, 0xc0, 0xcb, 0xbd, 0x41, 0xc0, + 0x2e, 0x3b, 0x01, 0xbe, 0x72, 0xbc, 0xf4, 0x2f, 0x56, 0xb2, 0xc9, 0xbe, 0xfa, 0x3d, 0xc6, 0xba, + 0x33, 0xc0, 0xdf, 0xaa, 0xf8, 0xb9, 0xe0, 0xc0, 0x7e, 0xbc, 0x5a, 0x3a, 0xbd, 0xc0, 0x06, 0xbe, + 0xe0, 0xbe, 0x6b, 0xbb, 0x2a, 0xc0, 0xee, 0xbe, 0x88, 0xb2, 0x7c, 0xb2, 0xb7, 0xbe, 0xea, 0xc0, + 0x2d, 0xb3, 0x97, 0xb9, 0xf1, 0xb9, 0x5c, 0x28, 0xc7, 0xbc, 0x4d, 0xbd, 0x63, 0xb5, 0x51, 0xb1, + 0x6b, 0xbf, 0xf9, 0xbf, 0x36, 0xbb, 0xad, 0xab, 0x8d, 0xbd, 0xe5, 0xbc, 0x9e, 0xbd, 0x14, 0xc0, + 0x05, 0xba, 0xbe, 0xbf, 0xfe, 0xad, 0xfd, 0xbe, 0x3e, 0x2f, 0x03, 0x37, 0x78, 0x38, 0xc6, 0xb9, + 0xd3, 0x35, 0x6f, 0xbe, 0x55, 0xbb, 0x61, 0xbe, 0xa8, 0xb3, 0xdf, 0xbf, 0x63, 0xbd, 0x28, 0xbb, + 0xda, 0xbe, 0xf2, 0xbc, 0x15, 0xa1, 0xfd, 0xb8, 0x0d, 0xbe, 0x0e, 0x2e, 0x91, 0x38, 0x75, 0xbc, + 0x64, 0xb2, 0x32, 0xbe, 0x10, 0xc4, 0x6b, 0xbe, 0xa9, 0x39, 0x18, 0xbe, 0x26, 0xaf, 0xc5, 0xb4, + 0x58, 0xc2, 0xe6, 0x3c, 0xaa, 0xbe, 0x15, 0xbe, 0xab, 0xbe, 0xda, 0xbe, 0x95, 0xbc, 0x38, 0xc0, + 0x27, 0xc0, 0x6d, 0xbc, 0x27, 0xbb, 0x59, 0xba, 0x7c, 0xb9, 0xd1, 0xba, 0x8a, 0xbf, 0xa5, 0x40, + 0x07, 0x3c, 0x53, 0xbf, 0x9f, 0xc2, 0x6a, 0x39, 0x6e, 0xc0, 0x81, 0xbf, 0x73, 0xbd, 0x37, 0xbf, + 0x50, 0x24, 0xfc, 0xbe, 0x1f, 0xc1, 0x07, 0x32, 0x42, 0xb0, 0xa8, 0x39, 0x73, 0x39, 0x07, 0xb9, + 0xce, 0xc0, 0xb4, 0xbc, 0xfd, 0xbd, 0xa6, 0x30, 0xb7, 0xbf, 0xf7, 0xbb, 0x64, 0xc1, 0x6f, 0x39, + 0xf2, 0xbe, 0x9a, 0x3a, 0xc5, 0xbe, 0x8d, 0xb4, 0xd3, 0x35, 0x67, 0xbf, 0x40, 0xb9, 0xcf, 0xbc, + 0x7c, 0xbd, 0x2b, 0x32, 0x4c, 0xbe, 0xaa, 0xbe, 0xea, 0xc0, 0x9c, 0xb2, 0xa6, 0x34, 0x1b, 0x9b, + 0xde, 0xbc, 0x30, 0xbc, 0x52, 0xbc, 0x7b, 0xbc, 0x11, 0xc0, 0x03, 0xbb, 0x65, 0xbb, 0x8e, 0x3a, + 0x85, 0xba, 0x3f, 0x41, 0x84, 0xbd, 0xe0, 0xbf, 0x73, 0x35, 0xce, 0xb9, 0xac, 0x33, 0xcb, 0x3a, + 0x28, 0xb5, 0xd9, 0xbb, 0x7e, 
0xbc, 0xe9, 0xbf, 0x33, 0xbc, 0x3c, 0xbf, 0x04, 0x36, 0xd4, 0xa0, + 0x76, 0xbe, 0x3c, 0x2d, 0x1e, 0xc0, 0x28, 0xbe, 0xcb, 0xc0, 0x41, 0x36, 0xcd, 0xba, 0x0d, 0xc0, + 0x6e, 0xc0, 0x58, 0xb8, 0x2b, 0xc0, 0x4d, 0xc4, 0x98, 0xbd, 0xa6, 0xbd, 0x16, 0x38, 0x6d, 0xb8, + 0x07, 0xbd, 0xd5, 0x3d, 0x2f, 0xbd, 0x0a, 0xba, 0x23, 0xba, 0x11, 0xb5, 0xf9, 0xbd, 0x67, 0xb6, + 0x60, 0xbc, 0x0e, 0xc0, 0xa9, 0xbc, 0x13, 0xba, 0xd1, 0xb4, 0xc4, 0xbe, 0xd1, 0xb1, 0x0e, 0xc0, + 0xa5, 0x2d, 0xd6, 0xb4, 0x68, 0xbb, 0xa3, 0xb9, 0x3d, 0xbd, 0x31, 0xbc, 0x11, 0xb4, 0xba, 0xb7, + 0xf2, 0x37, 0x91, 0xb6, 0x20, 0xbf, 0x0b, 0xc0, 0xd4, 0xbb, 0x0e, 0xb8, 0xad, 0xc1, 0x59, 0xbd, + 0xf9, 0xb7, 0x45, 0xc0, 0xe2, 0xba, 0x8f, 0xbf, 0xd1, 0x3a, 0xe2, 0xb9, 0x5b, 0xbc, 0x4d, 0xbe, + 0x75, 0xbd, 0x2e, 0xbc, 0xa2, 0x30, 0x4f, 0x28, 0xe3, 0xbf, 0x06, 0xb9, 0xd6, 0xbf, 0x18, 0xb8, + 0x2e, 0xc0, 0xc2, 0x38, 0x42, 0xb7, 0x08, 0xc1, 0xb3, 0xb8, 0xa7, 0xba, 0xc4, 0xb8, 0x31, 0xa6, + 0xbe, 0xc1, 0x79, 0xb4, 0x52, 0xb0, 0x43, 0xbb, 0x76, 0xba, 0x08, 0xba, 0x05, 0xc1, 0xfb, 0xc2, + 0x25, 0xc0, 0x9b, 0x3b, 0x49, 0x34, 0xda, 0x2d, 0xfd, 0xb9, 0xa8, 0x32, 0x05, 0x34, 0x59, 0xb8, + 0x5b, 0x33, 0x8f, 0xba, 0xd4, 0xb4, 0x60, 0xbd, 0x28, 0xc2, 0x31, 0xbb, 0xdf, 0xc0, 0x1c, 0xbf, + 0x23, 0xb6, 0x3a, 0xbd, 0x76, 0xb9, 0x43, 0xb9, 0xe8, 0xb7, 0x84, 0xbf, 0x8f, 0x34, 0xbf, 0xbb, + 0x4c, 0xc0, 0xfb, 0x3c, 0x6e, 0xbf, 0x82, 0xbd, 0xe1, 0xbd, 0x6d, 0xc1, 0x08, 0xbe, 0x01, 0xbc, + 0x28, 0xbc, 0xf4, 0xba, 0x77, 0xba, 0xa0, 0xc1, 0x64, 0xb8, 0xcc, 0xbc, 0x74, 0xc2, 0xed, 0xaf, + 0x26, 0xc0, 0x21, 0xbe, 0x07, 0xbd, 0x7b, 0xc1, 0xba, 0xba, 0x38, 0x39, 0xf7, 0xbc, 0xc1, 0xb4, + 0xc6, 0xc0, 0x92, 0xc0, 0x30, 0xbb, 0xdf, 0xbe, 0xcb, 0xb8, 0x91, 0xbd, 0x52, 0x3b, 0xa9, 0xb9, + 0x43, 0xba, 0xbd, 0xb8, 0xc3, 0xbd, 0x47, 0xbb, 0x93, 0xaa, 0xc8, 0xc1, 0xf6, 0x38, 0x62, 0xbb, + 0xba, 0xb6, 0xb8, 0xb1, 0xe8, 0xb8, 0xb4, 0xc0, 0x61, 0xb1, 0x6b, 0xba, 0xc3, 0xbe, 0x1a, 0xbb, + 0x81, 0xc0, 0x21, 0xbd, 0x0d, 0xc2, 0x49, 0xac, 0x80, 0xbe, 0xc0, 
0x34, 0xe7, 0xac, 0x09, 0xb1, + 0xc0, 0xb5, 0x17, 0xbd, 0x45, 0xb9, 0xba, 0x35, 0x6f, 0xbd, 0x91, 0xbd, 0x01, 0xbf, 0xca, 0xb9, + 0x2c, 0xad, 0xd7, 0x3d, 0x1a, 0xbb, 0x63, 0xbc, 0x1b, 0xc2, 0x46, 0xb0, 0xe2, 0xba, 0x06, 0xbc, + 0x2e, 0xba, 0xc0, 0xb8, 0xeb, 0xbc, 0xed, 0xbc, 0xe5, 0xb9, 0x47, 0xba, 0xd0, 0x37, 0xf7, 0xbc, + 0x72, 0xbe, 0x00, 0xbd, 0xdb, 0x2e, 0xbc, 0xb8, 0x5b, 0xbe, 0x3c, 0xbd, 0x69, 0xbe, 0x5d, 0x34, + 0xd2, 0xbf, 0x4f, 0xbf, 0xb2, 0xb9, 0x50, 0xbe, 0xfc, 0xbc, 0x5c, 0xb9, 0x9d, 0xc0, 0xc9, 0xbf, + 0x38, 0xc1, 0xfa, 0xc0, 0xa5, 0x3c, 0x67, 0xbc, 0xc6, 0xc0, 0x5a, 0x32, 0x92, 0xbd, 0x10, 0xc1, + 0x79, 0xc0, 0xe3, 0xbf, 0x0d, 0xba, 0xb0, 0xc1, 0x5f, 0xba, 0xb1, 0xbc, 0x42, 0xbc, 0x4e, 0x3f, + 0x4b, 0xb8, 0x77, 0x2f, 0x87, 0xc1, 0x89, 0xc0, 0xf9, 0xc0, 0x12, 0xbe, 0x19, 0xbe, 0x75, 0xb6, + 0xe1, 0xc2, 0xad, 0xbb, 0x3e, 0xbc, 0x23, 0xba, 0xcd, 0xbc, 0xe1, 0x37, 0x7c, 0xb9, 0xa8, 0xb1, + 0x07, 0xb4, 0xe9, 0x38, 0x12, 0xb7, 0x06, 0xbd, 0x2d, 0xb0, 0x4e, 0xc1, 0xc6, 0xc0, 0x9a, 0x39, + 0x49, 0x3c, 0x00, 0xbe, 0x24, 0xb5, 0x86, 0xbd, 0x9f, 0xb4, 0x64, 0xbf, 0xf7, 0xba, 0x5f, 0xbe, + 0x31, 0x36, 0x64, 0xbe, 0x41, 0x35, 0x35, 0xc1, 0x81, 0xbf, 0x7f, 0xbf, 0xb2, 0xbe, 0xf9, 0xbd, + 0x65, 0xc2, 0x09, 0xba, 0x20, 0x30, 0x10, 0xbd, 0xf2, 0xc1, 0x64, 0xc0, 0xab, 0xbc, 0x43, 0xc0, + 0xd1, 0xb8, 0xd0, 0xbe, 0x09, 0xb9, 0xac, 0xbd, 0x27, 0xb8, 0x14, 0xb8, 0x3b, 0xc0, 0x26, 0xb7, + 0x57, 0xbd, 0x3a, 0xbb, 0x20, 0x3b, 0xe7, 0xb9, 0xb3, 0x36, 0xeb, 0xbd, 0x4a, 0xb8, 0x6a, 0x34, + 0xae, 0x3d, 0xc4, 0xb6, 0x78, 0xbf, 0xa6, 0xbe, 0x3e, 0x2c, 0xb3, 0x3a, 0xcd, 0xbb, 0x71, 0xbe, + 0x69, 0xbc, 0x5a, 0x27, 0x90, 0xbd, 0x65, 0xbf, 0x9d, 0xbc, 0x76, 0xad, 0x28, 0xb7, 0x54, 0xbd, + 0xe7, 0xbe, 0x68, 0xb6, 0xe8, 0xaa, 0x46, 0xbe, 0xc4, 0xbd, 0x1e, 0xc0, 0x15, 0x2a, 0x7c, 0xba, + 0xf9, 0xbd, 0x6b, 0xbd, 0x55, 0x3b, 0x07, 0xbd, 0x07, 0xc0, 0x85, 0xb8, 0xd5, 0xb4, 0x30, 0xc0, + 0x1c, 0x27, 0x27, 0xbb, 0xef, 0xbd, 0x37, 0xbb, 0x65, 0xb8, 0x76, 0x33, 0x9b, 0xbc, 0x89, 0xbc, + 0x64, 
0xc2, 0x06, 0xba, 0x39, 0x3c, 0xd6, 0xb9, 0x35, 0xc0, 0xb9, 0xbf, 0xcf, 0xb6, 0x4d, 0xbf, + 0x72, 0xbb, 0x85, 0xbd, 0x34, 0xb0, 0xd1, 0xbe, 0x5c, 0xb9, 0x07, 0x35, 0x03, 0xb9, 0xea, 0xbc, + 0x00, 0xc0, 0x0d, 0xc1, 0x2f, 0xbc, 0x1b, 0xc0, 0x1f, 0xbf, 0x72, 0xbb, 0x83, 0xbc, 0x0e, 0xba, + 0xb0, 0xad, 0xd9, 0xb6, 0xc5, 0xbd, 0x80, 0xbf, 0xc6, 0xbc, 0x54, 0xb9, 0x8a, 0xbc, 0x95, 0xbc, + 0x67, 0xbe, 0x16, 0xa7, 0x9a, 0xbf, 0xc2, 0x33, 0xa6, 0xbd, 0xa3, 0xb9, 0x08, 0xc0, 0xe6, 0xbb, + 0xc5, 0x37, 0x12, 0xbc, 0xd8, 0xbf, 0x92, 0xbd, 0x71, 0xc0, 0xa7, 0x38, 0x43, 0xb8, 0x27, 0xbd, + 0x55, 0xbd, 0x21, 0xb8, 0xe8, 0xa9, 0x9e, 0x3d, 0x87, 0xbe, 0x43, 0xc0, 0xa8, 0xba, 0x66, 0xb2, + 0x0d, 0xb8, 0xa8, 0xb2, 0x50, 0xb4, 0x3b, 0xbe, 0xc0, 0xbe, 0xf4, 0x32, 0xda, 0xbd, 0x71, 0xbc, + 0x10, 0xbd, 0xc3, 0xb6, 0x0c, 0xbf, 0xb1, 0xbc, 0xbe, 0xbd, 0xf9, 0xba, 0xe5, 0x34, 0xfa, 0xbc, + 0x1e, 0xb9, 0xec, 0xb7, 0x72, 0xb8, 0x96, 0xbf, 0xa0, 0xbc, 0xea, 0xac, 0x36, 0x2c, 0xf8, 0xc0, + 0x5f, 0x38, 0xae, 0xc0, 0x80, 0x3c, 0xab, 0xc1, 0x3f, 0xbf, 0xde, 0xc1, 0x12, 0xb7, 0x85, 0xc0, + 0xc2, 0xbf, 0xa4, 0xba, 0x4d, 0xbd, 0x2e, 0x3a, 0x26, 0x30, 0x4e, 0xbe, 0x09, 0x38, 0x2d, 0xb9, + 0xa6, 0xbc, 0xe7, 0x38, 0x6c, 0xc0, 0x9e, 0x36, 0xd7, 0xbb, 0x86, 0xc0, 0xa1, 0xbd, 0xb9, 0xba, + 0x6c, 0xa4, 0x9b, 0xbe, 0x94, 0xbc, 0x91, 0xaa, 0x98, 0x3a, 0xb5, 0x3a, 0x1a, 0xc1, 0x36, 0xc2, + 0x28, 0xbd, 0x5d, 0xbc, 0x97, 0xbc, 0x2e, 0xbc, 0x55, 0xc0, 0x94, 0xbc, 0xa5, 0xbc, 0xcb, 0xa1, + 0x25, 0x9d, 0xe3, 0xbd, 0x19, 0xbf, 0x89, 0x1b, 0x9b, 0xbf, 0x9d, 0xbf, 0x59, 0xbc, 0xeb, 0xb2, + 0x4f, 0xb8, 0x6b, 0xbc, 0x20, 0xc2, 0xb6, 0xb4, 0xef, 0xc0, 0x72, 0xbe, 0xed, 0xba, 0xbd, 0xbe, + 0x5b, 0x32, 0x1a, 0xbd, 0x9c, 0xc2, 0xbd, 0xba, 0x19, 0xc0, 0x94, 0xc0, 0x75, 0x3b, 0x5f, 0xbe, + 0x8c, 0xbe, 0x8d, 0x32, 0xf2, 0xbd, 0xd1, 0xc0, 0xa8, 0xbd, 0xf7, 0x2e, 0xad, 0x36, 0x9c, 0xbd, + 0x75, 0x3c, 0x7d, 0xb8, 0x9e, 0xbe, 0xde, 0x29, 0x3d, 0xbf, 0x29, 0xc0, 0x47, 0xbd, 0x39, 0xbf, + 0x71, 0xbd, 0x32, 0xc1, 0x25, 0xb8, 0xb2, 
0xb5, 0x7e, 0xae, 0x7c, 0x38, 0x5f, 0xbc, 0xa0, 0xb6, + 0xc9, 0xc0, 0xf2, 0xbc, 0x74, 0xbc, 0x2f, 0x37, 0xa0, 0xb2, 0xfc, 0xbc, 0x09, 0xc2, 0xc6, 0x35, + 0x45, 0xc1, 0x62, 0xc1, 0x18, 0xc4, 0x25, 0xbb, 0x74, 0xba, 0x83, 0xb9, 0x6b, 0x36, 0x7b, 0xbc, + 0xa2, 0xb0, 0xf8, 0xbe, 0x20, 0xbe, 0xfc, 0xba, 0x35, 0xbe, 0x51, 0xbe, 0xbf, 0xbd, 0x4d, 0x3d, + 0x15, 0xb4, 0xd8, 0xbd, 0x37, 0xc0, 0x93, 0xbc, 0x9d, 0xbc, 0xdd, 0xbd, 0xd5, 0xc0, 0x1c, 0xbe, + 0x09, 0xc1, 0x97, 0xc0, 0xe9, 0xba, 0x22, 0xba, 0xc6, 0xbe, 0x27, 0xbe, 0x38, 0xb9, 0x99, 0xb6, + 0xca, 0x38, 0x1d, 0xc1, 0xdc, 0xb4, 0x9c, 0xbe, 0xeb, 0xbe, 0x63, 0xba, 0x9f, 0xbc, 0xef, 0xc1, + 0xa8, 0xae, 0x9d, 0xbc, 0x21, 0x31, 0x5e, 0xbc, 0x34, 0xc1, 0x3f, 0xbd, 0x2b, 0xb0, 0x4c, 0xba, + 0x55, 0xbe, 0x83, 0xc0, 0x6f, 0xc1, 0x92, 0xb6, 0x99, 0x35, 0x94, 0x35, 0x0a, 0xb2, 0x11, 0xbf, + 0x0f, 0xa1, 0xb8, 0x1e, 0x69, 0xbe, 0x49, 0xba, 0xd2, 0xbd, 0xa4, 0x37, 0xb8, 0xb8, 0x1b, 0xb9, + 0x37, 0xbc, 0x7c, 0xbe, 0xba, 0x2c, 0x1b, 0xc3, 0x2a, 0x32, 0x25, 0xbb, 0x35, 0xc1, 0x44, 0xbe, + 0x91, 0xba, 0x39, 0xc0, 0xee, 0x34, 0xd7, 0xc2, 0xd4, 0x94, 0x2c, 0xbe, 0xd3, 0xc0, 0x6a, 0xb1, + 0x21, 0x34, 0x65, 0xb9, 0x78, 0x35, 0x30, 0x3d, 0xdc, 0xbe, 0x71, 0xbf, 0xa2, 0xb9, 0x02, 0xbd, + 0x67, 0xbc, 0x06, 0xc0, 0x49, 0xaa, 0x7c, 0xbd, 0xc7, 0xb0, 0xdc, 0xbf, 0x9c, 0xb8, 0x3c, 0xb9, + 0x35, 0xbc, 0xf7, 0xb5, 0xfa, 0xbe, 0x0c, 0x34, 0x3d, 0xbd, 0x68, 0xbf, 0xba, 0xb9, 0x20, 0xb7, + 0x6e, 0xbf, 0x0b, 0xad, 0x5a, 0xbf, 0xf9, 0xbd, 0xe8, 0xbc, 0x77, 0xc0, 0x30, 0xbe, 0x0b, 0xbf, + 0xeb, 0xae, 0x1e, 0xb8, 0xd6, 0xc1, 0x06, 0xb9, 0xf2, 0xbe, 0x0c, 0xbc, 0x65, 0xbc, 0x95, 0xbc, + 0xb5, 0xba, 0x7d, 0xb9, 0x76, 0xb8, 0x95, 0x34, 0x88, 0xbe, 0x53, 0xbe, 0x49, 0xbe, 0xd8, 0xbd, + 0xa4, 0xb9, 0xf2, 0xb8, 0x68, 0x21, 0x39, 0xc2, 0x88, 0xc0, 0x8d, 0xb8, 0x90, 0x37, 0xa2, 0xb5, + 0xce, 0xba, 0xa5, 0xbd, 0x27, 0xc0, 0x5a, 0xc0, 0x4a, 0xbd, 0x0c, 0xbf, 0x5c, 0xc0, 0x37, 0xb6, + 0x05, 0xc2, 0x58, 0xc1, 0xf5, 0xc1, 0xb4, 0xbb, 0xed, 0xb3, 0x5e, 0xbe, 0x17, 
0xb6, 0xce, 0xb9, + 0xfb, 0xb6, 0x9f, 0xbc, 0xb6, 0xbc, 0xe1, 0x30, 0x82, 0xc0, 0x1d, 0xb9, 0xf0, 0xb9, 0x1e, 0xbd, + 0x11, 0xb2, 0x3e, 0x3b, 0x14, 0xb9, 0x93, 0xbd, 0xdf, 0xbd, 0x81, 0xbd, 0x6b, 0xbb, 0xbd, 0xbe, + 0xb9, 0xa5, 0x06, 0xbb, 0x43, 0xb4, 0x08, 0xbe, 0x5c, 0x34, 0x57, 0xc1, 0x2e, 0xc1, 0xb3, 0xb9, + 0xa3, 0xbc, 0xd7, 0xb8, 0x14, 0xc0, 0xff, 0xba, 0x4c, 0xc1, 0x47, 0xbd, 0xe3, 0x35, 0x6d, 0xbc, + 0xf5, 0xbd, 0x0f, 0xbd, 0x2d, 0x21, 0x9a, 0x36, 0x8d, 0xbf, 0x0b, 0xbe, 0x80, 0xb8, 0xec, 0xb8, + 0xba, 0xbf, 0x45, 0xc0, 0xd3, 0xb6, 0xfc, 0xbc, 0xff, 0xba, 0x2c, 0xc3, 0x5e, 0xb9, 0x56, 0xbd, + 0x75, 0xbc, 0x27, 0x34, 0x08, 0xbd, 0x1b, 0xbd, 0xf4, 0xb8, 0x43, 0xb9, 0x95, 0xb6, 0x79, 0xbf, + 0xbc, 0xba, 0x50, 0xbd, 0xc6, 0xbe, 0x79, 0xb7, 0xe9, 0xbc, 0xe1, 0xb8, 0x65, 0x2a, 0x07, 0xb1, + 0x66, 0x39, 0xbc, 0x38, 0xd7, 0xbe, 0xdc, 0xb8, 0x0e, 0x3a, 0x23, 0xbe, 0x8e, 0xbc, 0xa3, 0xbb, + 0x41, 0xbb, 0x56, 0x29, 0x58, 0x2b, 0xef, 0xbe, 0x69, 0xc0, 0xbd, 0xbd, 0x8c, 0xb5, 0x63, 0xbe, + 0xb1, 0xbf, 0x93, 0xbe, 0xf3, 0xb8, 0xbe, 0x36, 0x4b, 0xbd, 0x4f, 0x38, 0xb6, 0xbe, 0xe9, 0xbe, + 0xbb, 0xba, 0x5d, 0x3c, 0xdb, 0x25, 0x3e, 0xc1, 0x65, 0xbc, 0x41, 0xbd, 0x22, 0xbe, 0xfa, 0x31, + 0x32, 0xbd, 0x4e, 0x38, 0xb7, 0xbe, 0x3f, 0xbc, 0x81, 0xad, 0x82, 0xbb, 0x22, 0xba, 0xe2, 0xb3, + 0x39, 0xbc, 0x7d, 0xb4, 0x3e, 0xc0, 0x2b, 0xbc, 0xaf, 0xb9, 0x91, 0xbd, 0x51, 0xc0, 0x27, 0xc1}; +unsigned char conv2d_winograd_fp16_ker[] = { + 0x28, 0xbe, 0x1c, 0xc0, 0x38, 0xbe, 0xde, 0xbb, 0xad, 0xbf, 0x2a, 0xc1, 0x53, 0xc0, 0x29, 0xbd, + 0xea, 0xc0, 0xd5, 0xbc, 0x63, 0xba, 0x39, 0xbf, 0xe7, 0xc1, 0x9f, 0xbc, 0x45, 0xc4, 0x97, 0xc1, + 0xe0, 0xb9, 0x52, 0xc1, 0x1a, 0xc1, 0xa2, 0xc0, 0x6d, 0xc2, 0xb0, 0xbf, 0x7f, 0xc0, 0x4f, 0xb6, + 0x5d, 0xbc, 0x61, 0xbc, 0x0e, 0xbf, 0x43, 0xc2, 0xe8, 0xc0, 0x83, 0xc1, 0x02, 0xbf, 0x01, 0xba, + 0xeb, 0xc0, 0x83, 0xc4, 0x89, 0xbc, 0x10, 0xc3, 0xc8, 0xc0, 0xd1, 0xc0, 0x06, 0xb9, 0x1d, 0xc3, + 0x65, 0xc2, 0x91, 0xc1, 0xdc, 0xbe, 0x79, 0xbd, 0x29, 0xbe, 0x91, 
0xc0, 0xd4, 0xbf, 0x98, 0xc1, + 0x4b, 0xc1, 0x68, 0xc4, 0x55, 0xc3, 0x9b, 0xbd, 0x2a, 0xc2, 0x66, 0xc2, 0x42, 0xb9, 0x59, 0xbe, + 0xe0, 0xc0, 0xa1, 0xbc, 0xe8, 0xc0, 0xbc, 0xbf, 0xd1, 0xc3, 0x11, 0xbe, 0xf2, 0xc1, 0xe8, 0xbb, + 0x0c, 0xb0, 0x63, 0xc3, 0x9e, 0xc0, 0xf5, 0xba, 0x8f, 0xc1, 0x1d, 0xbf, 0x05, 0xc0, 0x0e, 0xc2, + 0x50, 0xbf, 0xef, 0xbf, 0x37, 0xc0, 0x0e, 0xbc, 0x87, 0xbd, 0x72, 0xbe, 0xab, 0xb8, 0xbd, 0xc2, + 0xed, 0xbf, 0x5f, 0xbd, 0x2e, 0xc0, 0x0e, 0xbd, 0xfc, 0xbe, 0x93, 0xc1, 0x53, 0xc1, 0x7e, 0xbc, + 0x35, 0xc0, 0x38, 0xc1, 0xbb, 0xaf, 0xba, 0xbe, 0xde, 0xc1, 0xa4, 0xbc, 0x33, 0xbe, 0xcd, 0xc1, + 0x08, 0xbb, 0x0c, 0xc0, 0x31, 0xc0, 0xad, 0xbd, 0x64, 0xc0, 0x4e, 0xbf, 0x91, 0xb9, 0xd5, 0xc1, + 0x95, 0xc0, 0x7d, 0xbf, 0x1c, 0xc2, 0x83, 0xbe, 0x3f, 0xc0, 0xda, 0xbd, 0x7a, 0xbe, 0x07, 0xc2, + 0xa1, 0xbe, 0x45, 0xb9, 0x32, 0xae, 0x44, 0xc0, 0xde, 0xc1, 0xdf, 0xbd, 0x7f, 0xbe, 0xa6, 0xc3, + 0x65, 0xc3, 0x4c, 0xbc, 0xbd, 0xbd, 0xea, 0xc1, 0x80, 0xc1, 0x60, 0xc0, 0x84, 0xc0, 0x9d, 0xc1, + 0x74, 0xbd, 0x75, 0xbe, 0x87, 0xbe, 0xf7, 0xbd, 0x43, 0xbf, 0xfa, 0xc1, 0x2a, 0xc2, 0x84, 0xbb, + 0x2f, 0xbf, 0x37, 0xc1, 0xb6, 0xba, 0x91, 0xc1, 0xc5, 0xc1, 0xee, 0xc2, 0x38, 0xc0, 0xe2, 0xbe, + 0x4b, 0xbe, 0x4c, 0xbd, 0x5e, 0xbe, 0x61, 0xc2, 0x9a, 0xad, 0xbf, 0xbe, 0x51, 0xba, 0x3b, 0xc1, + 0x89, 0xc1, 0xaa, 0xbf, 0x01, 0xbd, 0x3f, 0xc2, 0x05, 0xbe, 0xcd, 0xbc, 0xc3, 0xc0, 0x3d, 0xc2, + 0xab, 0xc3, 0x1c, 0xbe, 0x49, 0xc1, 0x0e, 0xc0, 0x20, 0xc1, 0x88, 0xc2, 0xfc, 0xbf, 0x3f, 0xb9, + 0xf9, 0xb4, 0xc2, 0xb8, 0x94, 0xbe, 0xe1, 0xbf, 0x36, 0xbd, 0x24, 0xc2, 0x84, 0xc1, 0xc7, 0xc1, + 0x1f, 0x33, 0x2a, 0xbf, 0x4b, 0xc0, 0xa3, 0xbf, 0x57, 0xba, 0xbc, 0xba, 0x4f, 0xc0, 0xbe, 0x33, + 0x3d, 0xc3, 0x77, 0xc0, 0x65, 0xb4, 0x18, 0xbd, 0x51, 0xc1, 0xdc, 0xbe, 0xc8, 0xb9, 0x4c, 0xc0, + 0x16, 0x35, 0xbe, 0xbc, 0x31, 0xc1, 0xe4, 0xbd, 0x57, 0xbc, 0x49, 0xc1, 0xd4, 0xbd, 0xeb, 0xba, + 0x02, 0xc1, 0xa8, 0xbb, 0xcd, 0xc0, 0x7b, 0xc0, 0x21, 0xb2, 0x61, 0xc0, 0x8a, 0xc1, 0xe4, 0xbe, + 0x0f, 
0xc2, 0xaf, 0xc0, 0x70, 0xc3, 0xd2, 0xbc, 0x67, 0xbd, 0xd9, 0xc1, 0x4e, 0xc2, 0x6e, 0xc1, + 0x1e, 0xc4, 0x09, 0xc3, 0x42, 0xbf, 0x50, 0xc1, 0x52, 0xbd, 0x77, 0xc3, 0x1d, 0xc0, 0x31, 0xbb, + 0xd2, 0xbe, 0x66, 0xc3, 0x9b, 0xbc, 0x4d, 0xbf, 0x66, 0xb6, 0x02, 0xc2, 0xbe, 0xc3, 0xd1, 0x28, + 0xef, 0xc2, 0x11, 0xbd, 0x9d, 0xc2, 0xd9, 0xbd, 0xb0, 0xbe, 0xd9, 0xbf, 0x49, 0xc2, 0x71, 0x9e, + 0x5b, 0xb5, 0x59, 0xc2, 0xf6, 0xbd, 0x4a, 0xb5, 0x12, 0xbd, 0x19, 0xbe, 0x73, 0xc3, 0xe5, 0xbc, + 0xec, 0xbc, 0x2d, 0xbf, 0x43, 0xbe, 0xfc, 0xc0, 0x68, 0xbc, 0x24, 0xc0, 0x7f, 0xc0, 0x8c, 0xc0, + 0x92, 0xba, 0x52, 0xba, 0x42, 0xc0, 0x18, 0xb9, 0x14, 0x3c, 0x11, 0xc2, 0xa2, 0xc2, 0x10, 0xbd, + 0xaa, 0xc0, 0x0f, 0xc0, 0x38, 0xc0, 0xa3, 0xc1, 0x58, 0xbe, 0x62, 0xc2, 0xe9, 0xc0, 0x36, 0xc0, + 0xc6, 0xc1, 0x21, 0xbc, 0xf5, 0xc2, 0x42, 0xbd, 0x35, 0xbc, 0xda, 0xc1, 0xcb, 0xbb, 0x5f, 0xba, + 0x2b, 0xbd, 0xff, 0xc2, 0x5f, 0xab, 0xc7, 0x2c, 0x41, 0xc0, 0x2e, 0xbe, 0x38, 0xc0, 0xf7, 0xc3, + 0x60, 0xbd, 0x73, 0xc2, 0x01, 0xbf, 0x3b, 0xc0, 0x8c, 0xc0, 0x88, 0xae, 0x26, 0xc0, 0x2a, 0xbf, + 0xd5, 0xc0, 0x9e, 0xc2, 0x75, 0xbe, 0x67, 0xc0, 0xc8, 0xbf, 0x7d, 0xbe, 0xf9, 0xc0, 0xaf, 0xbc, + 0x40, 0xba, 0x30, 0xbf, 0x19, 0xc1, 0x16, 0xc3, 0x10, 0xc0, 0x85, 0xb0, 0x31, 0xc3, 0xae, 0xbd, + 0xb0, 0xc0, 0xd4, 0xbd, 0x06, 0xc1, 0x72, 0xbf, 0x02, 0xc0, 0x83, 0xb7, 0x02, 0xc2, 0x56, 0xc2, + 0xa9, 0xc1, 0x7b, 0xbf, 0xce, 0xc0, 0x2a, 0xbf, 0x02, 0xc0, 0x97, 0xc1, 0x91, 0xba, 0xda, 0xb9, + 0xf2, 0xbd, 0xa5, 0xc1, 0xd3, 0xbf, 0x65, 0xbb, 0x32, 0xc0, 0x33, 0xbf, 0x93, 0xbb, 0x73, 0xc0, + 0xa2, 0xbf, 0xe6, 0xc2, 0x29, 0xc2, 0xbc, 0xc1, 0xfa, 0xc0, 0x3d, 0xc1, 0x28, 0xc2, 0xa4, 0xc2, + 0x44, 0xb9, 0x1d, 0xc4, 0x0d, 0xbf, 0x05, 0xc0, 0xe0, 0xc0, 0xc3, 0xbf, 0x25, 0x2c, 0xc3, 0xc1, + 0x03, 0xbf, 0x58, 0xbf, 0x21, 0xbe, 0x3c, 0xbd, 0x6f, 0xc3, 0x89, 0xc1, 0x14, 0xc0, 0xce, 0xc3, + 0xd3, 0xbd, 0xeb, 0xc1, 0x28, 0xc2, 0x79, 0xc1, 0x57, 0xbf, 0xe3, 0xbe, 0xa8, 0xbc, 0xca, 0xc0, + 0x5a, 0xbd, 0xaa, 0xbe, 0x40, 0xbd, 0x0d, 
0xc1, 0x5b, 0xb9, 0x8f, 0xbc, 0xc5, 0xc1, 0xfd, 0xb9, + 0x1a, 0xc0, 0x6a, 0xc1, 0xac, 0xc1, 0x89, 0xbf, 0xf2, 0xbc, 0x7e, 0xc3, 0x04, 0xc2, 0xbe, 0xc0, + 0x3b, 0xc0, 0x2a, 0xc1, 0x4a, 0xc2, 0xa4, 0xc1, 0x60, 0xc2, 0x3b, 0xbd, 0x75, 0x35, 0xcc, 0xc0, + 0xbe, 0xc1, 0x74, 0xc0, 0x8e, 0xc0, 0xb6, 0xc0, 0xa1, 0xc0, 0x59, 0xc1, 0xbe, 0xc0, 0xe9, 0xbc, + 0x9f, 0xbe, 0x6e, 0xbe, 0x54, 0xc0, 0x28, 0xc2, 0x05, 0xbc, 0xf1, 0xc1, 0x26, 0xa7, 0x6b, 0xbe, + 0x4b, 0xbd, 0xc4, 0xb9, 0x48, 0xbe, 0x0b, 0xbb, 0x68, 0xbf, 0xe9, 0xbc, 0xe5, 0xbc, 0xdc, 0xc1, + 0xdc, 0xc4, 0xcd, 0xc1, 0xf7, 0xa4, 0xb1, 0x35, 0x32, 0xc0, 0x9c, 0xbe, 0x3a, 0xc0, 0x13, 0xc0, + 0x76, 0xb8, 0x47, 0xb9, 0x26, 0xc1, 0x25, 0xc2, 0x40, 0x38, 0x4c, 0xc2, 0xfb, 0x30, 0x32, 0xc0, + 0xb0, 0xb6, 0xaa, 0xbc, 0x7f, 0xc1, 0x42, 0xc0, 0xd5, 0xbf, 0x8d, 0xc1, 0xe0, 0xbe, 0x4b, 0xba, + 0x77, 0xbf, 0x16, 0xbe, 0xfc, 0xbf, 0x13, 0xc0, 0x52, 0xc0, 0x82, 0xc0, 0xf7, 0xbf, 0xe5, 0xb0, + 0x44, 0xc2, 0xe6, 0xbe, 0x8b, 0xba, 0x75, 0xbd, 0xb6, 0xc1, 0xcb, 0xbd, 0xb1, 0xc0, 0x28, 0xc3, + 0x09, 0xc3, 0xaa, 0xc0, 0xda, 0xbc, 0xde, 0xbd, 0x90, 0xb6, 0xeb, 0xc2, 0x13, 0xc0, 0x6e, 0xc2, + 0x40, 0xbd, 0x0a, 0xc0, 0xfb, 0xbc, 0x3c, 0xb8, 0xf1, 0xbf, 0x9f, 0xc0, 0xac, 0xc2, 0x8b, 0xc0, + 0x31, 0xc2, 0xbe, 0xc1, 0xc8, 0xbf, 0x19, 0xb9, 0x8f, 0xbc, 0x38, 0xbd, 0x2c, 0xc0, 0x4e, 0xc2, + 0xa9, 0xc3, 0x77, 0xc1, 0xa3, 0xbe, 0x2c, 0xc2, 0x67, 0xbe, 0x0b, 0xbe, 0xf1, 0xbc, 0xf6, 0xc0, + 0x58, 0xb7, 0x3a, 0xbf, 0xef, 0xbf, 0x6d, 0x3b, 0xe3, 0xc3, 0x04, 0xc4, 0x38, 0xc2, 0xdf, 0xbe, + 0x03, 0xbf, 0x88, 0xba, 0x13, 0xc0, 0x52, 0xbc, 0x85, 0xbe, 0x9a, 0xc4, 0x05, 0xbf, 0x96, 0xbb, + 0xab, 0xb3, 0x39, 0xb7, 0xfc, 0xc2, 0x64, 0xbf, 0x3a, 0xc2, 0xc1, 0xc1, 0xf3, 0xc1, 0x76, 0xbf, + 0x37, 0xbc, 0xd2, 0x33, 0xcb, 0xc0, 0x86, 0xc1, 0x10, 0xc1, 0x61, 0xc0, 0x60, 0xc1, 0xc8, 0xc0, + 0x36, 0xc0, 0x3d, 0xc0, 0xba, 0xb5, 0x60, 0xbc, 0x88, 0xbe, 0xe2, 0xbe, 0x52, 0xc1, 0xff, 0xc2, + 0xb7, 0xb1, 0x8f, 0xc0, 0x8a, 0xbd, 0xf6, 0xc0, 0xb7, 0xbe, 0x4f, 0xbe, 0x19, 
0xc2, 0xa0, 0xc0, + 0xae, 0xbf, 0xf8, 0xc1, 0x94, 0xc3, 0xdc, 0xbd, 0x4b, 0xbf, 0x87, 0xbe, 0x43, 0xc0, 0x02, 0xc3, + 0xa2, 0xc2, 0x35, 0xbc, 0x47, 0xc3, 0xfc, 0x38, 0x0c, 0xbb, 0x71, 0xbd, 0xde, 0xc0, 0x2d, 0xbc, + 0x78, 0xbd, 0x65, 0xc2, 0x0e, 0xbc, 0x1c, 0xbc, 0x09, 0xc2, 0x22, 0xbe, 0xe2, 0xc1, 0xdd, 0xbb, + 0x58, 0xc0, 0x0e, 0xc0, 0x16, 0xc2, 0x80, 0xc1, 0xfc, 0xbc, 0x2c, 0xc2, 0x99, 0xc3, 0x07, 0xc1, + 0xa7, 0xbc, 0x4d, 0xc1, 0x4e, 0xc2, 0xb0, 0xba, 0x04, 0xbc, 0x27, 0xc0, 0x84, 0xbc, 0x68, 0xc0, + 0x91, 0xc2, 0x75, 0xb9, 0x54, 0xc0, 0x61, 0xc1, 0xdb, 0xbe, 0x77, 0xbb, 0x44, 0xbd, 0x80, 0xc2, + 0xf0, 0x2b, 0xe4, 0xbe, 0xcd, 0xb8, 0x5b, 0xc1, 0x21, 0xc0, 0x02, 0xba, 0xf2, 0xbd, 0x67, 0xc0, + 0xe6, 0xba, 0x58, 0xc2, 0x96, 0xbb, 0xa6, 0xc2, 0x44, 0xbf, 0x63, 0xc0, 0xde, 0xc0, 0x0d, 0xc1, + 0x72, 0xc1, 0x28, 0xc3, 0xd6, 0xc1, 0x1c, 0xb9, 0x4c, 0xbf, 0x49, 0xbf, 0xb8, 0xb4, 0xd5, 0xc2, + 0x9f, 0xc1, 0x53, 0xba, 0x09, 0xc2, 0xd8, 0x30, 0xd3, 0xc0, 0xd8, 0xbe, 0x28, 0xbe, 0x5e, 0xc0, + 0x2f, 0xc3, 0xf4, 0xbd, 0x3d, 0xbd, 0x37, 0xc0, 0xeb, 0xc0, 0x21, 0xc0, 0xe2, 0xb9, 0x20, 0xb9, + 0xa5, 0xc0, 0xe6, 0xbe, 0x16, 0xc4, 0x07, 0xbc, 0x93, 0xbd, 0x95, 0xc1, 0x91, 0xb5, 0xaa, 0xc1, + 0xa1, 0xbe, 0x8a, 0xba, 0xf4, 0xbc, 0xf1, 0xc1, 0x46, 0xc1, 0x8f, 0xbd, 0xa0, 0xbd, 0x21, 0xc0, + 0xc1, 0xc0, 0x9f, 0xbc, 0x3c, 0xc1, 0x61, 0xc1, 0xc4, 0xbe, 0x76, 0xbd, 0x69, 0xc0, 0xb0, 0xbe, + 0x21, 0xbc, 0x09, 0xc0, 0x86, 0xc1, 0x51, 0xbc, 0x7d, 0xbf, 0xad, 0xbf, 0xec, 0xbb, 0x98, 0xc0, + 0x0e, 0xc1, 0x13, 0xc1, 0x06, 0xc1, 0x38, 0xbd, 0x2e, 0xbe, 0xd1, 0xc0, 0x5c, 0xb4, 0xfd, 0xbd, + 0x49, 0xb0, 0x6b, 0xc0, 0x25, 0xc1, 0x7b, 0xbf, 0x91, 0xc0, 0x4a, 0xc4, 0x07, 0xc0, 0xf0, 0xbd, + 0x5a, 0xbf, 0x40, 0xc0, 0x17, 0xbf, 0xd4, 0xbf, 0xd2, 0xbe, 0x76, 0xc2, 0x33, 0xc2, 0x2a, 0xb2, + 0x28, 0xbd, 0x75, 0xc1, 0xa0, 0xbe, 0x0d, 0xc4, 0x57, 0xbc, 0x78, 0xc2, 0x2e, 0xc3, 0x62, 0xbe, + 0xfb, 0xbe, 0x48, 0xa9, 0x93, 0xc0, 0x9e, 0xc1, 0xaf, 0xc1, 0x76, 0xc0, 0x94, 0xc1, 0xfb, 0xbf, + 0xc8, 0xc1, 0xdc, 
0xbe, 0xca, 0xbb, 0x23, 0xbe, 0xfd, 0xc4, 0x2c, 0xc0, 0x46, 0xc0, 0xd3, 0xc4, + 0xab, 0xc2, 0x84, 0xbb, 0x64, 0xc1, 0x2d, 0xb4, 0x25, 0xbd, 0x8c, 0xb8, 0xaa, 0xc1, 0x75, 0xc2, + 0x0f, 0xbf, 0x28, 0xc0, 0xde, 0xbf, 0x6e, 0xc2, 0xfc, 0xb7, 0x6d, 0xb9, 0x5c, 0xbe, 0xa4, 0xc4, + 0x27, 0xc0, 0xc4, 0xc2, 0x72, 0xb4, 0x43, 0xc2, 0xe8, 0xc2, 0xb5, 0xbd, 0x2b, 0xbe, 0xd6, 0xc3, + 0xc1, 0xb8, 0x5f, 0xc1, 0xde, 0xc0, 0x96, 0xbf, 0x99, 0xb9, 0x0e, 0xbd, 0x8b, 0xbb, 0x43, 0xbe, + 0xa3, 0xc1, 0x97, 0xbf, 0xa3, 0xbf, 0x08, 0xbf, 0x27, 0xbf, 0xae, 0xc1, 0x39, 0xbd, 0xf1, 0xbf, + 0x79, 0xc1, 0x54, 0xbf, 0xbc, 0xc2, 0xd6, 0xbe, 0x5a, 0xbc, 0x4d, 0xbe, 0x8d, 0xb9, 0xd2, 0xc2, + 0xe0, 0xc0, 0xd5, 0xc2, 0x7e, 0xbf, 0x31, 0xbf, 0x03, 0xbe, 0xa7, 0xbe, 0x22, 0xc0, 0x3a, 0xc0, + 0xf2, 0xbc, 0x39, 0xb9, 0x9c, 0x3c, 0x89, 0xbd, 0x2a, 0xc1, 0x02, 0xc0, 0x88, 0xc0, 0x07, 0xc2, + 0x92, 0xc1, 0xc3, 0xbb, 0x88, 0xbe, 0xe9, 0xba, 0x19, 0xbe, 0x70, 0xc1, 0xd4, 0xbc, 0xd5, 0xbc, + 0xb6, 0xbe, 0x1f, 0xc0, 0xdc, 0xbf, 0xa8, 0xc2, 0x88, 0xbf, 0xe5, 0xc0, 0x21, 0xc0, 0xeb, 0xbf, + 0xac, 0xbe, 0x3c, 0xc0, 0xb0, 0xc2, 0xdf, 0xc0, 0xb7, 0xc1, 0xa8, 0xc3, 0x2b, 0xb5, 0xd0, 0xb2, + 0x74, 0xbe, 0xe4, 0xb5, 0xb4, 0xbd, 0x44, 0xc1, 0x1c, 0xbb, 0x96, 0xc3, 0xfb, 0xba, 0xa2, 0xc3, + 0x84, 0xc1, 0x40, 0xbc, 0xe0, 0xbd, 0xd7, 0xbe, 0x80, 0xc1, 0x75, 0xc0, 0xb2, 0xc0, 0x7d, 0xc2, + 0xc0, 0xbc, 0x0e, 0xbc, 0xb9, 0xbe, 0x76, 0xb9, 0xc0, 0xc2, 0xcb, 0xbf, 0xef, 0xc0, 0x2f, 0xbe, + 0xb3, 0xbe, 0x22, 0xbe, 0x9b, 0xb8, 0xd4, 0xc0, 0x5b, 0xc1, 0xe8, 0xc1, 0x9a, 0xc0, 0x04, 0xbf, + 0x18, 0xbf, 0x87, 0xbc, 0x3e, 0xc0, 0x42, 0xc2, 0x24, 0xc0, 0xba, 0xbb, 0x1f, 0xc1, 0x4d, 0xbd, + 0xbe, 0xb9, 0x24, 0xc0, 0x22, 0xc0, 0x37, 0xbe, 0x61, 0xbd, 0xdd, 0xbb, 0xb8, 0xc1, 0x52, 0xbe, + 0x0e, 0xc0, 0x64, 0xb8, 0x4c, 0xbe, 0xd2, 0xba, 0xef, 0xc2, 0x82, 0xc3, 0x45, 0xb9, 0xa1, 0xba, + 0x63, 0xc0, 0x10, 0xc2, 0x14, 0xc2, 0xd1, 0xc1, 0x5d, 0xbf, 0x02, 0xbf, 0x1a, 0xac, 0x59, 0xc1, + 0x41, 0xbe, 0x99, 0xb4, 0x75, 0xc2, 0xf2, 0x37, 0xb7, 
0xc0, 0x55, 0xc1, 0xb0, 0xba, 0x8d, 0xbe, + 0x65, 0xbd, 0x45, 0xc0, 0x1f, 0xbd, 0x77, 0xbc, 0x49, 0xc2, 0x39, 0xc1, 0xcb, 0xb8, 0x2d, 0xbe, + 0x90, 0xbb, 0x0e, 0xc2, 0x35, 0xc0, 0xad, 0xc3, 0x86, 0xba, 0xb5, 0xc2, 0x07, 0xc0, 0xcd, 0xbd, + 0x2f, 0xc1, 0x1c, 0xc1, 0x0d, 0xc2, 0x13, 0xc1, 0x16, 0xc1, 0xee, 0xba, 0x13, 0xba, 0xd7, 0xc4, + 0xf8, 0xc1, 0xfe, 0xba, 0xf1, 0xbe, 0xba, 0xbb, 0x67, 0xbf, 0xa4, 0xc4, 0xd2, 0xb5, 0x9b, 0xc2, + 0xdc, 0xc0, 0xe4, 0xbf, 0x94, 0xc0, 0x45, 0xbd, 0xf2, 0xc1, 0xa0, 0xbd, 0xd4, 0x33, 0x8b, 0xc3, + 0x51, 0xbf, 0x48, 0xbd, 0xc2, 0xb5, 0xcc, 0xc2, 0x05, 0xbf, 0x59, 0xc0, 0x18, 0xbe, 0x41, 0x32, + 0xf3, 0xc0, 0x0e, 0xbf, 0xe6, 0xba, 0xd8, 0xc3, 0x19, 0xc0, 0x2f, 0xbb, 0xb9, 0xbe, 0xb4, 0xc2, + 0x1e, 0xc0, 0x4a, 0xc1, 0xa2, 0x39, 0xad, 0xc2, 0x9a, 0xc2, 0x57, 0xc3, 0x64, 0xc0, 0xc5, 0xc3, + 0x89, 0xc3, 0x8f, 0xb6, 0x7b, 0xc2, 0x27, 0xc0, 0x41, 0xc0, 0x25, 0xc0, 0x7f, 0xc0, 0x3a, 0xc0, + 0x70, 0xc1, 0x5a, 0xb9, 0x99, 0xbd, 0x8e, 0x33, 0x65, 0xc1, 0x6d, 0xc0, 0x3c, 0xbe, 0x69, 0xbf, + 0x11, 0xc3, 0x26, 0xbc, 0x60, 0xc0, 0x52, 0xbf, 0xee, 0xc1, 0x9a, 0xbf, 0x27, 0xc0, 0xf7, 0xc0, + 0x81, 0xbe, 0xef, 0xc2, 0x7b, 0xbd, 0xc1, 0xc2, 0x2f, 0xc1, 0xcd, 0xbc, 0xa5, 0xc0, 0x0c, 0xbf, + 0x77, 0xc1, 0x60, 0xb8, 0xdc, 0xc0, 0x17, 0xb8, 0x67, 0xbd, 0xb0, 0xbc, 0x4f, 0xbf, 0x96, 0xc1, + 0x6e, 0xc1, 0xc2, 0xb5, 0x48, 0xbb, 0xcb, 0xbf, 0xc0, 0xc2, 0xba, 0xbf, 0x60, 0xba, 0xba, 0xb8, + 0x0f, 0xc4, 0x93, 0xc1, 0x2f, 0xc0, 0x69, 0xc1, 0x09, 0xc1, 0xa6, 0xb8, 0xe6, 0xbe, 0x02, 0xc1, + 0xdf, 0xc0, 0xca, 0xc0, 0x8b, 0xc0, 0x22, 0xc0, 0xa3, 0xc0, 0x5b, 0xbe, 0xea, 0xc3, 0x3d, 0xc0, + 0x87, 0xc1, 0xbe, 0xc3, 0x37, 0xc2, 0x86, 0xbd, 0x82, 0xbd, 0x59, 0xc0, 0x08, 0xbc, 0x10, 0xc2, + 0x81, 0xc1, 0xd3, 0xbc, 0xe7, 0xbd, 0xe5, 0xbe, 0x6c, 0xc0, 0x25, 0xbd, 0x41, 0x21, 0x62, 0xc1, + 0x2d, 0xbf, 0xdd, 0xc0, 0x53, 0xbf, 0x11, 0xbe, 0x33, 0xb7, 0x34, 0xb9, 0x5c, 0xc3, 0x5e, 0xc1, + 0x32, 0xc2, 0x0d, 0x34, 0xa7, 0xc0, 0xe3, 0xbc, 0xa2, 0xc2, 0x25, 0xc1, 0x1f, 0xc1, 0xa0, 
0xbf, + 0xa3, 0xc0, 0x73, 0xc0, 0xe8, 0xbb, 0x4a, 0xc1, 0xbc, 0xc0, 0x47, 0xc1, 0x21, 0xc2, 0x4d, 0xc1, + 0x99, 0xbc, 0x90, 0xc1, 0x12, 0xc1, 0x98, 0xc0, 0x2e, 0xbc, 0x8c, 0xbc, 0x25, 0xbe, 0x13, 0xbc, + 0xae, 0xb9, 0x62, 0xc0, 0x41, 0xc0, 0x1b, 0xc4, 0x1a, 0xc1, 0x0d, 0xc3, 0xb5, 0xbd, 0x76, 0xc0, + 0x1e, 0xad, 0x64, 0xbf, 0xb5, 0xb9, 0xe8, 0xbf, 0x11, 0xc0, 0xf8, 0xbe, 0xc1, 0xc4, 0x16, 0xc1, + 0xa5, 0xc0, 0x23, 0xc0, 0x73, 0xbe, 0x9a, 0xbd, 0xd0, 0xc0, 0x5d, 0xbf, 0xd7, 0xbf, 0x84, 0xbf, + 0x61, 0xc3, 0x29, 0xc1, 0x32, 0xc2, 0xbb, 0xbc, 0x78, 0xc0, 0xe1, 0x31, 0xfe, 0xc0, 0xdd, 0x27, + 0x86, 0xb2, 0x59, 0xbc, 0x1f, 0x38, 0x10, 0xc2, 0xba, 0xbd, 0x78, 0xc1, 0x87, 0xc0, 0x64, 0xb5, + 0x62, 0xc1, 0x24, 0xc1, 0x41, 0xbd, 0x6f, 0xb4, 0x3b, 0xb9, 0x47, 0xc0, 0x87, 0xc0, 0x1d, 0xbe, + 0x56, 0xc2, 0x9f, 0xc0, 0x6a, 0xc0, 0xfa, 0xc0, 0x03, 0xc3, 0x39, 0xb3, 0x42, 0xc2, 0xc4, 0xc1, + 0x1a, 0xc4, 0xb6, 0xc0, 0x3d, 0xbf, 0x37, 0xba, 0x15, 0xbe, 0x0f, 0xc2, 0x5c, 0xc0, 0xb8, 0xbe, + 0x99, 0xbf, 0x66, 0xc1, 0xea, 0xbe, 0xf1, 0xc2, 0x3d, 0xc0, 0xd9, 0xbf, 0x29, 0xbf, 0x8e, 0xbe, + 0x70, 0xbb, 0x3a, 0xc1, 0xc8, 0xbf, 0x85, 0xbe, 0x1f, 0xc1, 0x50, 0xc2, 0xfa, 0xbd, 0x3f, 0xb9, + 0x36, 0xc3, 0x6f, 0xbf, 0x2e, 0xbe, 0x69, 0xc0, 0xd1, 0xc0, 0x01, 0xc0, 0xc1, 0xc1, 0x88, 0xbd, + 0x95, 0xbc, 0x91, 0xc2, 0x05, 0xc2, 0x2e, 0xc3, 0x39, 0xbf, 0xef, 0xc2, 0x78, 0xbd, 0x15, 0xc1, + 0x73, 0xbe, 0xff, 0xbe, 0x3b, 0xc0, 0xef, 0xbd, 0x22, 0xc0, 0x67, 0xbd, 0x20, 0xbb, 0xab, 0xbc, + 0xef, 0xb9, 0x80, 0xc0, 0x4d, 0xc1, 0xdb, 0xc0, 0xfe, 0xbd, 0x4f, 0xc0, 0x6a, 0xc3, 0x2c, 0xc0}; +unsigned char conv2d_winograd_fp16_ker1[] = { + 0x28, 0xbe, 0x50, 0xbf, 0x4b, 0xbe, 0x1e, 0xc4, 0x60, 0xbd, 0xd3, 0xbd, 0xb0, 0xb6, 0xab, 0xb3, 0x91, 0xc2, 0x21, 0xbc, 0x27, 0xc0, 0x74, 0xbe, 0x65, 0xbd, 0x70, 0xc1, 0x2d, 0xbf, 0x62, 0xc1, + 0xd5, 0xbc, 0x5f, 0xbd, 0xaa, 0xbf, 0x66, 0xc3, 0x9e, 0xc2, 0xaa, 0xbe, 0x16, 0xbe, 0xd2, 0x33, 0xe4, 0xbe, 0x13, 0xc1, 0x5f, 0xc1, 0x40, 0xbc, 0x0e, 0xc2, 0x26, 0xbc, 0x0d, 0x34, 
0x9f, 0xc0, + 0x1a, 0xc1, 0xbb, 0xaf, 0x49, 0xc1, 0x9d, 0xc2, 0x19, 0xc1, 0xac, 0xc1, 0x8b, 0xba, 0xba, 0xb5, 0x96, 0xbb, 0x25, 0xc1, 0xa3, 0xbf, 0xb9, 0xbe, 0x0d, 0xc2, 0x7b, 0xbd, 0xe8, 0xbb, 0x3d, 0xbf, + 0x43, 0xc2, 0xad, 0xbd, 0xe1, 0xbf, 0x4a, 0xb5, 0x72, 0xbf, 0xa4, 0xc1, 0xde, 0xbd, 0xf6, 0xc0, 0x1c, 0xb9, 0xd4, 0xbf, 0xd6, 0xbe, 0xd4, 0xc0, 0xba, 0xbb, 0x17, 0xb8, 0x98, 0xc0, 0xf1, 0xc2, + 0xc8, 0xc0, 0x3f, 0xc0, 0x57, 0xba, 0x68, 0xbc, 0x02, 0xc0, 0xa1, 0xc0, 0xf1, 0xbf, 0x4b, 0xbf, 0xd3, 0xc0, 0x57, 0xbc, 0x03, 0xbe, 0x24, 0xc0, 0xf2, 0xc1, 0xc0, 0xc2, 0x1a, 0xc1, 0x1f, 0xc1, + 0x91, 0xc0, 0xdf, 0xbd, 0xdc, 0xbe, 0x11, 0xc2, 0x33, 0xbf, 0xf1, 0xc1, 0x38, 0xbd, 0x71, 0xbd, 0x21, 0xc0, 0x76, 0xc0, 0x02, 0xc0, 0xdd, 0xbb, 0x59, 0xc0, 0xa6, 0xb8, 0xf8, 0xbe, 0x01, 0xc0, + 0x42, 0xb9, 0x84, 0xc0, 0xd4, 0xbd, 0xe9, 0xc0, 0x28, 0xc2, 0xe5, 0xbc, 0xf1, 0xbc, 0xe2, 0xc1, 0x91, 0xb5, 0x46, 0xc0, 0xd4, 0xbc, 0x45, 0xb9, 0xb9, 0xbe, 0xea, 0xc3, 0xd7, 0xbf, 0x78, 0xbd, + 0xe8, 0xbb, 0x84, 0xbb, 0xe4, 0xbe, 0x5f, 0xba, 0xc3, 0xc1, 0x13, 0xc0, 0xdf, 0xbe, 0x07, 0xc1, 0x21, 0xc0, 0x75, 0xc2, 0xeb, 0xbf, 0x59, 0xc1, 0xc5, 0xc3, 0x10, 0xc2, 0xdd, 0x27, 0xab, 0xbc, + 0x2a, 0x3c, 0x16, 0x3a, 0xf0, 0x3c, 0xd9, 0x3f, 0xeb, 0x3c, 0xc3, 0x3c, 0x95, 0x3b, 0x7f, 0x3c, 0x7a, 0x3e, 0x84, 0x39, 0x00, 0x3e, 0x8c, 0x3c, 0x72, 0x39, 0x2f, 0x3b, 0x36, 0x3e, 0xa6, 0x3c, + 0x2e, 0x3e, 0x7d, 0x3b, 0xd0, 0x3d, 0x38, 0x3b, 0xb6, 0x3d, 0x7a, 0x39, 0xd2, 0x3a, 0x28, 0x3c, 0x53, 0x3d, 0xf5, 0x3c, 0x66, 0x3c, 0x45, 0x3e, 0xb4, 0x3c, 0xc4, 0x3d, 0x6b, 0x3c, 0xbb, 0x3f, + 0xf2, 0x3c, 0xae, 0x37, 0x87, 0x3d, 0xfb, 0x3c, 0x79, 0x3c, 0xba, 0x3f, 0x24, 0x3d, 0x03, 0x38, 0x36, 0x3d, 0xbb, 0x3f, 0xa6, 0x3e, 0xb6, 0x3c, 0x1c, 0x3e, 0xb6, 0x3c, 0x3f, 0x3c, 0xfd, 0x3d, + 0x2c, 0x40, 0x16, 0x3b, 0xcc, 0x3d, 0x32, 0x3d, 0xfc, 0x3d, 0x2e, 0x3c, 0xe8, 0x3c, 0x91, 0x3f, 0x21, 0x36, 0xea, 0x3e, 0x2c, 0x3d, 0x32, 0x3d, 0xde, 0x39, 0xcc, 0x38, 0x5a, 0x3d, 0x00, 0x3f, + 0xcf, 0x3e, 0xa6, 0x3c, 0xde, 0x31, 0xe4, 
0x3c, 0x2c, 0x3c, 0x12, 0x3d, 0x84, 0x3d, 0xf8, 0x3f, 0x40, 0x3d, 0x6a, 0x3c, 0x62, 0x38, 0xda, 0x3c, 0x50, 0x3e, 0x62, 0x3d, 0xe2, 0x3c, 0x3b, 0x3c, + 0xa1, 0x3d, 0x38, 0x3f, 0x1a, 0x39, 0x45, 0x3f, 0xd8, 0x3d, 0x99, 0x3c, 0x4e, 0x3f, 0xac, 0x3a, 0xcb, 0x3c, 0xea, 0x3d, 0x06, 0x3d, 0xde, 0x3a, 0x9e, 0x3f, 0x84, 0x3c, 0xdc, 0x3c, 0xfc, 0x3d, + 0x16, 0x3d, 0x0e, 0x3d, 0xa1, 0x38, 0x09, 0x3c, 0x47, 0x40, 0x88, 0x3d, 0x35, 0x3e, 0x86, 0x3d, 0xc2, 0x3a, 0xc1, 0x3b, 0x93, 0x3c, 0xea, 0x3c, 0xc7, 0x3e, 0x17, 0x40, 0x05, 0x3e, 0x9b, 0x3c, + 0x82, 0x3c, 0xa9, 0x3c, 0x6f, 0x3f, 0x44, 0x38, 0x62, 0x3e, 0xe6, 0x3e, 0x6d, 0x3f, 0xe1, 0x3e, 0x60, 0x3d, 0x38, 0x3d, 0x11, 0x40, 0x9e, 0x3f, 0x16, 0x40, 0x26, 0x3d, 0xc7, 0x37, 0x4e, 0x3d, + 0xd5, 0x38, 0xf8, 0x34, 0xdf, 0xb1, 0x40, 0x3a, 0xa2, 0x34, 0xa0, 0xa6, 0x00, 0x17, 0xdb, 0x34, 0xca, 0x36, 0xb5, 0x32, 0xe8, 0x2e, 0x30, 0xa8, 0xe8, 0x31, 0x02, 0x3c, 0x08, 0x3b, 0x28, 0x3c, + 0x7a, 0x33, 0x1e, 0x31, 0x46, 0x3a, 0xcc, 0x39, 0x81, 0x38, 0x34, 0x36, 0xe7, 0xae, 0x78, 0xad, 0x9c, 0x36, 0x6c, 0x38, 0x50, 0x3a, 0x89, 0x35, 0x82, 0x3a, 0xe8, 0x2f, 0xe9, 0xb5, 0x00, 0x36, + 0x1e, 0x36, 0x90, 0xa8, 0x75, 0xac, 0xfa, 0x35, 0x39, 0x3c, 0x49, 0x34, 0x21, 0x39, 0x36, 0xb4, 0x41, 0x35, 0xc0, 0x26, 0x6b, 0x36, 0x00, 0x35, 0x92, 0x3c, 0x28, 0x39, 0x20, 0xaf, 0xd6, 0x30, + 0x3c, 0x3d, 0x9d, 0x38, 0x20, 0x33, 0xb2, 0xb5, 0x2c, 0x31, 0xca, 0x3c, 0x27, 0x35, 0x4c, 0x38, 0x08, 0x34, 0xa3, 0x35, 0xe0, 0x3b, 0x6c, 0x34, 0x94, 0x38, 0xcd, 0xb2, 0x3f, 0x39, 0xa2, 0x3b, + 0xd4, 0x2f, 0xa4, 0xb1, 0xa7, 0x34, 0xce, 0x32, 0xbd, 0x39, 0xc7, 0x39, 0xe5, 0x35, 0xf7, 0x36, 0x79, 0x35, 0x52, 0x36, 0x44, 0xb6, 0xff, 0x38, 0x3a, 0xae, 0x56, 0x3c, 0x5a, 0x33, 0x22, 0x3a, + 0x62, 0x33, 0x2c, 0x31, 0x3b, 0x3a, 0x41, 0x3a, 0xe8, 0x38, 0x7e, 0x38, 0xf0, 0x2f, 0x42, 0x33, 0x52, 0x31, 0x38, 0x32, 0xc0, 0x24, 0x74, 0xa9, 0x6f, 0x3a, 0x08, 0x2a, 0xd2, 0x31, 0xe7, 0x3b, + 0x0e, 0x3a, 0x5e, 0x38, 0xea, 0x30, 0x66, 0x38, 0xfc, 0x34, 0xfc, 0x2d, 0xfe, 0x39, 0xad, 0x37, 0xb4, 
0x37, 0x6a, 0x38, 0x50, 0x33, 0x8c, 0xaf, 0x28, 0x38, 0x33, 0x35, 0xbd, 0x35, 0xfc, 0x35, + 0x88, 0x2e, 0x57, 0x3a, 0x98, 0x32, 0x0f, 0x38, 0x51, 0x3b, 0xa5, 0x38, 0x9c, 0x3b, 0x1d, 0x35, 0xc6, 0x31, 0xe5, 0x36, 0x62, 0x38, 0x82, 0x37, 0xd5, 0x38, 0x0c, 0x39, 0xb8, 0x39, 0xc4, 0x30, + 0x52, 0xb0, 0x67, 0xac, 0xe6, 0xaf, 0x46, 0xb2, 0xee, 0xb0, 0x1e, 0xb0, 0x1b, 0xb0, 0xa1, 0xb1, 0xf3, 0xb0, 0x16, 0xad, 0x28, 0xb1, 0x5f, 0xaf, 0x40, 0xac, 0x08, 0xae, 0xf2, 0xb2, 0x5f, 0xb0, + 0x80, 0xb2, 0xa2, 0xae, 0x30, 0xb2, 0x2f, 0xaa, 0x39, 0xb0, 0x44, 0xac, 0x97, 0xac, 0x1c, 0xb1, 0x36, 0xb1, 0x21, 0xb0, 0x5c, 0xaf, 0xf2, 0xb2, 0x70, 0xaf, 0x02, 0xb2, 0xfa, 0xb0, 0x69, 0xb3, + 0xa6, 0xaf, 0x3c, 0xac, 0x68, 0xaf, 0x18, 0xae, 0x57, 0xb0, 0xae, 0xb2, 0x52, 0xb2, 0x6b, 0xaa, 0xc4, 0xb1, 0x94, 0xb2, 0x97, 0xb2, 0x5d, 0xb0, 0xef, 0xb1, 0x3a, 0xb1, 0xb9, 0xaf, 0x80, 0xb1, + 0x63, 0xb4, 0x52, 0xaf, 0x35, 0xb1, 0x51, 0xb1, 0x74, 0xb1, 0xda, 0xaf, 0xd7, 0xb0, 0x4b, 0xb3, 0x3e, 0xaa, 0xc7, 0xb2, 0xf5, 0xb1, 0x1a, 0xb0, 0xd0, 0xae, 0xf8, 0xab, 0x06, 0xb1, 0x38, 0xb2, + 0xd1, 0xb1, 0x12, 0xae, 0x01, 0xa4, 0x09, 0xb1, 0x04, 0xb0, 0xc6, 0xb0, 0x16, 0xb1, 0x28, 0xb4, 0x46, 0xb0, 0xc6, 0xb0, 0x58, 0xa2, 0x9e, 0xb0, 0x40, 0xb0, 0x98, 0xb0, 0x04, 0xaf, 0x26, 0xaf, + 0xb0, 0xb0, 0x5a, 0xb3, 0xf4, 0xac, 0xbe, 0xb2, 0x13, 0xb2, 0x7f, 0xae, 0x93, 0xb3, 0xd6, 0xad, 0xa2, 0xaf, 0x08, 0xb1, 0xd8, 0xaf, 0x02, 0xae, 0x0e, 0xb4, 0xe0, 0xb0, 0x43, 0xb0, 0xa2, 0xb2, + 0x9e, 0xb2, 0x88, 0xb0, 0xe2, 0xa9, 0x34, 0xae, 0x7b, 0xb3, 0x7b, 0xb1, 0x54, 0xb3, 0x42, 0xb0, 0x74, 0xb0, 0x84, 0xae, 0x92, 0xb0, 0x02, 0xb1, 0x2b, 0xb3, 0x1d, 0xb2, 0xb6, 0xb1, 0xab, 0xb0, + 0x86, 0xb0, 0xdb, 0xb1, 0x6a, 0xb3, 0x0b, 0xad, 0x0c, 0xb2, 0x08, 0xb3, 0x4d, 0xb4, 0x16, 0xb2, 0x89, 0xb0, 0x06, 0xaf, 0x43, 0xb4, 0x0a, 0xb3, 0xa2, 0xb2, 0xe8, 0xaf, 0x06, 0xaf, 0x5c, 0xb1, + 0xd8, 0xad, 0x12, 0xa6, 0xb0, 0x24, 0x00, 0xad, 0xb1, 0xab, 0x48, 0x9f, 0x50, 0xa8, 0x01, 0xae, 0x82, 0xa8, 0x08, 0xa8, 0xa4, 0xa5, 0x80, 0x17, 0x4e, 0xa4, 0xb4, 
0xae, 0xca, 0xb0, 0xf5, 0xaf, + 0x9d, 0xac, 0xaa, 0xa6, 0x0b, 0xb0, 0xd2, 0xa7, 0xd5, 0xa9, 0xb9, 0xa8, 0x38, 0x26, 0x0c, 0xaa, 0x8c, 0xac, 0xba, 0xab, 0x5e, 0xad, 0x12, 0xae, 0x1e, 0xad, 0x2a, 0xab, 0x38, 0xa0, 0xda, 0xac, + 0x5e, 0xa8, 0x7e, 0xa3, 0x87, 0x27, 0x1d, 0xa0, 0x23, 0xb0, 0x68, 0xa9, 0x43, 0xb0, 0xbe, 0x26, 0x4c, 0xad, 0xa8, 0xa3, 0x1a, 0xad, 0x60, 0xaa, 0xb4, 0xb0, 0x0a, 0xaf, 0x10, 0x9b, 0xc2, 0xa8, + 0x48, 0xb2, 0x58, 0xad, 0x25, 0xa9, 0x00, 0x91, 0xbe, 0xa8, 0x69, 0xb0, 0xc7, 0xab, 0xea, 0xad, 0x91, 0xa8, 0xbe, 0xac, 0xf8, 0xb0, 0xa7, 0xa6, 0xc7, 0xad, 0xd8, 0x24, 0xae, 0xad, 0x5a, 0xaf, + 0x10, 0xa6, 0x00, 0x29, 0xc1, 0xa6, 0x36, 0xab, 0xf2, 0xad, 0x0e, 0xae, 0x6c, 0xab, 0xa9, 0xae, 0xab, 0xa8, 0x02, 0xad, 0x36, 0x2d, 0x76, 0xad, 0x74, 0x28, 0x82, 0xaf, 0x5e, 0xa1, 0x48, 0xad, + 0x60, 0xa7, 0x31, 0xac, 0xdc, 0xad, 0xdb, 0xae, 0xb9, 0xae, 0x78, 0xa9, 0x42, 0xac, 0xc8, 0xa7, 0x1d, 0xa4, 0x5c, 0xa7, 0x00, 0x15, 0xb0, 0x9b, 0x97, 0xb0, 0xa6, 0xa9, 0xac, 0xa7, 0x01, 0xb1, + 0xf8, 0xb0, 0x7a, 0xac, 0x0c, 0x9b, 0x89, 0xaa, 0x8a, 0xaa, 0x6c, 0xa9, 0xc3, 0xb0, 0x81, 0xa9, 0x96, 0xae, 0xaf, 0xab, 0x91, 0xaa, 0x94, 0xa5, 0xd2, 0xae, 0xe6, 0xa4, 0x15, 0xac, 0x5c, 0xac, + 0xf5, 0xa8, 0xaa, 0xb0, 0x40, 0xac, 0xe1, 0xac, 0xbe, 0xaf, 0xbe, 0xae, 0xb5, 0xb1, 0x6b, 0xaa, 0x84, 0xa6, 0xb5, 0xa5, 0x89, 0xaf, 0xe5, 0xac, 0x0d, 0xac, 0x6e, 0xab, 0x3e, 0xb0, 0x60, 0xaa, + 0x50, 0xab, 0x0e, 0xab, 0xc9, 0xac, 0x3d, 0xb0, 0x27, 0xac, 0x6e, 0xac, 0x70, 0xa9, 0x6c, 0xa9, 0x0c, 0xaf, 0x2c, 0xa9, 0xba, 0xad, 0x79, 0xac, 0xd9, 0xa9, 0x40, 0xac, 0xf0, 0xac, 0xd4, 0xac, + 0xcf, 0xac, 0x19, 0xab, 0xe7, 0xac, 0x89, 0xad, 0x81, 0xae, 0x39, 0xaa, 0x82, 0xab, 0x9a, 0xa8, 0xb3, 0xac, 0x3d, 0xad, 0xe0, 0xac, 0x9e, 0xac, 0x66, 0xad, 0x77, 0xac, 0x5a, 0xa9, 0xcd, 0xae, + 0x61, 0xad, 0x3c, 0xa5, 0x30, 0xae, 0x37, 0xae, 0x8f, 0xac, 0x72, 0xaf, 0xe2, 0xaa, 0x2f, 0xa7, 0xc0, 0xab, 0x5c, 0xaf, 0xb5, 0xad, 0x5f, 0xac, 0xfa, 0xad, 0x8b, 0xab, 0x61, 0xab, 0x67, 0xad, + 0x4e, 0xaf, 0x5b, 0xaa, 
0x66, 0xad, 0x84, 0xab, 0x72, 0xad, 0x90, 0xac, 0x41, 0xac, 0xc2, 0xae, 0x99, 0xa5, 0xf4, 0xad, 0x23, 0xac, 0x78, 0xad, 0xaa, 0xa8, 0x4e, 0xa8, 0x23, 0xad, 0x23, 0xaf, + 0x8a, 0xae, 0x33, 0xad, 0x36, 0xa4, 0xe2, 0xab, 0x10, 0xac, 0xef, 0xac, 0x21, 0xad, 0x60, 0xae, 0x73, 0xad, 0xfb, 0xaa, 0x75, 0xaa, 0x9e, 0xac, 0x09, 0xaf, 0xf6, 0xad, 0x77, 0xad, 0xa8, 0xac, + 0xa0, 0xad, 0xc5, 0xad, 0x78, 0xa9, 0xf8, 0xae, 0xef, 0xac, 0x7a, 0xad, 0xad, 0xad, 0x8b, 0xaa, 0xf2, 0xac, 0xc6, 0xad, 0x23, 0xad, 0x49, 0xaa, 0x3f, 0xae, 0x96, 0xaa, 0xa0, 0xac, 0xf0, 0xac, + 0x4c, 0xaa, 0x01, 0xad, 0xa4, 0xa9, 0x99, 0xac, 0x15, 0xb0, 0x8c, 0xac, 0x71, 0xac, 0x11, 0xae, 0x07, 0xa8, 0x2a, 0xac, 0xb3, 0xab, 0x7d, 0xab, 0x71, 0xad, 0x6f, 0xb0, 0x6a, 0xad, 0xd8, 0xab, + 0x5c, 0xab, 0x54, 0xaa, 0x22, 0xae, 0xe4, 0xa6, 0x2c, 0xae, 0xd8, 0xad, 0x87, 0xad, 0x8d, 0xae, 0x48, 0xad, 0x3b, 0xae, 0x8d, 0xae, 0x0c, 0xaf, 0x48, 0xb0, 0xd1, 0xad, 0x80, 0xa0, 0x4f, 0xac, + 0x84, 0xa8, 0x2c, 0xa8, 0xfc, 0x9b, 0xb3, 0xac, 0x93, 0xa4, 0x50, 0xa0, 0xf0, 0x1c, 0x70, 0x95, 0x73, 0xaa, 0x36, 0xa4, 0x18, 0xa5, 0xd9, 0xa1, 0x5a, 0xa5, 0x96, 0xac, 0x90, 0xa9, 0x6f, 0xac, + 0xe9, 0xa0, 0x45, 0xa4, 0x86, 0xa9, 0xf7, 0xac, 0x79, 0xab, 0x52, 0xa8, 0x75, 0xa1, 0x30, 0x25, 0x0e, 0xa7, 0x14, 0xaa, 0xc1, 0xab, 0xac, 0xa1, 0x3e, 0xac, 0xfc, 0x9b, 0x14, 0x28, 0x45, 0xa7, + 0x4c, 0xa9, 0x72, 0x1d, 0x2f, 0xa6, 0xdb, 0xaa, 0x5c, 0xac, 0x3d, 0xa8, 0x89, 0xa5, 0x36, 0x21, 0x0e, 0xa2, 0x0d, 0xa5, 0xac, 0xa6, 0x68, 0xa6, 0xbf, 0xac, 0x22, 0xa8, 0xf0, 0x13, 0x60, 0xa4, + 0xd0, 0xac, 0x61, 0xa8, 0xe8, 0xa5, 0x29, 0x26, 0xb4, 0xa4, 0x0c, 0xad, 0x6c, 0xa5, 0xd7, 0xa8, 0xd6, 0xa3, 0x33, 0xa6, 0x4d, 0xaa, 0x66, 0xa8, 0x43, 0xa7, 0x1a, 0x1f, 0xe8, 0xa9, 0x97, 0xac, + 0xea, 0xa5, 0x4a, 0xa3, 0x96, 0xa5, 0xa8, 0xa1, 0x0d, 0xaa, 0x60, 0xaa, 0x98, 0xa7, 0x94, 0xa5, 0xaf, 0xa8, 0xe2, 0xa4, 0x34, 0x9b, 0x79, 0xa9, 0xb1, 0xa6, 0x1e, 0xad, 0x97, 0xa8, 0x72, 0xab, + 0x73, 0xa7, 0x14, 0xa0, 0x60, 0xaa, 0x50, 0xab, 0x72, 0xa8, 0x30, 0xab, 0x58, 0x9b, 
0x50, 0xa5, 0x92, 0xa6, 0xa6, 0xa6, 0xb4, 0xa4, 0x25, 0x9c, 0x74, 0xa9, 0x78, 0x19, 0x4b, 0xa5, 0x9e, 0xaa, + 0x02, 0xa6, 0x6a, 0xa9, 0xd8, 0xa5, 0x42, 0xaa, 0xa2, 0xa8, 0xc6, 0x9e, 0x7f, 0xa7, 0x5f, 0xaa, 0xcf, 0xa2, 0xa8, 0xa9, 0x5d, 0xa3, 0x82, 0x20, 0xbc, 0xa6, 0xba, 0xaa, 0xfe, 0xa6, 0xbe, 0xa5, + 0x56, 0x9e, 0xe2, 0xa7, 0xc0, 0xa2, 0x90, 0xa6, 0xfc, 0xab, 0x5f, 0xa8, 0x43, 0xa9, 0x25, 0xa8, 0x3e, 0xa6, 0xdd, 0xaa, 0x31, 0xa7, 0xe9, 0xa8, 0xea, 0xab, 0x6f, 0xab, 0x2e, 0xa5, 0x54, 0xa0, + 0x53, 0xc0, 0xab, 0xb8, 0x51, 0xba, 0x1d, 0xc0, 0x26, 0xc0, 0xa8, 0xbc, 0xe0, 0xbe, 0xf3, 0xc1, 0x44, 0xbd, 0xec, 0xbb, 0x2b, 0xbe, 0xfb, 0xba, 0xcb, 0xb8, 0x3c, 0xbe, 0x5c, 0xc3, 0x87, 0xc0, + 0x97, 0xc1, 0x7e, 0xbc, 0x3d, 0xc2, 0xd1, 0x28, 0xaf, 0xbc, 0xfd, 0xb9, 0xe5, 0xb0, 0xc8, 0xc0, 0x67, 0xc0, 0xfd, 0xbd, 0x43, 0xbe, 0x7d, 0xc2, 0xcd, 0xbd, 0xf7, 0xc0, 0xa0, 0xbf, 0xc4, 0xc1, + 0x5d, 0xbc, 0x08, 0xbb, 0xf9, 0xb4, 0x5b, 0xb5, 0xb0, 0xc0, 0x3b, 0xc0, 0x09, 0xc3, 0xb7, 0xb1, 0x72, 0xc1, 0x5a, 0xbf, 0x79, 0xc1, 0xb3, 0xbe, 0xf8, 0xc1, 0x77, 0xc1, 0x99, 0xbc, 0x99, 0xbf, + 0x83, 0xc4, 0x7d, 0xbf, 0x2a, 0xbf, 0x2d, 0xbf, 0x7b, 0xbf, 0x74, 0xc0, 0x0a, 0xc0, 0xf8, 0xc1, 0x53, 0xba, 0x75, 0xc1, 0xd5, 0xc2, 0x87, 0xbc, 0xe4, 0xbf, 0xc2, 0xb5, 0x62, 0xc0, 0x3a, 0xc1, + 0xdc, 0xbe, 0x32, 0xae, 0x65, 0xb4, 0x42, 0xc0, 0xd3, 0xbf, 0x54, 0xc0, 0xc8, 0xbf, 0x47, 0xc3, 0x3d, 0xbd, 0x93, 0xc0, 0x9c, 0x3c, 0x22, 0xc0, 0xc2, 0xb5, 0x2f, 0xc0, 0xb5, 0xb9, 0x2e, 0xbe, + 0x9b, 0xbd, 0xea, 0xc1, 0xe4, 0xbd, 0xa3, 0xc1, 0xbc, 0xc1, 0x0b, 0xbb, 0x2c, 0xc2, 0x1c, 0xbc, 0x07, 0xbc, 0x23, 0xbe, 0xe9, 0xba, 0xd2, 0xba, 0xd8, 0xc3, 0x22, 0xc0, 0x9a, 0xbd, 0x2e, 0xc3, + 0xd1, 0xc3, 0x43, 0xbf, 0x21, 0xb2, 0x35, 0xbc, 0xe0, 0xc0, 0x32, 0xc0, 0xe3, 0xc3, 0xfc, 0xbc, 0x46, 0xc1, 0x25, 0xbd, 0x88, 0xbf, 0x5d, 0xbf, 0x9a, 0xc2, 0x82, 0xbd, 0x78, 0xc0, 0x22, 0xc0, + 0x1d, 0xbf, 0xee, 0xc2, 0xd9, 0xc1, 0x2e, 0xbe, 0x89, 0xc1, 0x4c, 0xc2, 0x9a, 0xc4, 0x27, 0xc0, 0x76, 0xbd, 0x6d, 0xb9, 0xa8, 0xc3, 0x55, 0xc1, 
0x25, 0xc0, 0x25, 0xbd, 0x78, 0xc1, 0x4f, 0xc0, + 0x94, 0x3c, 0x42, 0x3d, 0xfa, 0x3b, 0x32, 0x40, 0x9d, 0x3d, 0xa8, 0x3e, 0xb2, 0x3b, 0x70, 0x3b, 0x72, 0x3d, 0x2a, 0x3d, 0x19, 0x3d, 0x0e, 0x3a, 0x3c, 0x3c, 0x41, 0x3c, 0x62, 0x3d, 0xd8, 0x3d, + 0xc6, 0x3a, 0x2c, 0x3c, 0x97, 0x3d, 0xef, 0x3d, 0x55, 0x3e, 0xe4, 0x3c, 0xf0, 0x3c, 0x5e, 0x3c, 0x72, 0x3c, 0xa5, 0x3d, 0x3c, 0x3e, 0x8a, 0x3b, 0xf8, 0x3f, 0x7e, 0x3c, 0xdb, 0x39, 0x3a, 0x3e, + 0x2f, 0x3f, 0x36, 0x3c, 0x6d, 0x3e, 0xb9, 0x3d, 0x38, 0x3f, 0x4b, 0x3d, 0x7a, 0x3c, 0x7c, 0x39, 0x69, 0x3d, 0xfa, 0x3d, 0xd9, 0x3c, 0x1a, 0x3d, 0x34, 0x3f, 0x86, 0x3e, 0x55, 0x3d, 0x4d, 0x3b, + 0x69, 0x3f, 0xd6, 0x3c, 0xa2, 0x3d, 0x8c, 0x39, 0xb5, 0x3b, 0x80, 0x3e, 0xbe, 0x3c, 0x19, 0x3d, 0x9c, 0x3b, 0x20, 0x3e, 0xc4, 0x3b, 0x26, 0x3f, 0xa1, 0x3e, 0x64, 0x39, 0xf5, 0x3b, 0xb6, 0x3e, + 0xd3, 0x3c, 0xa0, 0x3c, 0xbc, 0x3a, 0xd1, 0x3c, 0xff, 0x3c, 0x8a, 0x3e, 0xc8, 0x3e, 0xf7, 0x3c, 0x08, 0x3d, 0x08, 0x3f, 0xa6, 0x3c, 0xfa, 0x3c, 0x5a, 0x3b, 0x6d, 0x3d, 0xaa, 0x3e, 0x68, 0x3e, + 0x42, 0x3e, 0x26, 0x3e, 0x13, 0x3c, 0xc4, 0x3e, 0x6b, 0x3c, 0x18, 0x3c, 0xd0, 0x3d, 0x4c, 0x3c, 0x1e, 0x3a, 0x3c, 0x3e, 0x79, 0x3e, 0xd2, 0x3c, 0x39, 0x3a, 0x46, 0x3c, 0x04, 0x40, 0x90, 0x3d, + 0x29, 0x3c, 0xb6, 0x3d, 0x4a, 0x3c, 0x9e, 0x3e, 0x46, 0x3e, 0x02, 0x40, 0x6c, 0x3b, 0x6a, 0x3d, 0x4c, 0x3c, 0x93, 0x40, 0x46, 0x3b, 0x8b, 0x3a, 0x4d, 0x3e, 0xda, 0x3f, 0xb1, 0x3e, 0xe8, 0x3c, + 0x46, 0x3c, 0xbf, 0x3c, 0x4e, 0x3e, 0xf7, 0x3c, 0xc0, 0x3d, 0xc9, 0x39, 0x9e, 0x3b, 0xa0, 0x3d, 0xfa, 0x3c, 0x49, 0x3e, 0x20, 0x3d, 0x05, 0x3c, 0x2a, 0x3f, 0x36, 0x3e, 0x7a, 0x34, 0x64, 0x3b, + 0x89, 0xba, 0x43, 0xba, 0x2c, 0xba, 0x4f, 0xbc, 0xbf, 0xba, 0x61, 0xbb, 0x26, 0xba, 0x14, 0xbb, 0x4e, 0xba, 0x84, 0xba, 0xd2, 0xbb, 0x2c, 0xbb, 0xd0, 0xb9, 0x7b, 0xba, 0x29, 0xbb, 0x8f, 0xba, + 0x42, 0xbb, 0x00, 0xbb, 0xd8, 0xbb, 0x5c, 0xbb, 0xaf, 0xba, 0x34, 0xba, 0xb4, 0xba, 0x7a, 0xbb, 0x23, 0xba, 0xb1, 0xb9, 0x66, 0xba, 0xb8, 0xba, 0x4b, 0xbc, 0xba, 0xbb, 0x2f, 0xbb, 0x95, 0xbc, + 0x8e, 
0xba, 0x0f, 0xba, 0x33, 0xba, 0x89, 0xba, 0xbc, 0xbb, 0x86, 0xbc, 0xb0, 0xbb, 0xd8, 0xb9, 0x6f, 0xbc, 0x00, 0xbc, 0xb6, 0xbb, 0x1e, 0xba, 0xf6, 0xbb, 0x3d, 0xbb, 0x1c, 0xbc, 0xd8, 0xba, + 0x70, 0xbc, 0x10, 0xbb, 0xf3, 0xba, 0xfc, 0xb9, 0xa9, 0xbb, 0x8b, 0xbb, 0x34, 0xba, 0x32, 0xbc, 0xb6, 0xb9, 0xdc, 0xbb, 0x1c, 0xbb, 0xff, 0xbb, 0x8e, 0xbb, 0x5d, 0xb9, 0x62, 0xba, 0x26, 0xbb, + 0xbc, 0xbb, 0x32, 0xba, 0x5f, 0xb9, 0x5d, 0xb8, 0x2d, 0xba, 0x26, 0xbb, 0xbc, 0xbb, 0xdb, 0xba, 0x6c, 0xbb, 0x98, 0xbb, 0xc2, 0xb8, 0xcc, 0xb9, 0x8b, 0xba, 0x1e, 0xbc, 0x77, 0xba, 0xb2, 0xbb, + 0x06, 0xbd, 0x26, 0xbc, 0x3c, 0xb9, 0x48, 0xbc, 0x38, 0xbc, 0xcf, 0xb8, 0x23, 0xbc, 0x51, 0xba, 0x9f, 0xba, 0x30, 0xbc, 0x06, 0xbb, 0xf8, 0xba, 0x1c, 0xba, 0x10, 0xbb, 0x16, 0xbc, 0x74, 0xbc, + 0x5a, 0xbb, 0x85, 0xbb, 0x27, 0xba, 0x32, 0xbb, 0x9a, 0xbb, 0xe4, 0xba, 0x26, 0xbb, 0x5a, 0xbc, 0x0a, 0xba, 0x75, 0xbb, 0x0c, 0xbb, 0x72, 0xba, 0x40, 0xbc, 0x4b, 0xbc, 0x7a, 0xbb, 0xfd, 0xb9, + 0xf0, 0xba, 0x90, 0xbb, 0x60, 0xbc, 0x0e, 0xba, 0x4b, 0xbc, 0x50, 0xb9, 0x74, 0xba, 0x9a, 0xba, 0x0c, 0xbb, 0xc4, 0xbb, 0x69, 0xbb, 0xd4, 0xb9, 0x55, 0xbc, 0x77, 0xba, 0x2a, 0xb8, 0x60, 0xbb, + 0x67, 0xb4, 0x32, 0xb6, 0x80, 0xb4, 0x0a, 0xb5, 0x68, 0xb6, 0xcf, 0xb4, 0xce, 0xad, 0x14, 0xaf, 0x2f, 0xb4, 0x56, 0xb5, 0xfa, 0xb1, 0x95, 0xb1, 0x38, 0xaa, 0x92, 0xb5, 0x18, 0xb9, 0x22, 0xb8, + 0x1e, 0xad, 0x46, 0xb1, 0xa8, 0xb7, 0x78, 0xb2, 0x9e, 0xb3, 0xfe, 0xb4, 0x90, 0xb2, 0x81, 0xb2, 0x7a, 0xb5, 0xb6, 0xb4, 0x1a, 0xb8, 0x76, 0xaf, 0x69, 0xb7, 0x7f, 0xb4, 0x18, 0xac, 0x7e, 0xb7, + 0xe4, 0xb5, 0x85, 0xb2, 0x1b, 0xb2, 0x00, 0xb5, 0x54, 0xb7, 0x60, 0xb3, 0x77, 0xb3, 0xfc, 0x29, 0x72, 0xb6, 0x62, 0xb3, 0xbb, 0xb5, 0xa2, 0xb1, 0xb7, 0xb5, 0x22, 0xb6, 0x7c, 0xb2, 0x99, 0xb1, + 0xf6, 0xb8, 0xd0, 0xb4, 0x57, 0xb5, 0x6a, 0xb0, 0x6a, 0xac, 0x4d, 0xb7, 0x0d, 0xb0, 0x48, 0xb5, 0x78, 0xac, 0x3e, 0xb8, 0xc5, 0xb3, 0xca, 0xb6, 0x7c, 0xb5, 0xd4, 0x2a, 0x9c, 0xb7, 0x69, 0xb7, + 0xa0, 0xa6, 0xf6, 0xb3, 0x8a, 0xaf, 0x2e, 0xb1, 0x64, 0xb4, 0x34, 
0xb7, 0xeb, 0xb0, 0x18, 0xad, 0xc2, 0xb2, 0xec, 0xb8, 0x64, 0xb2, 0x18, 0xb5, 0x4e, 0xb0, 0xb9, 0xb4, 0x5e, 0xb6, 0xb9, 0xb5, + 0x56, 0xb2, 0xcd, 0xb6, 0xfe, 0xb4, 0xe7, 0xb6, 0x22, 0xb3, 0xd3, 0xb3, 0x22, 0xb3, 0xa3, 0xb3, 0x60, 0xa6, 0xed, 0xb7, 0x88, 0xb5, 0x62, 0xb7, 0x29, 0xb4, 0xd0, 0xb1, 0xdb, 0xb6, 0x11, 0xb6, + 0xf2, 0xb5, 0x8f, 0xb7, 0xec, 0xb2, 0x32, 0xb5, 0x82, 0xb1, 0xde, 0xb8, 0xe4, 0xb8, 0x0e, 0xb5, 0x23, 0xb4, 0x61, 0xb8, 0xf3, 0xb0, 0x2c, 0x2d, 0x56, 0xb8, 0xf3, 0xb3, 0x1e, 0xb4, 0x13, 0xb3, + 0x78, 0xb4, 0xd8, 0xb4, 0x97, 0xb7, 0x64, 0xb8, 0xcf, 0xb6, 0x1a, 0xb1, 0x68, 0xb5, 0x54, 0xb5, 0x0e, 0xb2, 0x8f, 0xb6, 0xbe, 0xac, 0x39, 0xb3, 0x92, 0xb5, 0x62, 0xb4, 0x2f, 0xb0, 0x5e, 0xb4, + 0x48, 0x2e, 0xf3, 0x2d, 0x2d, 0x2e, 0xe0, 0x2e, 0x62, 0x2e, 0x44, 0x2e, 0x9d, 0x2d, 0xdc, 0x2e, 0x80, 0x2d, 0x22, 0x2e, 0x42, 0x2f, 0x8d, 0x2f, 0xe0, 0x2c, 0x9f, 0x2e, 0xc2, 0x2f, 0x68, 0x2e, + 0x28, 0x2f, 0xb4, 0x2e, 0xf6, 0x2f, 0x52, 0x2e, 0x68, 0x2d, 0xd5, 0x2d, 0x12, 0x2e, 0x4c, 0x2f, 0x16, 0x2e, 0xc4, 0x2c, 0xff, 0x2d, 0x6c, 0x2e, 0x88, 0x2f, 0xc8, 0x2f, 0x43, 0x2f, 0x96, 0x30, + 0x36, 0x2d, 0xae, 0x2d, 0x9f, 0x2c, 0xca, 0x2d, 0xe6, 0x2e, 0x64, 0x30, 0x96, 0x2f, 0x68, 0x2d, 0x86, 0x30, 0x26, 0x2f, 0xd0, 0x2f, 0x2e, 0x2d, 0xdc, 0x2e, 0x70, 0x2e, 0xab, 0x2f, 0xd2, 0x2e, + 0x57, 0x30, 0xde, 0x2e, 0x68, 0x2e, 0x24, 0x2e, 0x5c, 0x2f, 0x0b, 0x2f, 0x51, 0x2d, 0x34, 0x30, 0x0c, 0x2d, 0xdd, 0x2f, 0x3c, 0x2f, 0x24, 0x2f, 0xa2, 0x2e, 0xd4, 0x2c, 0x0f, 0x2f, 0x7b, 0x2e, + 0xca, 0x2e, 0xc4, 0x2d, 0x08, 0x2d, 0x60, 0x2a, 0xa0, 0x2d, 0x88, 0x2e, 0x29, 0x2e, 0xd4, 0x2d, 0xe6, 0x2e, 0x56, 0x2f, 0xbc, 0x2b, 0x52, 0x2d, 0x52, 0x2e, 0xf0, 0x2f, 0x7b, 0x2d, 0xf6, 0x2e, + 0xad, 0x30, 0x05, 0x30, 0x1a, 0x2d, 0x06, 0x30, 0x3e, 0x30, 0x5f, 0x2c, 0x8c, 0x2f, 0x0c, 0x2e, 0x64, 0x2e, 0x28, 0x30, 0x14, 0x2e, 0x44, 0x2f, 0x80, 0x2e, 0xc7, 0x2e, 0xe4, 0x2e, 0x78, 0x30, + 0xcc, 0x2f, 0x7e, 0x2f, 0xc9, 0x2d, 0x25, 0x2e, 0x55, 0x2e, 0xf0, 0x2d, 0x47, 0x30, 0x49, 0x30, 0xc8, 0x2d, 0xd1, 0x2d, 0x02, 
0x2f, 0xba, 0x2d, 0x4a, 0x30, 0xea, 0x2e, 0x3a, 0x2e, 0x48, 0x2d, + 0xf2, 0x2e, 0x82, 0x2f, 0x54, 0x30, 0x60, 0x2e, 0x4c, 0x30, 0x58, 0x2d, 0xcb, 0x2e, 0xfe, 0x2d, 0x68, 0x2e, 0x47, 0x2f, 0x65, 0x2e, 0x92, 0x2d, 0xb1, 0x2f, 0x52, 0x2d, 0xd1, 0x2c, 0xc1, 0x2f, + 0xa0, 0x29, 0xd6, 0x2a, 0x14, 0x2a, 0xfd, 0x27, 0x19, 0x2b, 0x94, 0x28, 0x1c, 0x25, 0xa3, 0x27, 0x42, 0x28, 0x1c, 0x2a, 0x67, 0x28, 0xdc, 0x29, 0x44, 0x20, 0x54, 0x2b, 0x1b, 0x2e, 0x78, 0x2c, + 0x76, 0x27, 0x51, 0x28, 0xbf, 0x2c, 0xe4, 0x26, 0x54, 0x26, 0xbc, 0x29, 0x09, 0x28, 0x3a, 0x29, 0xbc, 0x2a, 0x30, 0x28, 0x29, 0x2c, 0x32, 0x27, 0x42, 0x2b, 0xca, 0x2a, 0xb4, 0x27, 0xd6, 0x2c, + 0xa8, 0x28, 0x45, 0x28, 0x10, 0x23, 0x20, 0x29, 0x49, 0x2b, 0x06, 0x2a, 0xdb, 0x29, 0xd8, 0x1e, 0x88, 0x2c, 0x74, 0x28, 0xdf, 0x2b, 0xa8, 0x25, 0x8c, 0x29, 0x32, 0x2a, 0xc8, 0x28, 0xf0, 0x28, + 0x8c, 0x2d, 0x45, 0x2a, 0xf7, 0x29, 0x7a, 0x28, 0xb2, 0x26, 0xdc, 0x2b, 0xab, 0x24, 0x9e, 0x2b, 0x68, 0x23, 0xf8, 0x2c, 0x33, 0x2a, 0xec, 0x2a, 0x8f, 0x29, 0x40, 0x1a, 0xff, 0x2c, 0x74, 0x2b, + 0x28, 0x22, 0xce, 0x28, 0xf1, 0x25, 0xd9, 0x21, 0xe2, 0x28, 0x62, 0x2b, 0xa0, 0x23, 0xdc, 0x22, 0xa1, 0x28, 0x32, 0x2d, 0x5b, 0x25, 0x72, 0x29, 0xcd, 0x27, 0x79, 0x2a, 0xb5, 0x29, 0x32, 0x2a, + 0x2a, 0x29, 0x1f, 0x2c, 0xd5, 0x29, 0xea, 0x2b, 0x52, 0x2a, 0x2d, 0x28, 0xb5, 0x28, 0x0d, 0x29, 0x04, 0x25, 0xc8, 0x2c, 0x5a, 0x29, 0xa5, 0x2c, 0x8f, 0x2a, 0x94, 0x28, 0x33, 0x2a, 0x3c, 0x2c, + 0x4b, 0x2c, 0x80, 0x2c, 0x7f, 0x28, 0xee, 0x28, 0x68, 0x25, 0x52, 0x2c, 0xc0, 0x2e, 0x42, 0x2b, 0x36, 0x29, 0xb6, 0x2a, 0xba, 0x28, 0x00, 0x13, 0x3f, 0x2d, 0x89, 0x26, 0x92, 0x27, 0x9e, 0x27, + 0x5d, 0x2a, 0xcc, 0x2a, 0xb2, 0x2c, 0x0b, 0x2d, 0x74, 0x2c, 0x3b, 0x28, 0x96, 0x2b, 0xae, 0x29, 0xee, 0x27, 0x62, 0x2b, 0x8a, 0x23, 0xb4, 0x28, 0xfa, 0x29, 0x9c, 0x27, 0xa6, 0x28, 0x32, 0x2b, + 0xeb, 0x29, 0xf1, 0x29, 0x60, 0x29, 0x92, 0x2c, 0x66, 0x2a, 0x7e, 0x2b, 0x99, 0x29, 0x0d, 0x2a, 0x44, 0x2a, 0x21, 0x2a, 0x1d, 0x2b, 0xa3, 0x29, 0xab, 0x29, 0x99, 0x29, 0x26, 0x2a, 0x3a, 
0x2a, + 0x08, 0x2a, 0x29, 0x2a, 0xeb, 0x2a, 0x42, 0x2b, 0x02, 0x2b, 0xd5, 0x29, 0x54, 0x2a, 0x7e, 0x2a, 0x81, 0x29, 0xf4, 0x29, 0x58, 0x2a, 0xda, 0x29, 0x54, 0x2c, 0x96, 0x2a, 0xc5, 0x29, 0x09, 0x2c, + 0x2c, 0x2b, 0x8e, 0x29, 0xd7, 0x2a, 0x79, 0x2a, 0xc2, 0x2b, 0xe3, 0x2b, 0xa0, 0x2a, 0x0b, 0x29, 0x8f, 0x2b, 0x9e, 0x2b, 0xa3, 0x2a, 0x1b, 0x2a, 0x06, 0x2c, 0x35, 0x2b, 0x73, 0x2b, 0xc6, 0x29, + 0x24, 0x2c, 0x57, 0x2a, 0xa3, 0x2a, 0xd9, 0x28, 0x8b, 0x2a, 0x43, 0x2b, 0x0c, 0x2a, 0x3a, 0x2b, 0x58, 0x29, 0x23, 0x2b, 0xf2, 0x29, 0xee, 0x2b, 0x85, 0x2b, 0xbe, 0x28, 0x30, 0x29, 0x20, 0x2b, + 0x2b, 0x2b, 0xca, 0x29, 0xd4, 0x28, 0xee, 0x28, 0xee, 0x29, 0x0e, 0x2b, 0x01, 0x2c, 0xa2, 0x2a, 0xcc, 0x2a, 0x53, 0x2b, 0x04, 0x29, 0x9f, 0x29, 0xab, 0x29, 0x5d, 0x2b, 0xcc, 0x2a, 0x76, 0x2b, + 0x86, 0x2c, 0x93, 0x2b, 0xd0, 0x28, 0x06, 0x2c, 0x10, 0x2b, 0xad, 0x28, 0xb5, 0x2b, 0xb2, 0x29, 0x90, 0x29, 0x8e, 0x2b, 0x1c, 0x2b, 0x03, 0x2a, 0xe8, 0x28, 0x3a, 0x2a, 0x40, 0x2c, 0xb4, 0x2b, + 0x08, 0x2a, 0xcd, 0x2a, 0xa1, 0x29, 0x53, 0x2b, 0xa2, 0x2b, 0x6f, 0x2b, 0x4a, 0x29, 0x9b, 0x2b, 0x81, 0x29, 0x3d, 0x2c, 0xea, 0x29, 0xbe, 0x29, 0x99, 0x2b, 0x78, 0x2c, 0xa5, 0x2b, 0xdb, 0x29, + 0x00, 0x2a, 0x95, 0x2a, 0xda, 0x2b, 0x67, 0x29, 0x88, 0x2b, 0x7a, 0x28, 0x5c, 0x29, 0x6f, 0x2a, 0x94, 0x2a, 0x5a, 0x2b, 0x10, 0x2b, 0x47, 0x29, 0x3b, 0x2c, 0xc0, 0x2a, 0xc6, 0x25, 0xf5, 0x29, + 0xe9, 0x24, 0xd2, 0x26, 0x7c, 0x24, 0x43, 0x28, 0x21, 0x27, 0x08, 0x27, 0x09, 0x21, 0x66, 0x20, 0xcb, 0x25, 0x1a, 0x26, 0x1a, 0x24, 0x12, 0x20, 0x50, 0x21, 0x49, 0x25, 0x7e, 0x28, 0x4a, 0x28, + 0xea, 0x1d, 0x78, 0x22, 0x6b, 0x27, 0x53, 0x25, 0x5c, 0x26, 0xba, 0x25, 0x8d, 0x24, 0x3c, 0x23, 0x92, 0x25, 0x90, 0x26, 0x82, 0x28, 0xe9, 0x20, 0xc0, 0x28, 0x65, 0x24, 0xec, 0x1a, 0x77, 0x27, + 0x4a, 0x28, 0x06, 0x24, 0x33, 0x26, 0x7e, 0x26, 0x80, 0x28, 0x3d, 0x24, 0xe0, 0x23, 0x58, 0x10, 0x17, 0x26, 0x64, 0x25, 0x86, 0x25, 0xaf, 0x24, 0xec, 0x27, 0xbe, 0x27, 0x58, 0x24, 0xad, 0x21, + 0x27, 0x29, 0x37, 0x25, 0x6f, 0x26, 0x8d, 0x1f, 
0xd6, 0x1e, 0x26, 0x28, 0x96, 0x23, 0x40, 0x25, 0xe4, 0x20, 0x3e, 0x28, 0x40, 0x23, 0x40, 0x28, 0x56, 0x27, 0xe8, 0x12, 0x41, 0x26, 0x5f, 0x28, + 0x9a, 0x20, 0xd8, 0x24, 0x26, 0x21, 0xf1, 0x24, 0x7f, 0x25, 0x38, 0x28, 0x9c, 0x25, 0xa2, 0x22, 0x6c, 0x24, 0x30, 0x29, 0xf8, 0x24, 0x0c, 0x26, 0x32, 0x21, 0x54, 0x25, 0x2c, 0x28, 0x30, 0x27, + 0x8e, 0x24, 0x52, 0x27, 0x40, 0x25, 0xee, 0x27, 0xf2, 0x22, 0xca, 0x24, 0x08, 0x25, 0x59, 0x24, 0xce, 0x1a, 0x08, 0x28, 0x69, 0x27, 0xc8, 0x26, 0xbf, 0x22, 0xf4, 0x22, 0xb5, 0x28, 0x02, 0x26, + 0x10, 0x25, 0x9e, 0x27, 0x30, 0x24, 0x4a, 0x27, 0x4e, 0x25, 0xd1, 0x29, 0xf7, 0x26, 0x54, 0x25, 0xa6, 0x24, 0x04, 0x2a, 0x18, 0x21, 0x50, 0x10, 0x43, 0x28, 0x52, 0x27, 0x9e, 0x26, 0xf6, 0x24, + 0x77, 0x24, 0xf2, 0x24, 0xc7, 0x27, 0x12, 0x28, 0xc8, 0x26, 0xfc, 0x20, 0xb9, 0x24, 0x8e, 0x26, 0x4e, 0x24, 0x89, 0x27, 0x6e, 0x22, 0x1f, 0x24, 0x84, 0x27, 0xaa, 0x26, 0x42, 0x1a, 0x37, 0x23, + 0x40, 0x3d, 0x46, 0x3d, 0x7c, 0x3d, 0x24, 0x3c, 0x95, 0x3d, 0x5a, 0x3c, 0xc5, 0x3b, 0x3d, 0x3d, 0x01, 0x3c, 0x2b, 0x3d, 0x61, 0x3d, 0x9e, 0x3e, 0x7d, 0x39, 0x2c, 0x3e, 0x34, 0x40, 0x30, 0x3e, + 0x80, 0x3d, 0x30, 0x3d, 0x8a, 0x3f, 0x2c, 0x3c, 0xaa, 0x3a, 0xe5, 0x3c, 0x74, 0x3c, 0xe1, 0x3d, 0x8a, 0x3d, 0xd3, 0x3a, 0x95, 0x3d, 0xd7, 0x3c, 0xdc, 0x3d, 0xbc, 0x3e, 0xbb, 0x3d, 0x26, 0x40, + 0x04, 0x3b, 0x71, 0x3c, 0x88, 0x38, 0x71, 0x3c, 0x9c, 0x3d, 0xf8, 0x3e, 0x46, 0x3e, 0xd4, 0x3a, 0x19, 0x40, 0x19, 0x3d, 0x0d, 0x3f, 0xba, 0x3a, 0xf9, 0x3c, 0x13, 0x3d, 0xbe, 0x3d, 0x9f, 0x3d, + 0x14, 0x40, 0xd3, 0x3d, 0x32, 0x3d, 0x33, 0x3d, 0x64, 0x3d, 0x18, 0x3e, 0xbf, 0x3a, 0x52, 0x3f, 0x80, 0x3a, 0x7e, 0x3f, 0x4c, 0x3e, 0xac, 0x3d, 0xf6, 0x3c, 0xb8, 0x39, 0x7c, 0x3f, 0x80, 0x3d, + 0x1c, 0x3c, 0x97, 0x3c, 0x7a, 0x3b, 0x34, 0x36, 0x6c, 0x3c, 0x8e, 0x3d, 0x9e, 0x3a, 0xed, 0x3a, 0x39, 0x3d, 0x0e, 0x3f, 0x00, 0x39, 0x6a, 0x3c, 0xe6, 0x3c, 0x80, 0x3e, 0x32, 0x3c, 0x79, 0x3d, + 0xd4, 0x3e, 0x04, 0x3f, 0x9f, 0x3c, 0xc0, 0x3e, 0x16, 0x3f, 0x0a, 0x3b, 0x82, 0x3d, 0xf5, 0x3c, 0x9d, 0x3c, 
0xa6, 0x3f, 0x8a, 0x3c, 0x2c, 0x3f, 0x1a, 0x3e, 0x4f, 0x3d, 0x05, 0x3d, 0xe0, 0x3f, + 0x76, 0x3f, 0x02, 0x3f, 0x94, 0x3c, 0x67, 0x3c, 0xab, 0x3b, 0x36, 0x3d, 0xeb, 0x40, 0x3a, 0x3f, 0xd4, 0x3c, 0x2a, 0x3c, 0xaf, 0x3d, 0x7f, 0x3a, 0x13, 0x40, 0x0d, 0x3c, 0x0a, 0x3c, 0xa7, 0x3b, + 0x0e, 0x3e, 0x7c, 0x3e, 0xd0, 0x3f, 0xca, 0x3e, 0xbe, 0x3f, 0x86, 0x3c, 0x7e, 0x3e, 0xce, 0x3c, 0xa8, 0x3c, 0x24, 0x3e, 0xc2, 0x3b, 0x91, 0x3c, 0xb8, 0x3d, 0x0e, 0x3b, 0xbe, 0x3c, 0x10, 0x3f, + 0x64, 0x33, 0xf1, 0x36, 0x8c, 0x36, 0x4a, 0x38, 0x60, 0xa7, 0x9b, 0x35, 0x1b, 0x37, 0xd5, 0x39, 0x3c, 0x3c, 0x52, 0x36, 0x50, 0xb3, 0xbf, 0x38, 0x04, 0x2f, 0x22, 0x3a, 0x3e, 0x34, 0x1b, 0x35, + 0xe0, 0x37, 0x58, 0x2f, 0xbc, 0x3a, 0xc6, 0x3b, 0xec, 0x3a, 0x1e, 0x39, 0x8f, 0x35, 0x00, 0x27, 0xc1, 0x3a, 0xb9, 0x34, 0xa4, 0x37, 0xa2, 0x34, 0x3b, 0x3c, 0xd4, 0x30, 0xd2, 0xb4, 0x9b, 0x38, + 0x21, 0x3a, 0xe2, 0x34, 0xa6, 0x39, 0x40, 0x3a, 0x60, 0x33, 0xc7, 0x37, 0x1b, 0x38, 0x60, 0x32, 0xfc, 0xaf, 0x4e, 0x39, 0xe4, 0x36, 0xc6, 0x3b, 0x64, 0x39, 0x26, 0x30, 0x10, 0x31, 0x8a, 0x38, + 0x1b, 0x3a, 0x76, 0x33, 0xa4, 0x3a, 0x2e, 0x30, 0xa5, 0x2c, 0xb0, 0x32, 0x04, 0x3c, 0x3a, 0x38, 0x84, 0x30, 0x30, 0x3a, 0xce, 0x37, 0xc8, 0x38, 0xae, 0x3a, 0xb8, 0x2c, 0x3e, 0x38, 0xe4, 0x39, + 0x57, 0x30, 0x0d, 0x38, 0x7b, 0x37, 0x8c, 0x34, 0xc0, 0x1e, 0x26, 0x37, 0x5a, 0x39, 0x20, 0x38, 0xf8, 0x37, 0x1e, 0x35, 0xc7, 0x36, 0x84, 0x3a, 0xb3, 0x34, 0xf7, 0x37, 0x70, 0x2e, 0x64, 0x32, + 0x8e, 0x39, 0x85, 0x3a, 0x95, 0x39, 0xfc, 0x32, 0x78, 0x39, 0x0a, 0x3c, 0x36, 0x38, 0x80, 0x9e, 0x01, 0x37, 0x1c, 0x35, 0xe4, 0x38, 0x38, 0xac, 0x78, 0x2e, 0xd6, 0x34, 0xb8, 0xae, 0x38, 0x2f, + 0x5c, 0x35, 0xca, 0x31, 0x80, 0x39, 0xc0, 0x39, 0xec, 0x2d, 0x9c, 0x39, 0x98, 0xb1, 0x57, 0x3b, 0xe4, 0xb1, 0x94, 0x30, 0xf6, 0x35, 0x32, 0x37, 0x80, 0x2d, 0x16, 0x3c, 0xb4, 0x3a, 0x3c, 0x2e, + 0x0c, 0x3c, 0x39, 0x36, 0x60, 0x33, 0x56, 0x39, 0x45, 0x39, 0x9a, 0x37, 0x8e, 0x31, 0x1d, 0x3b, 0xfc, 0x31, 0x4c, 0x3a, 0x51, 0x38, 0xf8, 0x34, 0x84, 0x2f, 0x48, 0x35, 
0x0f, 0x32, 0xc2, 0x38, + 0xc0, 0xb4, 0x8c, 0xaf, 0xfa, 0xb5, 0x15, 0xb8, 0xf1, 0xaf, 0xcd, 0xb2, 0x1d, 0xb6, 0x92, 0xb5, 0x65, 0xb3, 0x84, 0xb2, 0x64, 0x2d, 0x57, 0xb6, 0xd0, 0xaa, 0xb7, 0xb4, 0x88, 0xb5, 0x9c, 0xb5, + 0x22, 0xb9, 0xf3, 0xb1, 0xc1, 0xb5, 0x60, 0xb1, 0x06, 0xb7, 0x4a, 0xb5, 0xfa, 0xae, 0x64, 0xb4, 0x63, 0xb8, 0xce, 0xb2, 0x03, 0xb1, 0xb8, 0xb5, 0x76, 0xb4, 0x6e, 0xb6, 0xf1, 0xb1, 0x01, 0xb8, + 0x2a, 0xb4, 0xa5, 0xb3, 0x1b, 0xb5, 0x46, 0xaa, 0x95, 0xaf, 0x4c, 0xb6, 0xd6, 0xb5, 0x54, 0xb0, 0xfc, 0xb1, 0x80, 0xb6, 0xb0, 0xb7, 0xd5, 0xb4, 0xab, 0xb8, 0x9c, 0xb4, 0x11, 0xb2, 0xc0, 0xb4, + 0x74, 0xb9, 0xf0, 0xac, 0xce, 0xb3, 0x90, 0xb5, 0xb8, 0xb2, 0x56, 0xb1, 0xb4, 0xb4, 0x80, 0xb4, 0x7c, 0x2e, 0x0e, 0xb9, 0x27, 0xb4, 0xa6, 0xb5, 0xb6, 0xb2, 0x7e, 0xb1, 0x26, 0xb6, 0x49, 0xb5, + 0x74, 0xb4, 0x1a, 0xb4, 0xbe, 0xae, 0x4e, 0xb2, 0x20, 0xb4, 0x2e, 0xb1, 0xed, 0xb5, 0xe0, 0xb6, 0x4b, 0xb2, 0xc4, 0xb1, 0x81, 0xb1, 0x7a, 0xb6, 0x38, 0xb1, 0x78, 0xb1, 0x1f, 0xb4, 0xea, 0xac, + 0x2c, 0xb5, 0xfe, 0xb7, 0xbc, 0xb5, 0x2c, 0xb6, 0x04, 0xb6, 0x82, 0xb5, 0x6a, 0xb6, 0x1d, 0x2c, 0x28, 0xb7, 0x01, 0xaf, 0x85, 0xb6, 0x28, 0xb2, 0x94, 0xb3, 0xea, 0xb3, 0x0a, 0xaf, 0x3c, 0xb5, + 0xee, 0xb5, 0xa0, 0xb2, 0x5e, 0xb3, 0x99, 0xab, 0x1d, 0xb4, 0x81, 0xb6, 0x3c, 0xab, 0x2d, 0xb6, 0xea, 0xb2, 0x44, 0xb0, 0x37, 0xb5, 0x02, 0xb4, 0x07, 0xb7, 0x7e, 0xb5, 0x62, 0xb7, 0xfe, 0xb0, + 0x91, 0xb8, 0x8e, 0xb4, 0xd6, 0xb5, 0xdb, 0xb6, 0x8e, 0xb8, 0x24, 0xb5, 0xa9, 0xb5, 0x22, 0xb8, 0x33, 0xb1, 0x27, 0xb5, 0xd4, 0xb7, 0x52, 0xb8, 0x8c, 0xb4, 0xdf, 0xb5, 0xbe, 0x25, 0xc9, 0xb3, + 0x4c, 0xb0, 0xe8, 0x1c, 0x58, 0x2e, 0x80, 0xa1, 0x25, 0xb0, 0xf3, 0x29, 0xd8, 0xad, 0x0e, 0xb2, 0x70, 0xa9, 0xec, 0xad, 0x30, 0x32, 0xd3, 0x30, 0x9c, 0x1c, 0xda, 0xb5, 0xa4, 0xb1, 0xa6, 0xb1, + 0x84, 0xa9, 0xa0, 0xa6, 0x0e, 0xae, 0x80, 0xa9, 0x2b, 0xb1, 0xe8, 0xad, 0x03, 0x2d, 0x58, 0x26, 0x5a, 0xb4, 0x56, 0xb4, 0xa2, 0xaf, 0xc0, 0xac, 0x4b, 0xb5, 0xe1, 0xad, 0x82, 0x30, 0x3c, 0x30, + 0x10, 0xb4, 0xbc, 0x20, 0x21, 
0xb0, 0x48, 0xb1, 0x1c, 0xb5, 0x8b, 0xad, 0x67, 0xae, 0x84, 0x2f, 0xf2, 0x28, 0x7a, 0xa8, 0x40, 0xb0, 0x6a, 0xb2, 0x1a, 0xae, 0x34, 0xb3, 0xb6, 0x27, 0x0a, 0xaa, + 0x70, 0xb5, 0x80, 0xac, 0x75, 0xb4, 0x58, 0x25, 0xd7, 0x2a, 0xeb, 0xb0, 0x7e, 0xb4, 0xd4, 0xa4, 0x87, 0x2c, 0xf6, 0xb2, 0x98, 0xb4, 0xee, 0xaf, 0x29, 0xb4, 0x00, 0x14, 0x35, 0xb4, 0x28, 0xb3, + 0x10, 0x28, 0x56, 0xab, 0x42, 0xb4, 0x2f, 0x26, 0xe6, 0xaa, 0xd0, 0xa9, 0x64, 0xb1, 0xeb, 0xb4, 0x0a, 0xb2, 0x1c, 0xb2, 0x59, 0xa5, 0x7f, 0xb0, 0x17, 0x31, 0xa0, 0xb3, 0x62, 0x30, 0xe8, 0xb1, + 0x54, 0xb0, 0x57, 0xae, 0x02, 0xb4, 0xb9, 0xb0, 0x2b, 0xb3, 0x27, 0xb0, 0x1e, 0xb0, 0x2f, 0xa4, 0xe0, 0x23, 0x92, 0x2e, 0x50, 0xac, 0xfc, 0x2e, 0x37, 0xad, 0xdc, 0xa4, 0xc2, 0x2c, 0xb7, 0xb3, + 0xec, 0xb4, 0xe3, 0xab, 0xd8, 0xb0, 0x7a, 0xb1, 0x8c, 0x31, 0x09, 0xb1, 0x4c, 0xb1, 0xe2, 0xb2, 0x90, 0x28, 0x7e, 0xb4, 0x50, 0x22, 0x8e, 0x28, 0x12, 0x33, 0x35, 0xb0, 0x56, 0xb6, 0xdb, 0xa9, + 0xf2, 0xb0, 0x23, 0xb2, 0x48, 0x24, 0x92, 0xb2, 0xc8, 0xb1, 0xc4, 0xb6, 0x4c, 0xae, 0x0d, 0xb2, 0x70, 0x99, 0x56, 0xb6, 0x88, 0xaf, 0xb4, 0x2b, 0x36, 0x29, 0x8b, 0xb1, 0xf5, 0xb1, 0xb3, 0xac, + 0x94, 0x29, 0x28, 0x1a, 0xcb, 0x28, 0x94, 0x2b, 0xab, 0x26, 0x94, 0x24, 0x07, 0x2a, 0xca, 0x28, 0x10, 0x16, 0xa2, 0x25, 0x86, 0xa3, 0x1c, 0x28, 0x34, 0x1c, 0xbb, 0x28, 0xa2, 0x2a, 0x80, 0x2a, + 0x40, 0x2d, 0x7e, 0x26, 0xd8, 0x27, 0x28, 0x9d, 0xc4, 0x29, 0x36, 0x28, 0xf0, 0x12, 0xfe, 0x28, 0x56, 0x2c, 0x8f, 0x28, 0xf0, 0x22, 0x1a, 0x2a, 0xea, 0x25, 0xa9, 0x2b, 0xe6, 0x27, 0x4e, 0x2a, + 0xb6, 0x26, 0x72, 0x26, 0xe2, 0x27, 0x58, 0xa1, 0xab, 0x26, 0x0a, 0x2a, 0x74, 0x29, 0xf6, 0x1f, 0xc0, 0x27, 0x32, 0x29, 0x1e, 0x2c, 0xe0, 0x25, 0x78, 0x2c, 0x6c, 0x2a, 0x8c, 0x25, 0x26, 0x27, + 0x00, 0x2e, 0x80, 0x1f, 0xd8, 0x25, 0xfb, 0x29, 0xc2, 0x26, 0x97, 0x26, 0x52, 0x26, 0x87, 0x26, 0x00, 0xa6, 0x1c, 0x2d, 0xbd, 0x28, 0x06, 0x29, 0x42, 0x24, 0xf4, 0x25, 0xcc, 0x2a, 0xa2, 0x28, + 0x90, 0x28, 0x42, 0x26, 0xec, 0x22, 0xd6, 0x24, 0x45, 0x29, 0xfa, 0x21, 0x56, 0x29, 0x02, 
0x2c, 0xb8, 0x25, 0xa4, 0x26, 0x82, 0x22, 0x32, 0x29, 0xde, 0x1e, 0x5d, 0x25, 0x48, 0x27, 0x37, 0x23, + 0x1c, 0x28, 0xc0, 0x2a, 0x82, 0x29, 0x6c, 0x2b, 0xbc, 0x29, 0x44, 0x26, 0x46, 0x2a, 0x69, 0xa0, 0xa0, 0x2a, 0x30, 0x11, 0xa6, 0x29, 0x8e, 0x26, 0xa3, 0x28, 0x28, 0x27, 0x2b, 0x24, 0x58, 0x2b, + 0xa8, 0x2b, 0x32, 0x27, 0xea, 0x24, 0xa5, 0x9e, 0xdc, 0x26, 0xd4, 0x29, 0xc8, 0x25, 0xe0, 0x28, 0xb4, 0x28, 0x7e, 0x27, 0x96, 0x28, 0x54, 0x25, 0xa7, 0x2a, 0x26, 0x26, 0xd3, 0x2b, 0xc0, 0x25, + 0xa2, 0x2b, 0x0a, 0x29, 0xe9, 0x29, 0xb2, 0x2a, 0xa4, 0x2c, 0xd9, 0x2a, 0xae, 0x2a, 0x58, 0x2b, 0xad, 0x24, 0x5b, 0x29, 0xd8, 0x2b, 0x56, 0x2c, 0xb2, 0x28, 0xc0, 0x2a, 0xe0, 0x8d, 0x36, 0x25, + 0xfd, 0x26, 0xc0, 0xa0, 0x03, 0xa1, 0xbc, 0x20, 0xcf, 0x26, 0x1c, 0xa0, 0x9d, 0x24, 0x80, 0x25, 0x72, 0xa4, 0x93, 0x21, 0x8e, 0xa6, 0xc7, 0xa5, 0xd5, 0x96, 0xa3, 0x29, 0x77, 0x28, 0x46, 0x28, + 0xaa, 0x25, 0xca, 0x20, 0x38, 0x1c, 0x7d, 0xa4, 0x70, 0x24, 0xa4, 0x20, 0x94, 0xa4, 0xa7, 0x20, 0x22, 0x29, 0x4e, 0x29, 0x06, 0x21, 0xf5, 0x24, 0x3e, 0x27, 0x62, 0x27, 0x40, 0x97, 0x2d, 0xa3, + 0x8a, 0x26, 0x40, 0x0e, 0x08, 0x22, 0xc0, 0x14, 0xe8, 0x29, 0x38, 0x24, 0x21, 0x24, 0x7a, 0xa4, 0xe2, 0x1f, 0x14, 0x1b, 0x92, 0x27, 0x8e, 0x22, 0x62, 0x25, 0x9d, 0x29, 0x20, 0x8d, 0x20, 0x1a, + 0xa2, 0x2b, 0xce, 0x1e, 0xbc, 0x26, 0x10, 0x21, 0x80, 0x04, 0x42, 0x26, 0xfc, 0x25, 0xe8, 0x91, 0x37, 0xa5, 0xec, 0x28, 0x18, 0x29, 0x2d, 0x24, 0x8c, 0x25, 0x23, 0x1e, 0x40, 0x29, 0x89, 0x26, + 0xc0, 0x1c, 0xa6, 0x1c, 0x94, 0x27, 0xa2, 0x9a, 0x52, 0x25, 0x70, 0x01, 0x81, 0x25, 0x74, 0x2a, 0x84, 0x25, 0xea, 0x26, 0xe5, 0x98, 0x40, 0x23, 0x86, 0xa6, 0x16, 0x27, 0xa4, 0xa1, 0x5e, 0x26, + 0xd0, 0x22, 0x7f, 0x22, 0x21, 0x28, 0x5f, 0x28, 0x95, 0x27, 0x2d, 0x1a, 0x95, 0x25, 0x80, 0x90, 0x9c, 0x1d, 0x44, 0xa5, 0x2c, 0x21, 0xa8, 0x9c, 0x4c, 0x25, 0x42, 0x1d, 0x68, 0x98, 0x3e, 0x2a, + 0xda, 0x2a, 0xf2, 0x22, 0xcb, 0x21, 0xcc, 0x1c, 0x28, 0xa4, 0x44, 0x25, 0x4a, 0x28, 0x5e, 0x25, 0xef, 0x21, 0xa3, 0x29, 0x24, 0x18, 0xfc, 0x9f, 0x2c, 
0xa3, 0xd4, 0x19, 0xfb, 0x2a, 0xd9, 0x21, + 0x9e, 0x24, 0xb1, 0x27, 0x1c, 0x20, 0xae, 0x27, 0x4e, 0x28, 0x13, 0x2c, 0xd2, 0x26, 0x4c, 0x26, 0xb0, 0x18, 0x4e, 0x2a, 0x2d, 0x26, 0xd8, 0x1f, 0x46, 0x1c, 0x45, 0x28, 0x40, 0x25, 0xe4, 0x1a, + 0xd0, 0x22, 0x00, 0x22, 0xa6, 0x25, 0x27, 0x27, 0x3b, 0x19, 0x5e, 0x23, 0x45, 0x25, 0xd9, 0x25, 0x99, 0x26, 0xb0, 0x22, 0xa9, 0x9c, 0xd1, 0x26, 0xf2, 0x1b, 0xee, 0x24, 0xc5, 0x23, 0x29, 0x24, + 0x10, 0x28, 0x62, 0x20, 0xbe, 0x26, 0x9f, 0x25, 0x63, 0x27, 0x9c, 0x25, 0x8e, 0x21, 0xa9, 0x21, 0xff, 0x27, 0xde, 0x20, 0x52, 0x22, 0x6e, 0x24, 0x21, 0x26, 0x35, 0x24, 0x92, 0x1b, 0xaf, 0x27, + 0xf4, 0x24, 0x3f, 0x23, 0xaa, 0x25, 0x8e, 0x22, 0x64, 0x1c, 0x92, 0x25, 0x5f, 0x25, 0x0c, 0x21, 0x8c, 0x1d, 0x9e, 0x26, 0x06, 0x26, 0x67, 0x26, 0x16, 0x28, 0x32, 0x21, 0x2f, 0x21, 0x1e, 0x25, + 0x5a, 0x28, 0x0b, 0x1e, 0xfe, 0x24, 0xeb, 0x23, 0xfe, 0x20, 0x00, 0x20, 0x36, 0x26, 0xe3, 0x24, 0x9c, 0x90, 0x55, 0x28, 0x4e, 0x23, 0x8b, 0x25, 0xbf, 0x24, 0xdc, 0x1f, 0x1c, 0x25, 0x99, 0x25, + 0xb4, 0x22, 0x70, 0x24, 0x4e, 0x20, 0x5c, 0x22, 0xaa, 0x20, 0xa1, 0x22, 0xdf, 0x25, 0x57, 0x25, 0xd1, 0x22, 0xf8, 0x20, 0xc7, 0x22, 0xe9, 0x26, 0xa7, 0x22, 0x02, 0x22, 0xae, 0x22, 0xfd, 0x1b, + 0xa2, 0x25, 0xf5, 0x27, 0x9e, 0x25, 0x2a, 0x24, 0xcf, 0x25, 0x33, 0x27, 0xae, 0x25, 0x93, 0x99, 0x36, 0x26, 0x7c, 0x21, 0x4e, 0x26, 0xa9, 0x1f, 0xe3, 0x20, 0x34, 0x23, 0xc4, 0x1a, 0xb3, 0x21, + 0x01, 0x24, 0x20, 0x21, 0xa8, 0x24, 0x2c, 0x22, 0xd6, 0x22, 0x65, 0x26, 0xdd, 0x99, 0xee, 0x26, 0x9c, 0x1d, 0xee, 0x1b, 0xc1, 0x24, 0x70, 0x24, 0x4a, 0x25, 0x3b, 0x27, 0xbe, 0x26, 0x12, 0x1f, + 0xa7, 0x28, 0x90, 0x23, 0x8a, 0x24, 0x4c, 0x26, 0xb3, 0x27, 0x84, 0x23, 0xa4, 0x23, 0x13, 0x28, 0xb2, 0x20, 0x2e, 0x25, 0x9f, 0x26, 0xb4, 0x26, 0xb8, 0x22, 0x60, 0x24, 0xc0, 0x80, 0xa3, 0x24, + 0x4c, 0x1d, 0xb4, 0x1d, 0x5f, 0x96, 0x0d, 0x1a, 0xca, 0x19, 0xe0, 0x15, 0x32, 0x1e, 0x9f, 0x23, 0x2f, 0x24, 0xaf, 0x1f, 0x20, 0xa1, 0xde, 0x91, 0x11, 0x14, 0xd6, 0x25, 0xde, 0x1e, 0xbd, 0x1f, + 0x54, 0x17, 
0x94, 0x10, 0xb2, 0x22, 0x05, 0x24, 0xae, 0x23, 0x1d, 0x21, 0x1a, 0x16, 0x00, 0x9c, 0x75, 0x24, 0x5c, 0x22, 0x5e, 0x21, 0x6e, 0x1a, 0xcb, 0x26, 0xa0, 0x13, 0x75, 0xa2, 0xaf, 0x96, + 0xde, 0x24, 0x2f, 0x17, 0x75, 0x22, 0xc3, 0x24, 0x58, 0x23, 0x95, 0x1e, 0xec, 0x1f, 0xb0, 0x99, 0x02, 0x9e, 0x74, 0x1f, 0x4c, 0x1e, 0x1b, 0x25, 0xb0, 0x1f, 0x2b, 0x1f, 0x12, 0x93, 0xb1, 0x1f, + 0x56, 0x24, 0xb1, 0x1d, 0x70, 0x25, 0x0c, 0x9a, 0x00, 0x9b, 0x56, 0x1f, 0x0a, 0x26, 0x24, 0x1e, 0xec, 0x13, 0xbc, 0x22, 0x01, 0x24, 0x37, 0x21, 0x63, 0x25, 0xca, 0x93, 0x1e, 0x23, 0x3b, 0x24, + 0xef, 0x98, 0x8e, 0x1f, 0x36, 0x24, 0x98, 0x15, 0x7a, 0x94, 0x4b, 0x1f, 0x92, 0x22, 0x8e, 0x23, 0x9d, 0x22, 0x3e, 0x21, 0xd3, 0x1d, 0x0e, 0x23, 0x5c, 0x9a, 0xba, 0x23, 0xd7, 0x9e, 0xc1, 0x20, + 0x74, 0x22, 0xaa, 0x21, 0x3c, 0x24, 0x74, 0x1c, 0xba, 0x23, 0x73, 0x24, 0x66, 0x20, 0x5e, 0x15, 0xa0, 0x14, 0xc0, 0x06, 0xcc, 0x1f, 0xc7, 0x9f, 0x4a, 0x17, 0x70, 0x19, 0x0e, 0x9e, 0x00, 0x1f, + 0x64, 0x22, 0x9b, 0x19, 0x4d, 0x23, 0x84, 0x24, 0x2e, 0xa0, 0x67, 0x22, 0x4f, 0x19, 0xbe, 0x24, 0x45, 0x9f, 0xc2, 0x21, 0xdb, 0x16, 0x35, 0x1a, 0x50, 0xa2, 0x85, 0x24, 0xe6, 0x25, 0xb0, 0x15, + 0xf2, 0x23, 0x2e, 0x21, 0x88, 0x95, 0xe4, 0x22, 0x72, 0x21, 0x0f, 0x25, 0x6a, 0x18, 0xea, 0x23, 0xdb, 0x14, 0x20, 0x26, 0x75, 0x1f, 0xdc, 0x9b, 0x67, 0x9a, 0x96, 0x1f, 0x56, 0x21, 0xee, 0x20, + 0xeb, 0x39, 0xba, 0xb1, 0xb0, 0x32, 0x17, 0x39, 0xcc, 0x38, 0x00, 0x26, 0xf9, 0x38, 0xb0, 0x37, 0x18, 0xb7, 0x57, 0x34, 0x57, 0xb6, 0x40, 0xa9, 0x00, 0x00, 0xc2, 0x39, 0x46, 0x3b, 0xf0, 0x3a, + 0x32, 0x3c, 0xca, 0x35, 0xc8, 0x31, 0xf3, 0xb7, 0xc6, 0x37, 0xff, 0x34, 0x75, 0xb4, 0x46, 0x38, 0x02, 0x3c, 0x59, 0x3a, 0x98, 0x30, 0x75, 0x39, 0x06, 0x35, 0xaf, 0x3b, 0x71, 0x36, 0x0c, 0x34, + 0x42, 0x36, 0xc4, 0x32, 0xd7, 0x34, 0xcc, 0xb4, 0x0a, 0x3a, 0xc2, 0x38, 0x43, 0x38, 0x18, 0xb1, 0x56, 0x37, 0x47, 0x35, 0x98, 0x3b, 0x1a, 0x31, 0xd6, 0x3a, 0x1a, 0x3c, 0x6c, 0x32, 0x9a, 0x32, + 0x24, 0x3e, 0x54, 0x2e, 0x86, 0x35, 0xd6, 0x38, 0x7a, 0x34, 0x19, 0x38, 
0xa3, 0x34, 0xb2, 0x30, 0x3a, 0xb8, 0x8d, 0x3c, 0xf0, 0x39, 0x9c, 0x37, 0x1c, 0x33, 0xf8, 0x34, 0x69, 0x3b, 0x16, 0x38, + 0xb5, 0x36, 0x62, 0x32, 0xe5, 0x35, 0xb8, 0x2e, 0x90, 0x39, 0x18, 0x25, 0x5a, 0x38, 0x92, 0x3c, 0xe8, 0x35, 0x20, 0x38, 0x40, 0x21, 0x9a, 0x36, 0x8f, 0xb4, 0xc4, 0x36, 0x78, 0x31, 0x37, 0x36, + 0x67, 0x35, 0x42, 0x38, 0x62, 0x39, 0xdc, 0x3b, 0x55, 0x39, 0x48, 0x2b, 0x53, 0x39, 0x80, 0xae, 0x46, 0x38, 0xde, 0xb4, 0x51, 0x37, 0x0e, 0x34, 0xec, 0x38, 0xc0, 0x34, 0xee, 0x31, 0xaa, 0x3c, + 0xc4, 0x3c, 0xc8, 0x36, 0x7c, 0x31, 0x65, 0xb2, 0x24, 0x2e, 0x8c, 0x38, 0x66, 0x39, 0xf6, 0x36, 0xb8, 0x38, 0x50, 0x3a, 0x54, 0x35, 0x5a, 0x28, 0x7e, 0x36, 0x00, 0x2a, 0x5e, 0x3c, 0xb6, 0x35, + 0xf8, 0x38, 0x78, 0x39, 0x6b, 0x38, 0x13, 0x3a, 0x1e, 0x3c, 0xcd, 0x3c, 0xc0, 0x3a, 0x9a, 0x39, 0xe8, 0x31, 0xa1, 0x3a, 0x9c, 0x3a, 0x2a, 0x3a, 0xe3, 0x36, 0x16, 0x3b, 0xbd, 0x31, 0x90, 0x2f, + 0x38, 0xb0, 0x0e, 0xb1, 0x87, 0xaf, 0x8c, 0xb2, 0x42, 0xb1, 0xff, 0xb2, 0xe3, 0xb0, 0x5f, 0xb1, 0xba, 0xb0, 0xbb, 0xb1, 0x13, 0xaf, 0xba, 0xad, 0xa2, 0xaf, 0xdd, 0xae, 0xf8, 0xb0, 0xa6, 0xb0, + 0x22, 0xaf, 0x85, 0xaf, 0x03, 0xb2, 0xba, 0xb0, 0x74, 0xb1, 0x1e, 0xb1, 0xdb, 0xb0, 0x8c, 0xb1, 0xd8, 0xb0, 0x8b, 0xb0, 0x62, 0xb1, 0xb5, 0xaf, 0x04, 0xb4, 0x86, 0xb0, 0x9a, 0xae, 0xee, 0xb1, + 0x1e, 0xb3, 0x69, 0xb1, 0x06, 0xb2, 0x98, 0xb0, 0x50, 0xb2, 0x1b, 0xb0, 0x52, 0xb1, 0x74, 0xae, 0x48, 0xb1, 0x7e, 0xb1, 0x79, 0xb0, 0xe1, 0xb1, 0x7e, 0xb2, 0x92, 0xb2, 0x98, 0xb1, 0xb7, 0xae, + 0xc6, 0xb2, 0xa9, 0xb0, 0xfe, 0xb1, 0x60, 0xae, 0x82, 0xad, 0x21, 0xb1, 0xbb, 0xb1, 0x51, 0xb0, 0x1d, 0xb0, 0x80, 0xb2, 0x3e, 0xaf, 0xf6, 0xb2, 0x21, 0xb4, 0x9e, 0xad, 0x5a, 0xae, 0x88, 0xb1, + 0xe9, 0xae, 0x19, 0xb0, 0xe3, 0xaf, 0x10, 0xb1, 0xbc, 0xaf, 0x18, 0xb2, 0x17, 0xb3, 0xd6, 0xb0, 0x43, 0xb0, 0xca, 0xb3, 0xa8, 0xb0, 0x10, 0xb1, 0xd6, 0xaa, 0x69, 0xaf, 0x71, 0xb1, 0x48, 0xb1, + 0x2e, 0xb2, 0x1d, 0xb3, 0x2e, 0xb0, 0x3c, 0xb1, 0x7a, 0xb0, 0xb6, 0xae, 0x6e, 0xb2, 0x66, 0xaf, 0x56, 0xac, 0x9c, 0xb1, 0x9c, 0xb2, 
0xa1, 0xb0, 0x50, 0xaa, 0xfe, 0xb0, 0x9d, 0xb3, 0xb9, 0xb0, + 0xd4, 0xb0, 0xbc, 0xb0, 0xb2, 0xb0, 0x7a, 0xb2, 0x64, 0xb0, 0xb7, 0xb4, 0xb6, 0xad, 0xd5, 0xb0, 0x7e, 0xb0, 0x77, 0xb4, 0x58, 0xaf, 0xe2, 0xaf, 0xe8, 0xb1, 0xd8, 0xb2, 0x48, 0xb3, 0x96, 0xb0, + 0xb1, 0xb1, 0x51, 0xb1, 0x18, 0xb2, 0x2c, 0xb2, 0xe2, 0xb0, 0x1e, 0xac, 0x0a, 0xae, 0x7d, 0xb1, 0x11, 0xb0, 0x5c, 0xb1, 0xe6, 0xb0, 0xf1, 0xac, 0xa4, 0xb0, 0xc0, 0xb0, 0xb8, 0xaa, 0x24, 0xb0, + 0x91, 0x2e, 0x44, 0x2e, 0xf8, 0x2d, 0xcc, 0x2f, 0xe0, 0x2d, 0xff, 0x2e, 0x94, 0x2e, 0x34, 0x2f, 0xda, 0x2c, 0x00, 0x2f, 0xea, 0x2d, 0x74, 0x2f, 0xb2, 0x2d, 0xce, 0x2e, 0x63, 0x2e, 0x82, 0x2e, + 0x78, 0x2f, 0x00, 0x2f, 0x76, 0x2f, 0x72, 0x2f, 0x61, 0x2e, 0x0a, 0x2f, 0x92, 0x2e, 0xa2, 0x2f, 0x48, 0x2e, 0xde, 0x2c, 0xd6, 0x2d, 0xdc, 0x2d, 0x59, 0x30, 0x7c, 0x2f, 0xdc, 0x2e, 0x3c, 0x30, + 0x0f, 0x2e, 0x03, 0x2f, 0x82, 0x2d, 0x58, 0x2d, 0x47, 0x2f, 0x01, 0x30, 0xa7, 0x2f, 0x62, 0x2e, 0x3b, 0x30, 0xc6, 0x2e, 0x3e, 0x2f, 0xc9, 0x2d, 0x15, 0x30, 0x18, 0x2f, 0x1b, 0x30, 0xf7, 0x2d, + 0x30, 0x30, 0xd6, 0x2e, 0x0a, 0x2e, 0x85, 0x2d, 0xad, 0x2e, 0x5b, 0x2f, 0xc2, 0x2d, 0xf0, 0x2e, 0xb6, 0x2d, 0xc4, 0x2f, 0x9a, 0x2e, 0xf6, 0x2f, 0x16, 0x30, 0xb5, 0x2d, 0x08, 0x2e, 0xe7, 0x2d, + 0x8b, 0x2e, 0xc8, 0x2d, 0x64, 0x2e, 0x85, 0x2a, 0x03, 0x2e, 0x63, 0x2e, 0x88, 0x2f, 0x59, 0x2d, 0xbd, 0x2e, 0x56, 0x2f, 0x18, 0x2d, 0xa9, 0x2d, 0x01, 0x2d, 0x8e, 0x2f, 0xfc, 0x2d, 0x39, 0x2f, + 0x0c, 0x31, 0xb5, 0x2f, 0x12, 0x2e, 0xa2, 0x2f, 0x1c, 0x30, 0x6c, 0x2c, 0x52, 0x2f, 0x98, 0x2d, 0xd2, 0x2e, 0x3e, 0x2f, 0x0d, 0x2f, 0x25, 0x2f, 0x08, 0x2c, 0xdc, 0x2e, 0x91, 0x2f, 0x46, 0x30, + 0x4c, 0x2f, 0xfe, 0x2e, 0xe2, 0x2e, 0xac, 0x2e, 0x62, 0x2d, 0x9e, 0x2e, 0x74, 0x2d, 0x5e, 0x30, 0x22, 0x2e, 0x4e, 0x2f, 0x0e, 0x2f, 0xe9, 0x2d, 0xff, 0x2f, 0x0c, 0x2f, 0x3d, 0x2f, 0x3c, 0x2d, + 0xc0, 0x2f, 0x7f, 0x2f, 0xb4, 0x2f, 0x7f, 0x2f, 0x5c, 0x30, 0x75, 0x2b, 0xea, 0x2c, 0xea, 0x2d, 0x1b, 0x2e, 0x90, 0x2f, 0x1f, 0x2e, 0xa9, 0x2c, 0xf2, 0x2e, 0x30, 0x2e, 0xc6, 0x2b, 0xcd, 0x2e, 
+ 0x22, 0x27, 0x14, 0x2a, 0x8e, 0x29, 0x94, 0x25, 0x48, 0x2b, 0x96, 0x29, 0xa8, 0x24, 0x12, 0x24, 0xf5, 0x26, 0x1a, 0x2a, 0x8a, 0x23, 0x0f, 0x25, 0x40, 0x13, 0xc0, 0x28, 0xcd, 0x2c, 0x9d, 0x2a, + 0x8c, 0x1e, 0x4b, 0x25, 0x71, 0x2a, 0x40, 0x20, 0x67, 0x26, 0xed, 0x28, 0xcc, 0x27, 0x1d, 0x28, 0x8d, 0x2a, 0x08, 0x29, 0x3f, 0x2b, 0xf4, 0x20, 0x8a, 0x2b, 0x74, 0x29, 0x9c, 0x23, 0x8b, 0x2a, + 0x10, 0x2b, 0x00, 0x28, 0xc8, 0x28, 0x76, 0x29, 0x56, 0x2a, 0xb0, 0x27, 0xc4, 0x24, 0x40, 0x9c, 0x14, 0x2a, 0x8a, 0x28, 0x06, 0x2a, 0xa5, 0x26, 0x2c, 0x25, 0x0b, 0x2a, 0x2e, 0x28, 0x08, 0x26, + 0x3e, 0x2c, 0x86, 0x27, 0x3f, 0x2b, 0x08, 0x28, 0xc0, 0x18, 0xa4, 0x28, 0xb6, 0x25, 0x2c, 0x28, 0xe0, 0x95, 0x0a, 0x2d, 0x9c, 0x24, 0xc4, 0x2b, 0xce, 0x29, 0xe0, 0x93, 0x02, 0x2c, 0x42, 0x2a, + 0xa0, 0x97, 0xe5, 0x29, 0x72, 0x25, 0x1a, 0x24, 0x1d, 0x25, 0xd9, 0x29, 0xea, 0x24, 0x8a, 0x22, 0x99, 0x27, 0xae, 0x2d, 0xa3, 0x29, 0x58, 0x28, 0xf0, 0x23, 0x26, 0x25, 0xca, 0x29, 0xc8, 0x28, + 0x5c, 0x27, 0x0d, 0x2c, 0x47, 0x28, 0xe4, 0x29, 0x31, 0x26, 0x40, 0x26, 0x93, 0x28, 0x6f, 0x27, 0x60, 0x99, 0x04, 0x2c, 0xfe, 0x2a, 0x2b, 0x2c, 0x2c, 0x24, 0xd2, 0x26, 0xfa, 0x2a, 0x9c, 0x28, + 0xe1, 0x29, 0x23, 0x2b, 0x76, 0x28, 0xfa, 0x28, 0xf0, 0x1e, 0x24, 0x2e, 0xcd, 0x2c, 0x66, 0x29, 0x60, 0x25, 0xfa, 0x2c, 0xaa, 0x23, 0x23, 0xa1, 0xc6, 0x2a, 0x3b, 0x28, 0xf1, 0x29, 0xe2, 0x25, + 0xee, 0x29, 0x50, 0x27, 0x0e, 0x2c, 0xd7, 0x2c, 0x6e, 0x29, 0x96, 0x26, 0x1d, 0x26, 0x3d, 0x2a, 0xc8, 0x25, 0x48, 0x2c, 0x90, 0x95, 0xce, 0x23, 0xb4, 0x27, 0x58, 0x27, 0x34, 0x1d, 0x14, 0x29, + 0x58, 0xa2, 0x0a, 0xa2, 0x44, 0xa2, 0x6c, 0xa2, 0xb3, 0xa1, 0xc7, 0xa1, 0xc1, 0xa1, 0x30, 0xa2, 0x00, 0xa0, 0x90, 0xa2, 0x97, 0xa1, 0xec, 0xa3, 0xd2, 0xa0, 0x48, 0xa3, 0xf0, 0xa2, 0xa8, 0xa2, + 0x39, 0xa3, 0xec, 0xa2, 0x09, 0xa3, 0xa0, 0xa2, 0x64, 0xa1, 0xb4, 0xa2, 0x14, 0xa2, 0x07, 0xa3, 0x43, 0xa2, 0x6d, 0xa0, 0x94, 0xa1, 0x3a, 0xa1, 0xa8, 0xa3, 0xa9, 0xa3, 0xda, 0xa2, 0x29, 0xa4, + 0xe2, 0xa0, 0x54, 0xa2, 0x5c, 0xa0, 0x0e, 0xa1, 0xa2, 
0xa2, 0x18, 0xa4, 0xd2, 0xa2, 0xd4, 0xa1, 0x46, 0xa4, 0x1c, 0xa2, 0x82, 0xa3, 0x7a, 0xa0, 0xda, 0xa2, 0x38, 0xa2, 0xb4, 0xa3, 0xf8, 0xa1, + 0x12, 0xa4, 0x79, 0xa2, 0x80, 0xa1, 0xbc, 0xa1, 0x9a, 0xa2, 0x05, 0xa3, 0x6d, 0xa0, 0xdc, 0xa2, 0xa0, 0xa0, 0xde, 0xa3, 0x74, 0xa2, 0x63, 0xa3, 0x90, 0xa2, 0x54, 0xa1, 0x27, 0xa3, 0x63, 0xa1, + 0xf6, 0xa1, 0xee, 0xa1, 0x1c, 0xa2, 0x7e, 0x9a, 0xaa, 0xa1, 0x96, 0xa1, 0xbf, 0xa1, 0x2b, 0xa0, 0x94, 0xa2, 0xec, 0xa2, 0xc2, 0xa0, 0xf6, 0xa0, 0x7e, 0xa1, 0x9c, 0xa3, 0x70, 0xa1, 0xcf, 0xa2, + 0xc6, 0xa4, 0x14, 0xa3, 0xe8, 0xa1, 0x8c, 0xa3, 0x0c, 0xa4, 0x23, 0xa0, 0x46, 0xa2, 0x7c, 0xa1, 0xf5, 0xa2, 0x4f, 0xa3, 0x56, 0xa2, 0xc6, 0xa3, 0x69, 0xa0, 0x3f, 0xa2, 0x6a, 0xa2, 0x55, 0xa4, + 0x5d, 0xa3, 0x4c, 0xa3, 0xa6, 0xa2, 0x8e, 0xa1, 0x4b, 0xa0, 0x5e, 0xa1, 0xf1, 0xa2, 0x7f, 0xa4, 0x82, 0xa1, 0xff, 0xa1, 0xe4, 0xa2, 0xbd, 0xa0, 0xcd, 0xa3, 0xa8, 0xa1, 0xff, 0xa1, 0x6d, 0xa0, + 0x74, 0xa3, 0xe5, 0xa2, 0x9e, 0xa3, 0xa4, 0xa3, 0x78, 0xa4, 0x09, 0xa0, 0xe6, 0xa0, 0x6a, 0xa1, 0xbc, 0xa1, 0xee, 0xa3, 0xb3, 0xa0, 0xaa, 0xa0, 0xa1, 0xa2, 0xa0, 0xa1, 0xcc, 0x9f, 0xf8, 0xa2, + 0x08, 0x9d, 0xeb, 0x9e, 0x6c, 0x9f, 0x36, 0x99, 0xd3, 0x9f, 0xd6, 0x9c, 0xb0, 0x99, 0xf6, 0x98, 0x08, 0x9a, 0xce, 0x9e, 0x70, 0x9a, 0xf2, 0x9d, 0xf0, 0x90, 0x7c, 0x9f, 0xab, 0xa1, 0x18, 0xa0, + 0x50, 0x9a, 0xc4, 0x9c, 0x48, 0x9f, 0x62, 0x98, 0x42, 0x9a, 0x16, 0x9e, 0xc2, 0x9c, 0x22, 0x9d, 0xb7, 0x9f, 0x16, 0x9d, 0x9f, 0x9f, 0x22, 0x98, 0x71, 0x9f, 0xb5, 0x9f, 0x41, 0x9c, 0x28, 0xa0, + 0xbc, 0x9d, 0xa3, 0x9c, 0x74, 0x9b, 0xee, 0x9d, 0xb2, 0x9e, 0xa2, 0x9e, 0x9e, 0x9a, 0x14, 0x94, 0x37, 0xa0, 0x02, 0x9d, 0x20, 0xa0, 0xb4, 0x98, 0x6c, 0x99, 0xec, 0x9d, 0x9e, 0x9d, 0xce, 0x9c, + 0xd5, 0xa0, 0x04, 0x9d, 0x23, 0x9f, 0xdb, 0x9d, 0x84, 0x99, 0xfc, 0x9d, 0xb0, 0x97, 0xf3, 0x9d, 0x80, 0x02, 0xaf, 0xa1, 0x2e, 0x9c, 0x20, 0xa0, 0xb2, 0x9c, 0xae, 0x95, 0x88, 0xa1, 0x55, 0x9e, + 0x0c, 0x95, 0x55, 0x9f, 0x2d, 0x9c, 0x47, 0x0d, 0x81, 0x9b, 0xa2, 0x9d, 0xa6, 0x95, 0x56, 0x94, 0x69, 0x9d, 0x98, 
0xa1, 0xe0, 0x9d, 0x57, 0x9c, 0x90, 0x9c, 0x2f, 0x9d, 0xef, 0x9d, 0xe3, 0x9d, + 0xe6, 0x9d, 0x26, 0xa0, 0x87, 0x9d, 0x98, 0x9f, 0x85, 0x9d, 0x68, 0x9b, 0x7d, 0x9c, 0xfe, 0x9c, 0x7c, 0x99, 0xba, 0xa0, 0xfc, 0x9e, 0x64, 0xa1, 0x0b, 0x9c, 0x40, 0x9c, 0x4e, 0x9e, 0x58, 0x9f, + 0xc0, 0x9f, 0x8f, 0xa0, 0xdc, 0x9d, 0x69, 0x9c, 0x4c, 0x91, 0xfc, 0xa0, 0x6b, 0xa2, 0x28, 0xa0, 0xb0, 0x9a, 0x22, 0xa0, 0x07, 0x9c, 0x0a, 0x14, 0x1d, 0xa0, 0xd8, 0x9a, 0x29, 0x9d, 0xaa, 0x99, + 0x3f, 0x9f, 0xb8, 0x9c, 0xb4, 0xa0, 0x84, 0xa1, 0x23, 0xa0, 0xbe, 0x9c, 0x61, 0x9c, 0x5c, 0x9e, 0x03, 0x9c, 0x4e, 0xa1, 0x40, 0x10, 0xdf, 0x9a, 0x3a, 0x9d, 0x4a, 0x9c, 0x33, 0x98, 0x1e, 0x9f, + 0xd6, 0x9d, 0xd8, 0x9d, 0x08, 0x9d, 0xe4, 0x9f, 0x9a, 0x9d, 0x5e, 0x9f, 0x52, 0x9e, 0xfc, 0x9e, 0x15, 0x9d, 0xa6, 0x9e, 0x40, 0x9d, 0xbe, 0x9d, 0x71, 0x9d, 0x70, 0x9d, 0x86, 0x9d, 0xb9, 0x9d, + 0x4d, 0x9e, 0xf1, 0x9d, 0x09, 0x9f, 0xe1, 0x9e, 0x6a, 0x9e, 0x76, 0x9e, 0x24, 0x9e, 0x20, 0x9f, 0xb2, 0x9d, 0xd7, 0x9c, 0xa5, 0x9d, 0x72, 0x9d, 0x5f, 0xa0, 0x5c, 0x9e, 0xb0, 0x9d, 0x98, 0x9f, + 0xb9, 0x9e, 0xb1, 0x9e, 0x0b, 0x9e, 0x1c, 0x9d, 0x1a, 0x9f, 0x9b, 0x9e, 0x31, 0x9f, 0x9a, 0x9d, 0x48, 0x9f, 0x8a, 0x9e, 0x20, 0x9e, 0x46, 0x9e, 0x0d, 0xa0, 0x28, 0x9f, 0x7d, 0x9f, 0x12, 0x9d, + 0xca, 0x9f, 0x36, 0x9e, 0x18, 0x9e, 0x9c, 0x9c, 0x72, 0x9d, 0xaf, 0x9e, 0x39, 0x9e, 0x0c, 0x9e, 0xa4, 0x9d, 0x1e, 0x9f, 0xa9, 0x9d, 0xb5, 0x9f, 0x5d, 0xa0, 0xef, 0x9c, 0x83, 0x9c, 0xde, 0x9d, + 0xcb, 0x9d, 0x10, 0x9d, 0xaa, 0x9d, 0x73, 0x9c, 0x6a, 0x9d, 0x80, 0x9e, 0x06, 0xa0, 0xa0, 0x9d, 0xf0, 0x9d, 0x6c, 0x9f, 0xf9, 0x9c, 0xaa, 0x9d, 0x6e, 0x9b, 0x40, 0x9e, 0xea, 0x9d, 0xac, 0x9e, + 0x7e, 0xa0, 0x9c, 0x9f, 0x71, 0x9d, 0xc8, 0x9e, 0x02, 0x9f, 0x20, 0x9c, 0x5d, 0x9f, 0xf5, 0x9c, 0x48, 0x9d, 0x89, 0x9e, 0x0e, 0x9f, 0xec, 0x9d, 0x1a, 0x9a, 0x6f, 0x9e, 0xe8, 0x9f, 0x2f, 0x9f, + 0x5e, 0x9e, 0x02, 0x9e, 0x30, 0x9e, 0xf7, 0x9e, 0x7c, 0x9d, 0xc5, 0x9f, 0xaa, 0x9b, 0x48, 0x9f, 0xcf, 0x9d, 0x13, 0xa0, 0xff, 0x9d, 0xc7, 0x9d, 0x45, 0x9f, 0x75, 0x9f, 0xa2, 
0x9f, 0x4b, 0x9d, + 0x18, 0x9f, 0xf8, 0x9e, 0x0f, 0x9f, 0xd2, 0x9e, 0x4b, 0x9f, 0x03, 0x9a, 0x40, 0x9c, 0xda, 0x9d, 0x8c, 0x9d, 0x8a, 0x9e, 0x4d, 0x9e, 0xd6, 0x9b, 0x41, 0x9e, 0xe1, 0x9d, 0x54, 0x9a, 0xc8, 0x9d, + 0x03, 0x98, 0x88, 0x9a, 0xf5, 0x98, 0x8b, 0x99, 0xa0, 0x9b, 0x02, 0x9c, 0xf4, 0x97, 0x11, 0x98, 0x2f, 0x99, 0xf8, 0x9a, 0x12, 0x95, 0x5c, 0x92, 0x6e, 0x93, 0x94, 0x97, 0x41, 0x9c, 0x52, 0x9a, + 0x3c, 0x91, 0x9a, 0x95, 0x4a, 0x9b, 0x66, 0x95, 0x2f, 0x99, 0x99, 0x99, 0xe3, 0x98, 0x55, 0x99, 0x96, 0x9a, 0xeb, 0x99, 0xc0, 0x9b, 0x04, 0x95, 0xcf, 0x9c, 0x1a, 0x99, 0xab, 0x93, 0xdc, 0x9a, + 0xb3, 0x9c, 0x5c, 0x99, 0x16, 0x9b, 0x09, 0x9a, 0x9f, 0x9b, 0xf5, 0x96, 0x0c, 0x98, 0x8a, 0x8c, 0xe5, 0x99, 0xde, 0x99, 0x7c, 0x99, 0xf4, 0x99, 0x36, 0x99, 0xcc, 0x9b, 0x30, 0x99, 0x01, 0x96, + 0x7c, 0x9c, 0x7e, 0x98, 0x2a, 0x9c, 0x38, 0x97, 0x18, 0x8b, 0x50, 0x99, 0x85, 0x99, 0x54, 0x98, 0xf5, 0x93, 0xef, 0x9c, 0x34, 0x95, 0x73, 0x9c, 0x83, 0x9c, 0x18, 0x8d, 0xc8, 0x99, 0x4a, 0x9b, + 0x6a, 0x8f, 0x99, 0x99, 0x90, 0x96, 0x53, 0x99, 0x82, 0x96, 0x73, 0x9b, 0x06, 0x9a, 0xff, 0x97, 0x21, 0x98, 0x18, 0x9e, 0x50, 0x9a, 0xbd, 0x99, 0x0a, 0x90, 0xf3, 0x94, 0xe2, 0x9a, 0x9c, 0x99, + 0xc4, 0x98, 0xae, 0x9c, 0x98, 0x98, 0x10, 0x9a, 0xb0, 0x96, 0x5e, 0x97, 0xac, 0x9a, 0xe1, 0x97, 0xea, 0x0c, 0xe8, 0x9b, 0x36, 0x9c, 0x15, 0x9b, 0x4b, 0x91, 0xa0, 0x98, 0xa4, 0x9c, 0x63, 0x98, + 0xc2, 0x99, 0x88, 0x9a, 0xef, 0x98, 0x39, 0x9b, 0x18, 0x96, 0x63, 0x9f, 0xa0, 0x9a, 0xe6, 0x98, 0xe4, 0x97, 0x48, 0x9e, 0x7e, 0x94, 0x48, 0x90, 0x2a, 0x9b, 0x06, 0x9b, 0x38, 0x9c, 0x90, 0x98, + 0x86, 0x9a, 0xdd, 0x98, 0x25, 0x9c, 0xb2, 0x9c, 0xf9, 0x98, 0x5d, 0x95, 0x2d, 0x96, 0x3b, 0x9b, 0x40, 0x97, 0xd4, 0x9b, 0x58, 0x95, 0xc5, 0x93, 0x7b, 0x98, 0xcb, 0x98, 0x04, 0x8d, 0xbe, 0x98, + 0x2e, 0xb1, 0x6b, 0xb1, 0x14, 0xb2, 0x96, 0xaf, 0x72, 0xb1, 0x04, 0xb0, 0xa6, 0xaf, 0xc3, 0xaf, 0x62, 0xad, 0x94, 0xb1, 0x2e, 0xb0, 0xf2, 0xb2, 0x4f, 0xad, 0xe5, 0xb2, 0x9f, 0xb3, 0x65, 0xb2, + 0x4c, 0xb1, 0x9b, 0xb1, 0x02, 0xb2, 
0x40, 0xb0, 0x0e, 0xaf, 0x93, 0xb1, 0xbc, 0xb0, 0x5c, 0xb1, 0xe8, 0xb1, 0x89, 0xaf, 0x42, 0xb1, 0xe1, 0xae, 0xfe, 0xb1, 0xf8, 0xb2, 0x7e, 0xb1, 0x4a, 0xb3, + 0x59, 0xaf, 0xbb, 0xb0, 0xbe, 0xad, 0x7e, 0xb0, 0x71, 0xb1, 0x10, 0xb3, 0xa2, 0xb0, 0x4e, 0xaf, 0xa9, 0xb3, 0xaf, 0xb0, 0x10, 0xb3, 0x02, 0xad, 0x26, 0xb0, 0xd0, 0xb0, 0xfe, 0xb1, 0x0d, 0xb1, + 0x6e, 0xb3, 0x27, 0xb1, 0xe1, 0xb0, 0x4b, 0xb1, 0xec, 0xb0, 0xc2, 0xb1, 0xb6, 0xac, 0xd6, 0xb1, 0x6c, 0xac, 0xe8, 0xb3, 0x1a, 0xb1, 0x5c, 0xb2, 0x34, 0xb0, 0x17, 0xaf, 0x10, 0xb4, 0xa2, 0xb0, + 0x8b, 0xaf, 0xb9, 0xb1, 0xc8, 0xb0, 0xb0, 0x1f, 0x56, 0xb0, 0x63, 0xb0, 0x88, 0xad, 0x35, 0xac, 0x76, 0xb1, 0xda, 0xb2, 0x3f, 0xb0, 0x80, 0xaf, 0x0a, 0xb1, 0x3f, 0xb2, 0x8d, 0xb0, 0x8a, 0xb1, + 0x3e, 0xb3, 0x1e, 0xb2, 0x0e, 0xb1, 0xab, 0xb2, 0x84, 0xb2, 0xc0, 0xae, 0x62, 0xb0, 0xb0, 0xb0, 0x51, 0xb1, 0x10, 0xb3, 0x42, 0xb1, 0x0d, 0xb4, 0x21, 0xb0, 0xa4, 0xb0, 0xd4, 0xb0, 0x7e, 0xb3, + 0xb2, 0xb2, 0x27, 0xb3, 0x90, 0xb1, 0xad, 0xaf, 0x4f, 0xac, 0xec, 0xb0, 0x57, 0xb4, 0x05, 0xb4, 0xda, 0xaf, 0xe4, 0xb0, 0x58, 0xb1, 0xf6, 0xab, 0xe7, 0xb2, 0xf0, 0xae, 0x36, 0xb0, 0xec, 0xad, + 0x5e, 0xb2, 0x2e, 0xb1, 0x24, 0xb3, 0xb6, 0xb3, 0xfb, 0xb3, 0xfc, 0xaf, 0x3b, 0xb0, 0xac, 0xb0, 0x6e, 0xb0, 0xf8, 0xb3, 0x92, 0xab, 0xbe, 0xaf, 0x57, 0xb1, 0x47, 0xb0, 0x1f, 0xae, 0x5e, 0xb2, + 0x30, 0xa9, 0x79, 0xac, 0x86, 0xab, 0x2d, 0xaa, 0x66, 0xa5, 0x94, 0xad, 0x74, 0xae, 0xbb, 0xb0, 0x84, 0xaf, 0xb8, 0xad, 0x16, 0x29, 0xae, 0xac, 0x42, 0xa6, 0xf6, 0xac, 0x0d, 0xaa, 0xf9, 0xa7, + 0xdc, 0xac, 0x50, 0xa6, 0x3c, 0xb0, 0x30, 0xae, 0x50, 0xae, 0x80, 0xae, 0x0a, 0xac, 0x46, 0xac, 0xfc, 0xaf, 0xc5, 0xa7, 0xb0, 0xab, 0x10, 0xab, 0x0a, 0xb1, 0xaa, 0xa9, 0x00, 0x0a, 0x94, 0xad, + 0xa7, 0xaf, 0x0a, 0xae, 0x49, 0xae, 0x07, 0xad, 0x1a, 0xa9, 0xb5, 0xa9, 0xc2, 0xae, 0xb0, 0xaa, 0x74, 0xa4, 0xaa, 0xad, 0xe3, 0xab, 0xe9, 0xb0, 0xc6, 0xad, 0x17, 0xab, 0x59, 0xab, 0x82, 0xac, + 0x93, 0xae, 0x28, 0xaa, 0x24, 0xb0, 0x8a, 0xa9, 0x20, 0x1a, 0xc6, 0xa4, 0x26, 0xb1, 0xb9, 0xab, 
0x14, 0xaa, 0x14, 0xb0, 0x26, 0xac, 0x4f, 0xae, 0xa0, 0xb1, 0x1a, 0xa7, 0xc2, 0xaa, 0x08, 0xad, + 0xe0, 0x93, 0x09, 0xac, 0x7e, 0xad, 0x3c, 0xac, 0xf0, 0x1a, 0x96, 0xac, 0x9c, 0xaf, 0x09, 0xad, 0x50, 0xab, 0x62, 0xae, 0x94, 0xac, 0x61, 0xaf, 0xa9, 0x20, 0xac, 0xa7, 0xe0, 0xa2, 0x54, 0xa6, + 0xcb, 0xae, 0xce, 0xb0, 0x4d, 0xae, 0x2a, 0xa4, 0x43, 0xae, 0xa2, 0xae, 0xec, 0xae, 0x98, 0x9f, 0x7d, 0xa8, 0x96, 0xaa, 0xc6, 0xae, 0x80, 0xa4, 0xb4, 0x24, 0x15, 0xad, 0xd8, 0xa6, 0x0e, 0xa5, + 0x24, 0xad, 0xf8, 0xa5, 0xee, 0xae, 0xf6, 0xae, 0x4e, 0x21, 0x44, 0xb1, 0x5f, 0x25, 0xdf, 0xae, 0x74, 0xa3, 0x25, 0xac, 0xd2, 0xab, 0x87, 0xad, 0x92, 0xa8, 0xa8, 0xaf, 0x9b, 0xb0, 0xc8, 0xa7, + 0x83, 0xb1, 0x88, 0xad, 0x27, 0xab, 0x56, 0xb0, 0xc8, 0xac, 0x10, 0xa9, 0x35, 0xa4, 0xab, 0xaf, 0x7f, 0xa5, 0xb2, 0xad, 0x0e, 0xad, 0xf0, 0x9f, 0xb6, 0x25, 0xb0, 0xa6, 0x90, 0xa9, 0x2c, 0xae, + 0x7c, 0x2a, 0x08, 0x28, 0xda, 0x2a, 0x2a, 0x2c, 0x13, 0x25, 0xe8, 0x28, 0x1c, 0x2c, 0x8c, 0x2b, 0x24, 0x25, 0x2e, 0x2a, 0x06, 0xa3, 0x40, 0x2c, 0x8a, 0x25, 0x02, 0x2b, 0xbe, 0x29, 0x04, 0x2b, + 0xc5, 0x2d, 0x2e, 0x29, 0xfb, 0x2a, 0x45, 0x29, 0xb0, 0x2b, 0x31, 0x2c, 0xa4, 0x27, 0xd0, 0x2a, 0xe2, 0x2c, 0xb6, 0x26, 0xde, 0x26, 0x8d, 0x29, 0x36, 0x2b, 0xc1, 0x2b, 0x9a, 0x28, 0x59, 0x2c, + 0xf8, 0x28, 0x62, 0x2b, 0x31, 0x29, 0xe2, 0x1e, 0x64, 0x27, 0x8f, 0x2a, 0xac, 0x2b, 0x22, 0x29, 0x20, 0x29, 0xf4, 0x29, 0x26, 0x2c, 0xaa, 0x29, 0x88, 0x2d, 0x4e, 0x2a, 0xb0, 0x29, 0xc9, 0x28, + 0xa4, 0x2d, 0xb8, 0x26, 0x11, 0x28, 0xea, 0x29, 0x8c, 0x27, 0xc6, 0x28, 0x5e, 0x29, 0x0b, 0x28, 0xee, 0x1f, 0x86, 0x2d, 0x28, 0x29, 0xa2, 0x2b, 0xf0, 0x2a, 0x08, 0x29, 0xc6, 0x2a, 0x99, 0x28, + 0x46, 0x28, 0xec, 0x28, 0x8a, 0x29, 0x1f, 0x24, 0x72, 0x29, 0xd7, 0x26, 0x6c, 0x2b, 0x3c, 0x29, 0x1e, 0x28, 0xd4, 0x28, 0xc5, 0x28, 0x3b, 0x2b, 0x83, 0x22, 0x21, 0x28, 0xeb, 0x28, 0x3d, 0x26, + 0x1f, 0x2c, 0x44, 0x2c, 0x2d, 0x2c, 0x82, 0x2a, 0xbe, 0x2b, 0x8e, 0x29, 0x88, 0x2a, 0x27, 0x18, 0x62, 0x2c, 0xfc, 0x24, 0x0b, 0x2c, 0x9a, 0x29, 0x42, 0x21, 
0x95, 0x29, 0xf2, 0x26, 0x08, 0x2b, + 0x94, 0x2b, 0x9c, 0x28, 0xdf, 0x2a, 0x32, 0x25, 0x90, 0x23, 0x6c, 0x2b, 0xf0, 0x15, 0x3e, 0x2c, 0x60, 0x29, 0x64, 0x28, 0x02, 0x2b, 0xba, 0x28, 0x02, 0x2c, 0xbe, 0x28, 0x36, 0x2c, 0xc2, 0x25, + 0xda, 0x2d, 0x8c, 0x2a, 0x12, 0x2a, 0x66, 0x2d, 0x7f, 0x2d, 0x24, 0x26, 0x08, 0x28, 0xde, 0x2b, 0x0c, 0x26, 0xd4, 0x2a, 0xa6, 0x2a, 0xea, 0x2a, 0x56, 0x27, 0xba, 0x2a, 0x9b, 0x1e, 0xea, 0x28, + 0x0e, 0x23, 0x40, 0x20, 0x7c, 0x1d, 0x72, 0xa0, 0xa1, 0x27, 0x94, 0x20, 0xaa, 0x24, 0x16, 0x26, 0x6e, 0x1d, 0x06, 0x26, 0xf4, 0xa5, 0x8b, 0xa2, 0x80, 0x9c, 0xfa, 0x28, 0x6a, 0x27, 0x86, 0x24, + 0x80, 0x19, 0xf8, 0x1e, 0x20, 0x22, 0xc9, 0x9f, 0x70, 0x24, 0x3a, 0x24, 0x50, 0x18, 0xc2, 0x20, 0xa7, 0x29, 0xbb, 0x28, 0x69, 0x24, 0x96, 0x1d, 0xd8, 0x29, 0x12, 0x26, 0xb8, 0x9d, 0x93, 0xa0, + 0x99, 0x29, 0x1e, 0x21, 0xfc, 0x27, 0x28, 0x27, 0x90, 0x28, 0x00, 0x24, 0x68, 0x1d, 0xb8, 0xa1, 0xf8, 0x1a, 0x13, 0x24, 0x46, 0x26, 0x48, 0x27, 0x1c, 0x9e, 0x07, 0x28, 0xca, 0x20, 0x84, 0x21, + 0xe7, 0x28, 0x82, 0x1e, 0x89, 0x2a, 0xfd, 0x23, 0x17, 0xa1, 0x22, 0x1e, 0xd2, 0x28, 0xf6, 0x15, 0xef, 0xa3, 0x12, 0x2a, 0xbc, 0x25, 0xd4, 0x27, 0xbe, 0x28, 0x60, 0x1c, 0x4c, 0x29, 0x60, 0x26, + 0xc4, 0x9f, 0xe4, 0x26, 0xa0, 0x28, 0x54, 0x99, 0x50, 0x98, 0x64, 0x1d, 0x4a, 0x25, 0x8e, 0x28, 0x08, 0x27, 0xec, 0x29, 0xb2, 0x26, 0x24, 0x24, 0x1d, 0xa3, 0x5e, 0x23, 0x3d, 0xa0, 0x21, 0x25, + 0xbf, 0x25, 0xdc, 0x27, 0xf9, 0x26, 0x87, 0x24, 0x3a, 0x26, 0xa2, 0x22, 0xbd, 0x26, 0xfe, 0x1e, 0xc0, 0x9e, 0xf0, 0x19, 0xd9, 0x26, 0x26, 0x20, 0x70, 0x96, 0x4d, 0x21, 0x59, 0x1c, 0xb0, 0x25, + 0x0f, 0x29, 0x84, 0x23, 0x47, 0x27, 0x0a, 0x26, 0x1e, 0xa7, 0x84, 0x2a, 0xcc, 0x27, 0x1c, 0x28, 0x54, 0xa0, 0x8c, 0x2a, 0x00, 0x97, 0xa2, 0x9d, 0xfa, 0xa5, 0x77, 0x25, 0xb8, 0x2b, 0x99, 0x1d, + 0x54, 0x28, 0x7c, 0x24, 0x53, 0x23, 0x4b, 0x29, 0x94, 0x24, 0xaa, 0x2a, 0x00, 0x85, 0x66, 0x28, 0xf4, 0x1b, 0x31, 0x2c, 0x4c, 0x1a, 0xd8, 0xa2, 0x84, 0xa0, 0xc4, 0x24, 0x74, 0x21, 0x0a, 0x25, + 0x1c, 0x9f, 0xc0, 
0x99, 0x7c, 0x9e, 0xe6, 0x9f, 0xb3, 0x9b, 0xbd, 0x9a, 0x5a, 0x9f, 0x3b, 0x9d, 0x38, 0x11, 0x92, 0x9d, 0xc4, 0x17, 0x0b, 0x9f, 0x87, 0x98, 0xc1, 0x9f, 0xc3, 0x9e, 0x21, 0xa0, + 0xb2, 0xa1, 0xb9, 0x9d, 0x78, 0x9c, 0x44, 0x99, 0xbe, 0x9e, 0x72, 0x9f, 0x14, 0x99, 0x60, 0x9e, 0xdc, 0xa0, 0xe1, 0x9c, 0xe8, 0x99, 0x0e, 0x9d, 0x9d, 0x9d, 0x82, 0xa0, 0x3e, 0x9d, 0x00, 0x9f, + 0x1e, 0x9c, 0x27, 0x9e, 0x7a, 0x9c, 0xfd, 0x0a, 0xe6, 0x9c, 0x26, 0x9f, 0xdf, 0x9d, 0x9c, 0x9b, 0xbc, 0x9d, 0xf0, 0x9c, 0x8e, 0xa0, 0x07, 0x9a, 0xf2, 0xa0, 0x48, 0x9f, 0x58, 0x9d, 0xc7, 0x9b, + 0xff, 0xa1, 0x6e, 0x99, 0x29, 0x9a, 0x6b, 0x9e, 0x20, 0x9c, 0x67, 0x9d, 0xc2, 0x99, 0xbd, 0x99, 0xe6, 0x14, 0xaa, 0xa1, 0x1e, 0x9d, 0x5d, 0x9f, 0x3a, 0x9c, 0x4f, 0x9d, 0x39, 0xa0, 0x1c, 0x9c, + 0xb4, 0x9c, 0x22, 0x9d, 0x92, 0x9d, 0x60, 0x85, 0x8a, 0x9e, 0xb8, 0x97, 0xe8, 0x9d, 0x6b, 0x9d, 0x65, 0x9c, 0xc2, 0x9c, 0x9f, 0x9c, 0x92, 0x9d, 0x38, 0x96, 0xab, 0x9c, 0x04, 0x9d, 0xda, 0x9b, + 0x6a, 0x9f, 0xaf, 0x9e, 0x08, 0xa0, 0x14, 0xa0, 0x32, 0x9f, 0x94, 0x9b, 0x7e, 0x9d, 0x87, 0x8e, 0x7e, 0xa0, 0x89, 0x95, 0x76, 0x9f, 0x8a, 0x9e, 0x3f, 0x98, 0x85, 0x9c, 0xf2, 0x9a, 0x73, 0xa0, + 0x30, 0xa0, 0x7c, 0x9d, 0x02, 0x9e, 0xe6, 0x90, 0xce, 0x94, 0xe6, 0x9d, 0x00, 0x99, 0x16, 0xa0, 0x85, 0x9d, 0xae, 0x9d, 0x3d, 0x9e, 0xe3, 0x98, 0x33, 0x9f, 0x66, 0x99, 0x24, 0xa0, 0x66, 0x99, + 0xf3, 0xa0, 0xce, 0x9d, 0x1c, 0x9e, 0x39, 0xa1, 0xc8, 0xa1, 0x00, 0x9d, 0x46, 0x9c, 0x1e, 0x9f, 0x31, 0x9a, 0x41, 0xa0, 0x70, 0x9d, 0x5a, 0x9f, 0xb6, 0x9c, 0x11, 0xa0, 0x7a, 0x83, 0x3b, 0x9b, + 0xa9, 0x9a, 0x05, 0x90, 0x36, 0x96, 0xa0, 0x8d, 0xdf, 0x9c, 0xc6, 0x8d, 0x49, 0x99, 0x84, 0x96, 0x56, 0x18, 0x14, 0x9a, 0xd6, 0x19, 0x3c, 0x14, 0x6e, 0x0e, 0x20, 0x9e, 0x22, 0x9d, 0x75, 0x9c, + 0xde, 0x98, 0x8d, 0x98, 0x30, 0x87, 0xb8, 0x18, 0xbe, 0x98, 0xda, 0x98, 0xda, 0x0c, 0x80, 0x97, 0x6d, 0x9e, 0xfb, 0x9d, 0xe0, 0x97, 0xb2, 0x94, 0x86, 0x9c, 0x3d, 0x9d, 0xf0, 0x93, 0xfc, 0x12, + 0xa0, 0x9c, 0x11, 0x95, 0x08, 0x9b, 0xd2, 0x97, 0x96, 0x9d, 0x00, 0x9b, 0x6c, 
0x8c, 0xc8, 0x15, 0xee, 0x97, 0x37, 0x97, 0xfb, 0x9c, 0x97, 0x95, 0xe0, 0x8e, 0x76, 0x9d, 0x02, 0x97, 0xa8, 0x94, + 0xe5, 0x9e, 0x6c, 0x91, 0x0e, 0x9d, 0x92, 0x9a, 0x80, 0x85, 0x64, 0x98, 0xe4, 0x98, 0xf8, 0x0a, 0x36, 0x1b, 0x5a, 0x9f, 0x93, 0x9a, 0x63, 0x9c, 0xf0, 0x98, 0xe2, 0x96, 0x43, 0x9f, 0xf6, 0x99, + 0x14, 0x90, 0xe7, 0x9b, 0xd6, 0x9c, 0x54, 0x17, 0xe0, 0x97, 0x4c, 0x0c, 0x2c, 0x98, 0xe0, 0x9c, 0xd3, 0x9b, 0xa1, 0x9d, 0xf7, 0x9a, 0xc3, 0x95, 0xb0, 0x14, 0xad, 0x99, 0x80, 0x8b, 0xf6, 0x9a, + 0x3f, 0x9a, 0x01, 0x9a, 0x2b, 0x9c, 0xc4, 0x9c, 0xfc, 0x9a, 0x66, 0x91, 0xec, 0x99, 0x32, 0x93, 0x30, 0x95, 0x03, 0x0d, 0x76, 0x9b, 0xc6, 0x99, 0x49, 0x93, 0x75, 0x94, 0x84, 0x94, 0x8d, 0x9d, + 0x5c, 0x9e, 0x99, 0x9a, 0xd8, 0x9a, 0x96, 0x93, 0x51, 0x1a, 0x2b, 0x9d, 0x78, 0x9d, 0xa9, 0x9c, 0xb0, 0x8d, 0x67, 0x9f, 0x50, 0x8f, 0x9a, 0x17, 0x64, 0x16, 0x5d, 0x94, 0xb7, 0x9f, 0xa2, 0x93, + 0xfe, 0x9b, 0xb9, 0x98, 0x70, 0x99, 0x0a, 0x9e, 0x98, 0x9c, 0xe0, 0x9f, 0x28, 0x94, 0x56, 0x9c, 0x4c, 0x94, 0xe0, 0xa0, 0xf8, 0x8e, 0x38, 0x8f, 0x00, 0x93, 0xa8, 0x9c, 0xc0, 0x8f, 0xcc, 0x96, + 0xec, 0x98, 0x97, 0x98, 0xfa, 0x99, 0xdd, 0x9a, 0xea, 0x91, 0xb0, 0x99, 0xe6, 0x9b, 0x7c, 0x9c, 0x6a, 0x99, 0x28, 0x9a, 0x40, 0x13, 0xea, 0x9b, 0x38, 0x95, 0xdb, 0x99, 0x51, 0x98, 0xd2, 0x98, + 0xb8, 0x9c, 0x8d, 0x97, 0x33, 0x9c, 0xa7, 0x9a, 0x88, 0x9b, 0x0a, 0x9c, 0x65, 0x98, 0x21, 0x9a, 0x7e, 0x9c, 0x34, 0x94, 0x56, 0x97, 0x13, 0x99, 0x44, 0x9c, 0x8e, 0x99, 0xcb, 0x95, 0x35, 0x9c, + 0xf1, 0x99, 0x65, 0x9b, 0xab, 0x99, 0xf7, 0x94, 0x54, 0x95, 0x0e, 0x99, 0x09, 0x9c, 0x26, 0x99, 0x2a, 0x97, 0x2f, 0x9a, 0x74, 0x9a, 0xe7, 0x9b, 0xef, 0x9c, 0xde, 0x98, 0x21, 0x99, 0xff, 0x98, + 0xa6, 0x9c, 0xfd, 0x96, 0x8c, 0x99, 0xa0, 0x98, 0xd6, 0x94, 0xb3, 0x96, 0xbf, 0x9b, 0x73, 0x98, 0x3a, 0x95, 0xe1, 0x9c, 0xbb, 0x98, 0x29, 0x9b, 0x9b, 0x9c, 0xb6, 0x97, 0xcf, 0x98, 0xce, 0x98, + 0x94, 0x95, 0x6a, 0x98, 0x54, 0x99, 0x20, 0x97, 0x68, 0x96, 0x5d, 0x98, 0xfb, 0x9b, 0xea, 0x98, 0x6e, 0x97, 0x1e, 0x99, 0xa2, 0x98, 0xdc, 
0x9b, 0x17, 0x90, 0x43, 0x96, 0x3e, 0x97, 0x68, 0x94, + 0xfe, 0x9b, 0xba, 0x9c, 0xac, 0x9b, 0x02, 0x98, 0x5d, 0x9b, 0x89, 0x9a, 0xf0, 0x9a, 0x7d, 0x87, 0xa0, 0x9a, 0x5c, 0x96, 0xc7, 0x9b, 0x82, 0x97, 0x72, 0x86, 0xcb, 0x99, 0xc7, 0x95, 0x53, 0x98, + 0x47, 0x9a, 0x5e, 0x96, 0x17, 0x9b, 0xab, 0x98, 0xf8, 0x91, 0x62, 0x9c, 0x89, 0x11, 0xf0, 0x9b, 0xe0, 0x97, 0x13, 0x97, 0x56, 0x9a, 0xed, 0x99, 0x9f, 0x9a, 0x61, 0x9a, 0x36, 0x9c, 0x40, 0x95, + 0x05, 0x9e, 0x6d, 0x9a, 0x2f, 0x99, 0x06, 0x9d, 0x58, 0x9c, 0x6f, 0x93, 0xf6, 0x95, 0xef, 0x9b, 0xe6, 0x94, 0x75, 0x99, 0x9a, 0x9a, 0xc2, 0x98, 0xaa, 0x92, 0x73, 0x98, 0xd3, 0x92, 0xbc, 0x99, + 0x94, 0x90, 0x6f, 0x94, 0x2e, 0x90, 0x79, 0x0d, 0x00, 0x95, 0x5c, 0x95, 0x64, 0x96, 0x5d, 0x99, 0xcc, 0x97, 0x3f, 0x97, 0x95, 0x15, 0xe8, 0x05, 0xd7, 0x01, 0x3b, 0x98, 0x61, 0x95, 0x10, 0x90, + 0xd8, 0x8b, 0x40, 0x89, 0x09, 0x98, 0xa5, 0x92, 0x53, 0x96, 0x1d, 0x96, 0x33, 0x92, 0x86, 0x92, 0xb2, 0x99, 0x68, 0x96, 0x5e, 0x95, 0xc3, 0x90, 0x70, 0x9b, 0x21, 0x93, 0x6a, 0x11, 0x61, 0x8c, + 0x73, 0x9a, 0x04, 0x95, 0xb5, 0x98, 0xa7, 0x98, 0xb7, 0x96, 0xa4, 0x91, 0xf4, 0x94, 0x40, 0x03, 0x68, 0x05, 0x1a, 0x96, 0xa6, 0x94, 0x49, 0x9a, 0x88, 0x8a, 0x16, 0x96, 0x28, 0x92, 0x8c, 0x94, + 0x32, 0x98, 0xd3, 0x91, 0x86, 0x9b, 0xf8, 0x91, 0x7c, 0x12, 0x88, 0x84, 0x4f, 0x9b, 0x74, 0x91, 0x64, 0x87, 0xc8, 0x99, 0xce, 0x95, 0x2f, 0x98, 0x75, 0x9b, 0x48, 0x88, 0x40, 0x97, 0x59, 0x97, + 0xa0, 0x11, 0x78, 0x96, 0x98, 0x98, 0x94, 0x92, 0x77, 0x11, 0x54, 0x94, 0x29, 0x98, 0x5f, 0x98, 0x78, 0x96, 0xfe, 0x99, 0xfe, 0x96, 0x76, 0x97, 0xd1, 0x12, 0x34, 0x91, 0xa5, 0x10, 0x08, 0x93, + 0x65, 0x97, 0xc7, 0x99, 0x91, 0x97, 0xe6, 0x8c, 0x52, 0x97, 0xd7, 0x96, 0x68, 0x98, 0xce, 0x8d, 0x40, 0x10, 0xec, 0x91, 0x0c, 0x98, 0x40, 0x83, 0x0a, 0x10, 0xc7, 0x94, 0x5a, 0x8c, 0xca, 0x8f, + 0x42, 0x98, 0x00, 0x90, 0x81, 0x98, 0x01, 0x99, 0xbb, 0x15, 0x05, 0x9c, 0x58, 0x92, 0x6e, 0x98, 0xe8, 0x10, 0x42, 0x99, 0x6a, 0x8c, 0x52, 0x92, 0x5a, 0x14, 0xc5, 0x98, 0xdb, 0x9b, 0x0c, 0x8f, + 
0x21, 0x9a, 0xf9, 0x95, 0x2c, 0x93, 0xa3, 0x99, 0xe4, 0x92, 0xbc, 0x98, 0x06, 0x08, 0x27, 0x99, 0xd3, 0x8a, 0xb4, 0x9a, 0x68, 0x92, 0x49, 0x14, 0x58, 0x14, 0xce, 0x8f, 0x46, 0x94, 0xa4, 0x97, + 0xb1, 0xae, 0x08, 0xa5, 0xb7, 0xac, 0x08, 0xad, 0xcc, 0xad, 0xea, 0xa4, 0x6a, 0xad, 0xeb, 0xa8, 0xb9, 0x2b, 0xa0, 0xac, 0x91, 0x29, 0x14, 0xaa, 0x58, 0xa3, 0x43, 0xb0, 0x9c, 0xaf, 0x39, 0xb0, + 0x41, 0xb0, 0x3b, 0xad, 0x6e, 0xa4, 0x6c, 0x25, 0xd5, 0xac, 0x51, 0xad, 0x80, 0xa0, 0xbc, 0xac, 0xae, 0xb0, 0x28, 0xaf, 0x00, 0xa9, 0x00, 0xab, 0x9c, 0xac, 0xa8, 0xb0, 0x63, 0xac, 0xd5, 0xa9, + 0x3c, 0xac, 0x6b, 0xab, 0x0d, 0xac, 0x80, 0x10, 0xbd, 0xae, 0xc7, 0xae, 0x44, 0xa9, 0x98, 0xa2, 0x26, 0xad, 0xa6, 0xaa, 0x76, 0xb0, 0x2e, 0xa1, 0xd0, 0xad, 0x0a, 0xb0, 0x13, 0xac, 0xda, 0xa8, + 0xd1, 0xb1, 0x78, 0xa6, 0x4d, 0xab, 0x22, 0xae, 0x04, 0xaa, 0x0b, 0xad, 0xae, 0xa4, 0x30, 0xa3, 0x28, 0x2c, 0x8d, 0xb1, 0xe0, 0xac, 0xb0, 0xae, 0x94, 0xa7, 0x80, 0xac, 0x1a, 0xb1, 0x75, 0xab, + 0x4e, 0xab, 0x61, 0xad, 0xf2, 0xad, 0x94, 0x28, 0x0a, 0xae, 0x80, 0x8d, 0x6f, 0xab, 0xfa, 0xad, 0xee, 0xac, 0xb6, 0xad, 0x8e, 0xac, 0x2b, 0xaa, 0xa0, 0x9e, 0xcd, 0xac, 0xf5, 0xaa, 0xd0, 0xac, + 0xb7, 0xad, 0x69, 0xac, 0x05, 0xaf, 0x6a, 0xb0, 0xf4, 0xad, 0xee, 0xa5, 0x37, 0xac, 0x68, 0xa2, 0xac, 0xae, 0x80, 0x14, 0x2d, 0xae, 0x59, 0xae, 0xdb, 0xa8, 0x62, 0xa9, 0x9e, 0xa9, 0xec, 0xb0, + 0x8a, 0xb0, 0xce, 0xad, 0xe6, 0xac, 0x24, 0x21, 0xd9, 0x24, 0x26, 0xad, 0xf0, 0xad, 0x4e, 0xaf, 0xcc, 0xab, 0x15, 0xb0, 0x76, 0xab, 0xff, 0x23, 0xbd, 0xaa, 0x04, 0xa2, 0x79, 0xb0, 0x2a, 0xa8, + 0x04, 0xaf, 0x5c, 0xac, 0x51, 0xad, 0xb7, 0xb0, 0x15, 0xb1, 0x36, 0xb0, 0xf2, 0xaa, 0x23, 0xae, 0x2e, 0xa9, 0xb6, 0xb1, 0xbc, 0xa9, 0x5d, 0xad, 0x49, 0xac, 0x4a, 0xb0, 0x10, 0x1d, 0x64, 0xa8, + 0x34, 0xac, 0xc2, 0xac, 0x7d, 0xab, 0x81, 0xb0, 0xd3, 0xac, 0x44, 0xad, 0xcd, 0xa8, 0x9d, 0xa7, 0xec, 0xad, 0xc5, 0xab, 0x74, 0xad, 0x20, 0xaa, 0xc8, 0xab, 0xd6, 0xac, 0xef, 0xac, 0x1e, 0xae, + 0xdb, 0xa9, 0xb0, 0xab, 0xb9, 0xac, 0xba, 0xae, 0xa2, 0xae, 
0x33, 0xac, 0x4e, 0xac, 0xad, 0xa8, 0xa7, 0xab, 0xdc, 0xad, 0x39, 0xae, 0x53, 0xaa, 0x2a, 0xaf, 0x5c, 0xab, 0xe9, 0xa6, 0xbd, 0xad, + 0x6a, 0xae, 0xd2, 0xa8, 0x1c, 0xae, 0x56, 0xae, 0xde, 0xae, 0xd0, 0xad, 0x4c, 0xaa, 0x78, 0xa7, 0x5b, 0xac, 0xcd, 0xad, 0x9a, 0xac, 0x13, 0xac, 0x02, 0xaf, 0x45, 0xad, 0x20, 0xac, 0x55, 0xab, + 0x2d, 0xaf, 0x3a, 0xac, 0xd2, 0xac, 0xa7, 0xa7, 0x3d, 0xac, 0xbe, 0xae, 0x36, 0xab, 0x48, 0xad, 0xb4, 0xa9, 0x25, 0xad, 0x87, 0xab, 0x63, 0xae, 0x56, 0xac, 0x35, 0xa8, 0x69, 0xac, 0x24, 0xaf, + 0x55, 0xad, 0xa2, 0xac, 0xfc, 0xa8, 0xad, 0xab, 0x21, 0xad, 0x09, 0xae, 0xa6, 0xad, 0x82, 0xac, 0x32, 0xad, 0x1e, 0xad, 0x0e, 0xac, 0x8a, 0xac, 0x24, 0xad, 0x74, 0xae, 0xa4, 0xae, 0x6a, 0xae, + 0xa2, 0xad, 0x9a, 0xac, 0x6e, 0xab, 0x1f, 0xaf, 0x06, 0xac, 0xd0, 0xac, 0x76, 0xac, 0x09, 0xac, 0x60, 0xab, 0xde, 0xad, 0x88, 0xad, 0xfc, 0xab, 0x1b, 0xac, 0xe3, 0xa9, 0xc2, 0xae, 0x62, 0xad, + 0xdc, 0xa9, 0xb1, 0xad, 0x28, 0xab, 0x04, 0xae, 0xf8, 0xae, 0x8d, 0xad, 0x5e, 0xab, 0xa8, 0xad, 0x32, 0xaa, 0xa8, 0xaf, 0x57, 0xaa, 0x9c, 0xa8, 0x81, 0xad, 0x0f, 0xb0, 0x70, 0xad, 0x53, 0xac, + 0x9e, 0xa9, 0x14, 0xab, 0x6f, 0xad, 0x96, 0xaa, 0xff, 0xad, 0x0e, 0xab, 0xf6, 0xab, 0x58, 0xad, 0x09, 0xad, 0x95, 0xae, 0xc1, 0xac, 0x04, 0xad, 0x2d, 0xb0, 0xb2, 0xae, 0xce, 0x9f, 0xfa, 0xa9, + 0xaa, 0x29, 0x31, 0x29, 0xbc, 0x29, 0x46, 0x2c, 0x79, 0x2a, 0xa0, 0x2a, 0x19, 0x29, 0x16, 0x2a, 0xde, 0x2a, 0x0e, 0x29, 0x03, 0x2c, 0x18, 0x2a, 0xdb, 0x28, 0x54, 0x29, 0x0e, 0x2b, 0xda, 0x29, + 0x9f, 0x2a, 0xe6, 0x29, 0x3a, 0x2b, 0x13, 0x2a, 0x59, 0x2a, 0xa8, 0x28, 0xaa, 0x29, 0x44, 0x2a, 0xa4, 0x29, 0xb8, 0x29, 0xf0, 0x29, 0xce, 0x2a, 0x36, 0x2b, 0x12, 0x2b, 0x5e, 0x2a, 0x68, 0x2c, + 0x26, 0x2a, 0x46, 0x28, 0x29, 0x2a, 0x7f, 0x2a, 0xd6, 0x2a, 0x76, 0x2c, 0xc5, 0x2a, 0x5d, 0x28, 0xce, 0x2b, 0x23, 0x2c, 0x68, 0x2b, 0xb2, 0x29, 0x1a, 0x2b, 0x66, 0x2a, 0xe4, 0x2a, 0xd4, 0x2a, + 0x58, 0x2c, 0x02, 0x2a, 0xe1, 0x2a, 0xce, 0x29, 0x73, 0x2b, 0x7e, 0x2a, 0xdc, 0x29, 0x4f, 0x2c, 0x68, 0x28, 0x64, 0x2b, 
0x9b, 0x2a, 0x01, 0x2b, 0xc2, 0x29, 0x38, 0x28, 0x12, 0x2a, 0x7e, 0x2b, + 0xd3, 0x2b, 0xca, 0x29, 0xe4, 0x26, 0xe9, 0x28, 0x84, 0x29, 0xbc, 0x2a, 0xfc, 0x2a, 0xac, 0x2b, 0xf0, 0x2a, 0x9f, 0x2a, 0x7e, 0x27, 0x6b, 0x29, 0x08, 0x2b, 0x89, 0x2b, 0x14, 0x2a, 0xc0, 0x2a, + 0x42, 0x2c, 0x03, 0x2c, 0xda, 0x27, 0x33, 0x2c, 0x95, 0x2b, 0xc9, 0x28, 0x1c, 0x2c, 0xac, 0x29, 0xd0, 0x29, 0x00, 0x2c, 0x3d, 0x2a, 0xb4, 0x29, 0x58, 0x2b, 0x40, 0x2a, 0x51, 0x2b, 0x02, 0x2c, + 0x88, 0x2a, 0xe2, 0x2a, 0x8e, 0x28, 0x60, 0x2a, 0x5b, 0x2c, 0x6e, 0x2a, 0x82, 0x2b, 0x8c, 0x2b, 0x14, 0x29, 0x53, 0x2a, 0x26, 0x2a, 0x16, 0x2a, 0x02, 0x2c, 0x7c, 0x2c, 0xf6, 0x2a, 0xcd, 0x29, + 0xa8, 0x29, 0x90, 0x2a, 0x51, 0x2c, 0x15, 0x28, 0xa4, 0x2b, 0x5a, 0x2a, 0x43, 0x2b, 0xce, 0x2a, 0xd4, 0x2a, 0xec, 0x2a, 0x00, 0x2c, 0xb1, 0x2a, 0x8d, 0x2c, 0x09, 0x2a, 0x25, 0x27, 0xde, 0x2a, + 0xca, 0x24, 0x0e, 0x25, 0x79, 0x20, 0x55, 0x26, 0xaa, 0x24, 0xda, 0x21, 0xb0, 0x18, 0x00, 0x1f, 0x2e, 0x24, 0x9a, 0x23, 0xbc, 0x21, 0x09, 0x20, 0x10, 0x1d, 0x73, 0x26, 0xa6, 0x28, 0x6a, 0x28, + 0x88, 0x1e, 0x5c, 0x20, 0xa0, 0x27, 0xe6, 0x24, 0x2c, 0x24, 0x5a, 0x24, 0x08, 0x1f, 0xac, 0x1e, 0x2e, 0x24, 0x52, 0x24, 0xe9, 0x27, 0xb5, 0x20, 0xdc, 0x26, 0xdc, 0x21, 0x20, 0x98, 0x83, 0x26, + 0x5a, 0x24, 0x56, 0x1f, 0x7b, 0x1c, 0x0f, 0x24, 0xe1, 0x27, 0x26, 0x22, 0xa6, 0x24, 0xca, 0x9c, 0x63, 0x25, 0x70, 0x20, 0xba, 0x24, 0xc2, 0x20, 0xfe, 0x27, 0xb6, 0x25, 0x4a, 0x1e, 0x58, 0x20, + 0x38, 0x29, 0x20, 0x25, 0x8a, 0x22, 0x20, 0x90, 0x14, 0x1e, 0x80, 0x28, 0xc4, 0x1e, 0x65, 0x25, 0xe2, 0x1f, 0xe5, 0x25, 0x9c, 0x25, 0xdc, 0x24, 0xe4, 0x24, 0xaa, 0x9d, 0x86, 0x26, 0xcb, 0x27, + 0xe0, 0x1a, 0xce, 0x1d, 0x15, 0x1e, 0x14, 0x21, 0x86, 0x25, 0x42, 0x27, 0x04, 0x21, 0x8c, 0x1e, 0xc5, 0x21, 0xfa, 0x26, 0xcc, 0x11, 0x44, 0x25, 0x88, 0x1d, 0xb1, 0x26, 0x3e, 0x25, 0x1c, 0x26, + 0xe2, 0x20, 0x72, 0x24, 0x84, 0x25, 0x02, 0x27, 0x0a, 0x24, 0x52, 0x24, 0x86, 0x20, 0x68, 0x22, 0xf4, 0x1b, 0xec, 0x25, 0x5a, 0x22, 0x97, 0x24, 0xac, 0x25, 0x8a, 0x1f, 0x20, 0x25, 
0x17, 0x27, + 0xd2, 0x25, 0xae, 0x26, 0x7e, 0x20, 0xea, 0x24, 0x10, 0x23, 0xba, 0x25, 0x44, 0x28, 0x66, 0x24, 0xa8, 0x24, 0xb6, 0x26, 0xfe, 0x20, 0xa8, 0x9c, 0x0f, 0x28, 0xb7, 0x22, 0x78, 0x21, 0x28, 0x23, + 0x3e, 0x21, 0xac, 0x25, 0x7c, 0x25, 0xc6, 0x26, 0x68, 0x27, 0x4d, 0x21, 0xfc, 0x26, 0xec, 0x23, 0x28, 0x21, 0x80, 0x24, 0x56, 0x21, 0x5c, 0x24, 0x0e, 0x26, 0xc4, 0x24, 0x6e, 0x23, 0x0e, 0x22, + 0x84, 0x9d, 0xb4, 0x9c, 0x55, 0x9d, 0xb6, 0x9e, 0x20, 0x9e, 0xba, 0x9d, 0xf4, 0x9c, 0xa3, 0x9e, 0xec, 0x9d, 0xc0, 0x9c, 0x4a, 0x9f, 0x16, 0x9e, 0x01, 0x9c, 0x1b, 0x9d, 0xb3, 0x9f, 0x86, 0x9d, + 0xb6, 0x9e, 0x76, 0x9d, 0xae, 0x9f, 0xae, 0x9c, 0xeb, 0x9c, 0x37, 0x9c, 0xda, 0x9c, 0x9a, 0x9e, 0x90, 0x9d, 0x9b, 0x9c, 0x63, 0x9d, 0xf3, 0x9e, 0x36, 0x9e, 0x1b, 0x9f, 0x9e, 0x9e, 0x6a, 0xa0, + 0xd5, 0x9c, 0x37, 0x9c, 0x5b, 0x9c, 0x3c, 0x9d, 0x1e, 0x9e, 0x1d, 0xa0, 0x60, 0x9f, 0x00, 0x9c, 0x0e, 0xa0, 0x46, 0x9f, 0x61, 0x9f, 0x24, 0x9d, 0x62, 0x9e, 0x01, 0x9e, 0x63, 0x9e, 0xaa, 0x9e, + 0x58, 0xa0, 0xfa, 0x9d, 0x56, 0x9e, 0xec, 0x9d, 0xfd, 0x9e, 0xfe, 0x9d, 0x73, 0x9d, 0x4e, 0xa0, 0x29, 0x9c, 0x4f, 0x9f, 0x0c, 0x9f, 0xf9, 0x9d, 0x94, 0x9d, 0x52, 0x9b, 0x3c, 0x9e, 0xbe, 0x9e, + 0xb8, 0x9e, 0xc4, 0x9c, 0x38, 0x9a, 0x5b, 0x9c, 0xfd, 0x9c, 0x4f, 0x9e, 0xe0, 0x9d, 0x20, 0x9f, 0x22, 0x9e, 0x9c, 0x9e, 0xbe, 0x98, 0x22, 0x9d, 0xf9, 0x9d, 0xf0, 0x9e, 0xcc, 0x9c, 0xe7, 0x9d, + 0xb4, 0x9f, 0x09, 0xa0, 0xaa, 0x9b, 0xaf, 0x9f, 0xc2, 0x9f, 0x0a, 0x9c, 0xf0, 0x9f, 0x4b, 0x9d, 0x36, 0x9d, 0xa8, 0x9f, 0x2d, 0x9d, 0xa2, 0x9d, 0xc4, 0x9f, 0x4f, 0x9e, 0x34, 0x9e, 0x15, 0xa0, + 0x71, 0x9f, 0x8e, 0x9e, 0x02, 0x9c, 0x6c, 0x9d, 0x72, 0x9f, 0xe7, 0x9d, 0x60, 0xa0, 0xec, 0x9e, 0x60, 0x9d, 0xda, 0x9c, 0x30, 0x9e, 0xd8, 0x9d, 0x26, 0xa0, 0x1b, 0x9f, 0x00, 0x9e, 0x5f, 0x9d, + 0xd0, 0x9d, 0x20, 0x9f, 0x47, 0xa0, 0x80, 0x9c, 0x74, 0x9f, 0x46, 0x9e, 0xfe, 0x9f, 0x1b, 0x9e, 0x07, 0x9e, 0xb3, 0x9d, 0x99, 0x9f, 0x3a, 0x9e, 0xae, 0x9f, 0xa1, 0x9c, 0xc5, 0x9c, 0x40, 0x9f, + 0xcd, 0x99, 0x26, 0x99, 0x8e, 0x96, 0xd4, 
0x98, 0xa1, 0x99, 0x62, 0x96, 0x71, 0x94, 0xf0, 0x98, 0x33, 0x98, 0x76, 0x98, 0x0b, 0x98, 0x09, 0x98, 0x06, 0x91, 0xd8, 0x9a, 0xc6, 0x9d, 0x5f, 0x9c, + 0x22, 0x98, 0x78, 0x96, 0xf3, 0x9c, 0x20, 0x97, 0x06, 0x96, 0x78, 0x98, 0x1a, 0x94, 0x85, 0x98, 0x72, 0x99, 0x07, 0x97, 0x9b, 0x9b, 0xe6, 0x98, 0x5d, 0x9a, 0x40, 0x99, 0x46, 0x95, 0x60, 0x9c, + 0x9b, 0x96, 0x1b, 0x96, 0xd0, 0x07, 0xa1, 0x96, 0x9a, 0x9b, 0xa2, 0x98, 0xa4, 0x9b, 0xf0, 0x81, 0x07, 0x9c, 0x2e, 0x96, 0xb0, 0x9a, 0x22, 0x96, 0xcd, 0x9b, 0x58, 0x9a, 0x01, 0x96, 0x28, 0x98, + 0xeb, 0x9d, 0x72, 0x9a, 0x47, 0x98, 0xe8, 0x94, 0x88, 0x96, 0x66, 0x9c, 0xc0, 0x95, 0xbe, 0x9b, 0xb2, 0x95, 0x82, 0x9b, 0x01, 0x9c, 0xa0, 0x98, 0xdb, 0x99, 0x84, 0x0c, 0xfa, 0x9b, 0xbc, 0x9b, + 0x6c, 0x92, 0xba, 0x91, 0xb1, 0x93, 0xc4, 0x95, 0x91, 0x99, 0xa8, 0x9b, 0x26, 0x95, 0x96, 0x96, 0xf2, 0x96, 0x1b, 0x9c, 0x6f, 0x10, 0xd0, 0x99, 0x38, 0x93, 0x2d, 0x9b, 0x12, 0x98, 0xe5, 0x99, + 0x79, 0x97, 0xda, 0x9a, 0xcc, 0x99, 0x8a, 0x9b, 0x9d, 0x9a, 0xc3, 0x97, 0x88, 0x98, 0x14, 0x98, 0xa6, 0x93, 0x3f, 0x9b, 0xd2, 0x95, 0xd1, 0x99, 0x28, 0x9c, 0x20, 0x98, 0xcc, 0x98, 0x97, 0x9c, + 0x8b, 0x9c, 0x71, 0x9b, 0x0d, 0x95, 0x94, 0x98, 0x8c, 0x97, 0x18, 0x9a, 0x28, 0x9e, 0x4a, 0x99, 0x59, 0x9a, 0xf3, 0x98, 0x88, 0x98, 0x0c, 0x90, 0x24, 0x9d, 0x17, 0x95, 0x1c, 0x96, 0x52, 0x98, + 0xac, 0x98, 0x24, 0x9c, 0xb0, 0x9b, 0xe2, 0x9b, 0x4e, 0x9c, 0x81, 0x98, 0xfa, 0x9c, 0x7e, 0x98, 0x60, 0x96, 0xd8, 0x97, 0x88, 0x98, 0x3e, 0x99, 0xa4, 0x99, 0xd2, 0x96, 0xcb, 0x9a, 0xf0, 0x99, + 0x2a, 0x99, 0x26, 0x99, 0x36, 0x99, 0xa6, 0x9c, 0xfd, 0x99, 0x7d, 0x9a, 0x48, 0x98, 0x7e, 0x98, 0xdc, 0x9a, 0xb9, 0x98, 0x6f, 0x9b, 0x16, 0x99, 0xdc, 0x98, 0x22, 0x99, 0xef, 0x99, 0xe3, 0x99, + 0x60, 0x99, 0x5f, 0x99, 0x18, 0x9a, 0xbd, 0x9a, 0xee, 0x9a, 0x99, 0x98, 0x84, 0x99, 0xd6, 0x98, 0x02, 0x99, 0x20, 0x9a, 0x17, 0x9a, 0x89, 0x99, 0x77, 0x9b, 0xe8, 0x99, 0xcb, 0x98, 0xb3, 0x9b, + 0xac, 0x9a, 0x56, 0x97, 0xd7, 0x9a, 0xde, 0x9a, 0xfa, 0x9a, 0x13, 0x9c, 0x46, 0x99, 0x77, 0x97, 0x77, 
0x9a, 0xd6, 0x9b, 0x6d, 0x9a, 0x5a, 0x99, 0x32, 0x9b, 0x14, 0x9a, 0x2e, 0x9a, 0xd4, 0x99, + 0xff, 0x9b, 0x58, 0x99, 0x5e, 0x9a, 0x84, 0x98, 0x95, 0x9a, 0x97, 0x9a, 0x45, 0x99, 0x73, 0x9b, 0xee, 0x97, 0x98, 0x9a, 0x69, 0x99, 0x1e, 0x9b, 0x47, 0x99, 0x97, 0x97, 0x57, 0x99, 0x86, 0x9b, + 0x74, 0x9b, 0xcd, 0x99, 0x7d, 0x96, 0xbe, 0x98, 0x7e, 0x99, 0x88, 0x9a, 0xf4, 0x9a, 0xd8, 0x9a, 0xa5, 0x9a, 0x01, 0x9a, 0x64, 0x98, 0x25, 0x99, 0xd2, 0x9a, 0x4a, 0x9b, 0xa9, 0x9a, 0xdb, 0x9a, + 0xde, 0x9b, 0xd4, 0x9a, 0x91, 0x97, 0x11, 0x9c, 0x4a, 0x9a, 0x02, 0x99, 0x16, 0x9b, 0x3c, 0x99, 0x5b, 0x99, 0x56, 0x9b, 0x60, 0x9a, 0x17, 0x99, 0x08, 0x9a, 0x21, 0x99, 0x7b, 0x9b, 0xf8, 0x9a, + 0xe6, 0x98, 0x7e, 0x9a, 0x6a, 0x98, 0x8a, 0x9a, 0x4c, 0x9c, 0x3f, 0x9a, 0x9e, 0x99, 0x1b, 0x9b, 0x3c, 0x98, 0x34, 0x9b, 0x20, 0x99, 0xff, 0x98, 0xee, 0x9a, 0xb0, 0x9c, 0xcd, 0x9a, 0x6e, 0x99, + 0xa1, 0x98, 0x42, 0x99, 0x87, 0x9b, 0x0f, 0x97, 0x0a, 0x9b, 0x6d, 0x99, 0xc8, 0x99, 0x88, 0x9a, 0x87, 0x9a, 0x36, 0x9b, 0x12, 0x9b, 0x41, 0x9a, 0xa8, 0x9c, 0xb7, 0x9a, 0x10, 0x94, 0x69, 0x99, + 0x18, 0x95, 0x08, 0x96, 0x15, 0x92, 0xfe, 0x98, 0x5a, 0x95, 0xb9, 0x94, 0xf8, 0x8a, 0x2c, 0x89, 0x14, 0x96, 0x67, 0x94, 0x54, 0x94, 0x14, 0x90, 0xef, 0x91, 0xe5, 0x96, 0x02, 0x98, 0xbc, 0x98, + 0xd1, 0x8d, 0x18, 0x92, 0xb9, 0x96, 0xae, 0x97, 0x08, 0x97, 0x3b, 0x95, 0xc4, 0x92, 0xe8, 0x8b, 0x55, 0x94, 0x8c, 0x96, 0x81, 0x98, 0x21, 0x90, 0x70, 0x98, 0xb6, 0x91, 0x0a, 0x0c, 0x80, 0x96, + 0xfd, 0x96, 0xd8, 0x8f, 0xba, 0x94, 0x98, 0x96, 0xa8, 0x98, 0x7a, 0x94, 0xe2, 0x92, 0x92, 0x07, 0xc1, 0x94, 0x36, 0x94, 0xc8, 0x94, 0xfc, 0x92, 0xb6, 0x98, 0x7c, 0x96, 0x2c, 0x91, 0xf6, 0x90, + 0x39, 0x99, 0x2d, 0x95, 0x5a, 0x94, 0xb8, 0x05, 0xe6, 0x90, 0xff, 0x98, 0x08, 0x91, 0x6c, 0x95, 0xec, 0x90, 0x12, 0x96, 0xbd, 0x94, 0xec, 0x96, 0x41, 0x95, 0xba, 0x04, 0x30, 0x96, 0xa6, 0x98, + 0x26, 0x92, 0xfd, 0x92, 0x62, 0x90, 0x4e, 0x93, 0x93, 0x96, 0x0c, 0x98, 0xbc, 0x94, 0x72, 0x91, 0x7f, 0x94, 0x00, 0x97, 0xdf, 0x91, 0xd5, 0x95, 0x00, 0x93, 0xe1, 
0x97, 0xcd, 0x97, 0xce, 0x97, + 0x11, 0x94, 0x5d, 0x94, 0xb3, 0x95, 0x38, 0x98, 0x5d, 0x93, 0xcf, 0x95, 0xa7, 0x91, 0xeb, 0x93, 0x68, 0x90, 0x9c, 0x96, 0x4e, 0x95, 0xbb, 0x94, 0xd8, 0x94, 0x13, 0x8f, 0x5b, 0x97, 0xca, 0x96, + 0x00, 0x94, 0x53, 0x97, 0xae, 0x92, 0xd1, 0x96, 0x10, 0x96, 0xc8, 0x96, 0x0b, 0x96, 0xa6, 0x95, 0x11, 0x94, 0xd4, 0x98, 0xd8, 0x90, 0x6a, 0x09, 0x89, 0x97, 0x14, 0x97, 0xa8, 0x94, 0x70, 0x94, + 0x9c, 0x90, 0x8d, 0x94, 0xa2, 0x95, 0xf1, 0x95, 0xc8, 0x97, 0xa0, 0x91, 0x88, 0x95, 0x5a, 0x95, 0x2f, 0x94, 0xf4, 0x96, 0xf0, 0x92, 0x4a, 0x95, 0x73, 0x98, 0x97, 0x97, 0x46, 0x8d, 0x84, 0x90, + 0xde, 0xac, 0xdc, 0xab, 0xef, 0xab, 0x2b, 0xac, 0x15, 0xad, 0xaa, 0xab, 0x55, 0xab, 0xee, 0xad, 0x29, 0xac, 0xe1, 0xab, 0x30, 0xad, 0xef, 0xac, 0x92, 0xa8, 0xca, 0xac, 0x1f, 0xb0, 0x6e, 0xad, + 0x71, 0xad, 0x05, 0xac, 0xb4, 0xaf, 0xba, 0xa9, 0xb8, 0xa9, 0xf9, 0xaa, 0x1d, 0xaa, 0xa6, 0xad, 0xdf, 0xac, 0x05, 0xaa, 0xe8, 0xac, 0xd1, 0xad, 0xd0, 0xac, 0xec, 0xad, 0x26, 0xad, 0xda, 0xaf, + 0xe9, 0xa9, 0xde, 0xaa, 0xcc, 0xa5, 0x86, 0xaa, 0x43, 0xad, 0xf6, 0xad, 0x16, 0xaf, 0xbc, 0xa8, 0x74, 0xaf, 0xc1, 0xac, 0x64, 0xae, 0x5a, 0xab, 0x7b, 0xad, 0x23, 0xad, 0x84, 0xac, 0x3c, 0xad, + 0x3d, 0xb0, 0x6a, 0xad, 0xc1, 0xac, 0x8d, 0xac, 0x01, 0xad, 0x9f, 0xad, 0xe3, 0xab, 0x75, 0xaf, 0x6e, 0xaa, 0x7e, 0xae, 0xd4, 0xae, 0x3b, 0xac, 0xe6, 0xac, 0x67, 0xa7, 0x16, 0xae, 0xad, 0xad, + 0xea, 0xab, 0x65, 0xa9, 0x7b, 0xa8, 0x3c, 0xaa, 0x2f, 0xac, 0x9f, 0xad, 0x3f, 0xab, 0x0b, 0xad, 0x3c, 0xac, 0x42, 0xae, 0x36, 0x9c, 0x85, 0xac, 0x1a, 0xab, 0xcb, 0xad, 0x49, 0xaa, 0x9f, 0xac, + 0x52, 0xad, 0xef, 0xae, 0x9c, 0xab, 0x46, 0xae, 0xc7, 0xae, 0x02, 0xaa, 0x10, 0xae, 0x26, 0xac, 0xd4, 0xaa, 0x78, 0xae, 0x8f, 0xaa, 0x02, 0xad, 0x5b, 0xaf, 0x27, 0xad, 0x5c, 0xac, 0xa0, 0xaf, + 0xb2, 0xaf, 0xce, 0xad, 0xa0, 0xa9, 0xd7, 0xab, 0xd2, 0xac, 0xe8, 0xac, 0xc8, 0xb0, 0x51, 0xad, 0x3d, 0xad, 0x7a, 0xaa, 0x27, 0xad, 0xc6, 0xab, 0x0a, 0xb0, 0xb9, 0xab, 0xde, 0xab, 0x37, 0xac, + 0x08, 0xad, 0x0a, 0xaf, 
0x5d, 0xaf, 0x1d, 0xad, 0xc5, 0xae, 0x1d, 0xad, 0x10, 0xb0, 0x8a, 0xac, 0x1a, 0xac, 0x7c, 0xab, 0xbf, 0xad, 0xf8, 0xac, 0x4c, 0xad, 0x8f, 0xa9, 0x9f, 0xad, 0x70, 0xae, + 0x1e, 0xa5, 0xc1, 0xa7, 0x72, 0xa7, 0x00, 0xac, 0xac, 0x9b, 0x3e, 0xa4, 0xf0, 0xa1, 0x0b, 0xa5, 0xf4, 0xac, 0x10, 0xa4, 0x2a, 0xa1, 0x14, 0xa9, 0xd8, 0xa2, 0xc7, 0xab, 0xe4, 0xa5, 0xee, 0xa8, + 0x2c, 0xa7, 0xfc, 0xa2, 0xe4, 0xa9, 0x18, 0xad, 0x56, 0xac, 0xaa, 0xa8, 0xe0, 0xa5, 0x46, 0x24, 0xf3, 0xa9, 0x92, 0xa8, 0x86, 0xa9, 0x4c, 0xa4, 0x2c, 0xac, 0x90, 0xa0, 0x51, 0x26, 0x33, 0xa9, + 0x3e, 0xaa, 0x00, 0x8d, 0x74, 0xaa, 0x42, 0xac, 0x50, 0xa7, 0x43, 0xaa, 0xd5, 0xa4, 0xfe, 0x9c, 0xe8, 0x1d, 0x48, 0xaa, 0x34, 0xa8, 0x36, 0xaa, 0xcd, 0xaa, 0x82, 0x9f, 0xde, 0x9d, 0x1f, 0xa9, + 0x60, 0xab, 0x6c, 0xa4, 0xef, 0xa9, 0x14, 0x97, 0x8e, 0xa5, 0x6c, 0xa8, 0x0c, 0xaa, 0xca, 0xa9, 0x24, 0x9c, 0x70, 0xa9, 0x6f, 0xa8, 0x28, 0xa9, 0xd4, 0xa6, 0xe0, 0x99, 0xdb, 0xa9, 0x20, 0xac, + 0xa4, 0xa7, 0x1c, 0xa9, 0x2d, 0xa5, 0x27, 0xa3, 0x9e, 0xa4, 0x77, 0xa8, 0xbb, 0xa8, 0x61, 0xa8, 0x85, 0xa9, 0x2a, 0xa0, 0xc1, 0xa6, 0x56, 0xaa, 0x47, 0xaa, 0x6a, 0xab, 0x84, 0xa6, 0xb6, 0xa7, + 0xb2, 0xa9, 0x96, 0xa8, 0x60, 0xa9, 0xcf, 0xa8, 0x5a, 0xa9, 0xc6, 0xac, 0xf8, 0xa5, 0x9a, 0xa0, 0x48, 0xa9, 0x98, 0xa7, 0x9f, 0xa8, 0xc0, 0x14, 0xc1, 0xa7, 0xd6, 0x9f, 0x80, 0x81, 0x2b, 0xa5, + 0x62, 0xa1, 0xcd, 0xa6, 0xb2, 0xa8, 0x0a, 0xaa, 0x7f, 0xa8, 0x8b, 0xa5, 0x01, 0x99, 0x42, 0xac, 0x72, 0x23, 0x5c, 0xa2, 0xd0, 0xa5, 0x82, 0xa4, 0xeb, 0xa1, 0x1a, 0xad, 0x86, 0xa9, 0xb0, 0xa1, + 0x42, 0xa9, 0xe4, 0xa3, 0x82, 0xa4, 0xd4, 0xa5, 0x02, 0xab, 0x78, 0xa9, 0xf3, 0xa5, 0x61, 0xab, 0xa0, 0xa6, 0x13, 0xac, 0xc3, 0xa8, 0x96, 0xa9, 0xfc, 0xa9, 0x90, 0xa9, 0x55, 0x9a, 0xba, 0xa7, + 0xac, 0x24, 0x7f, 0x1f, 0x60, 0x26, 0xe9, 0x28, 0x78, 0x23, 0x2b, 0x24, 0x42, 0x25, 0x50, 0x25, 0x9b, 0x26, 0x94, 0x20, 0xed, 0x20, 0xc8, 0x25, 0x5c, 0x1c, 0x12, 0x24, 0x13, 0x27, 0xb3, 0x25, + 0xed, 0x28, 0xf4, 0x21, 0x83, 0x26, 0x0c, 0x21, 0xa0, 0x27, 0x3b, 0x23, 0x00, 0x20, 
0x13, 0x24, 0x3e, 0x28, 0x0e, 0x25, 0x46, 0x23, 0x6f, 0x27, 0x42, 0x24, 0xe9, 0x26, 0x3d, 0x23, 0xb1, 0x28, + 0x26, 0x25, 0xaa, 0x1f, 0x7a, 0x26, 0x55, 0x22, 0xa1, 0x21, 0x34, 0x28, 0xe8, 0x25, 0x66, 0x1c, 0x9c, 0x23, 0x79, 0x28, 0x50, 0x28, 0x7a, 0x25, 0x77, 0x28, 0xe6, 0x24, 0x1e, 0x22, 0x80, 0x26, + 0xf2, 0x29, 0xc6, 0x1e, 0xbc, 0x25, 0x7c, 0x26, 0x55, 0x25, 0x17, 0x22, 0x94, 0x25, 0x69, 0x27, 0x7c, 0x9e, 0x26, 0x29, 0x2a, 0x25, 0xba, 0x25, 0x3e, 0x20, 0x1e, 0x20, 0xd8, 0x26, 0xc6, 0x27, + 0xd7, 0x26, 0xf2, 0x24, 0xae, 0x95, 0x5d, 0x25, 0x74, 0x24, 0x1a, 0x24, 0x56, 0x26, 0xf0, 0x28, 0x84, 0x24, 0xcf, 0x22, 0x2a, 0x20, 0xbe, 0x26, 0xae, 0x25, 0x16, 0x24, 0x18, 0x25, 0x58, 0x20, + 0x16, 0x25, 0x96, 0x28, 0x12, 0x24, 0x05, 0x28, 0x76, 0x26, 0x49, 0x26, 0x2a, 0x28, 0xa7, 0x16, 0xc1, 0x26, 0x02, 0x24, 0x77, 0x26, 0x6f, 0x21, 0xf4, 0x27, 0x5e, 0x24, 0xf1, 0x21, 0xf7, 0x25, + 0x0b, 0x26, 0x5f, 0x24, 0x71, 0x20, 0xf9, 0x1f, 0x47, 0x28, 0x0b, 0x27, 0x4d, 0x24, 0x1d, 0x26, 0x85, 0x22, 0xde, 0x20, 0x30, 0x25, 0x12, 0x25, 0x06, 0x28, 0x4b, 0x28, 0xdc, 0x27, 0x03, 0x24, + 0x84, 0x27, 0xae, 0x24, 0xee, 0x27, 0xee, 0x23, 0x6c, 0x28, 0x28, 0x28, 0x5e, 0x28, 0xd4, 0x28, 0x79, 0x24, 0x8e, 0x25, 0x33, 0x29, 0x7d, 0x29, 0xfa, 0x27, 0x6e, 0x26, 0x30, 0x12, 0x0c, 0x25, + 0xf7, 0x21, 0x50, 0x09, 0x42, 0xa1, 0xda, 0x20, 0xc9, 0x1c, 0x13, 0x9e, 0xc4, 0x18, 0x84, 0x21, 0x87, 0x1d, 0xb7, 0x19, 0xf6, 0x9f, 0x4b, 0xa0, 0xba, 0x18, 0xac, 0x26, 0xce, 0x22, 0x79, 0x24, + 0x9e, 0x1c, 0x78, 0x16, 0x72, 0x21, 0xb8, 0x21, 0x60, 0x22, 0x2a, 0x1e, 0x7c, 0x9f, 0x04, 0x9d, 0x8e, 0x22, 0x04, 0x24, 0xbc, 0x21, 0x44, 0x1f, 0x2a, 0x25, 0x10, 0x18, 0x42, 0xa2, 0xb0, 0x9c, + 0xc5, 0x21, 0x21, 0x9b, 0x44, 0x15, 0x5b, 0x20, 0x1a, 0x26, 0xf2, 0x1c, 0xe6, 0x21, 0x32, 0xa0, 0x20, 0x94, 0x84, 0x94, 0x5c, 0x1f, 0x66, 0x21, 0xa4, 0x24, 0x60, 0x23, 0xa8, 0x9d, 0x46, 0x18, + 0xdc, 0x26, 0x57, 0x20, 0x1c, 0x21, 0x50, 0xa0, 0x80, 0x89, 0x20, 0x25, 0x52, 0x23, 0xa6, 0x1d, 0x84, 0x11, 0x96, 0x1f, 0x2e, 0x26, 0x0b, 0x1c, 
0xad, 0x23, 0xf2, 0x99, 0x9a, 0x23, 0xbc, 0x24, + 0x38, 0x10, 0x00, 0x9c, 0xd6, 0x22, 0x00, 0x11, 0x66, 0x21, 0x28, 0x20, 0x63, 0x21, 0x8c, 0x24, 0x30, 0x21, 0x32, 0x1e, 0x38, 0xa0, 0xfc, 0x21, 0xc0, 0xa0, 0xf7, 0x25, 0x70, 0x9e, 0xca, 0x23, + 0x96, 0x1e, 0x80, 0x14, 0xba, 0x24, 0xbf, 0x22, 0x0d, 0x24, 0xb8, 0x21, 0x36, 0x1c, 0xc3, 0x15, 0xc4, 0x17, 0xfa, 0x9e, 0x1a, 0x96, 0x69, 0xa1, 0xc5, 0x22, 0x68, 0x92, 0x3a, 0x9d, 0x51, 0x25, + 0xff, 0x24, 0xa5, 0x1d, 0xad, 0x1d, 0xe6, 0x21, 0x1a, 0x9b, 0x80, 0x88, 0xa8, 0x21, 0x34, 0x22, 0x10, 0x1b, 0x9a, 0x22, 0xd4, 0x15, 0xb6, 0x98, 0x92, 0x9e, 0x6d, 0x1f, 0x82, 0x24, 0x51, 0x1d, + 0x18, 0x1c, 0x41, 0x24, 0x70, 0x9a, 0xc3, 0x20, 0x46, 0x24, 0x06, 0x26, 0xd6, 0x23, 0x3b, 0x20, 0x10, 0x12, 0x54, 0x24, 0x71, 0x22, 0xa4, 0x19, 0x49, 0x1c, 0xf6, 0x22, 0x6c, 0x24, 0x7c, 0x17, + 0x71, 0x99, 0x58, 0x86, 0x9e, 0x98, 0x3e, 0x9c, 0xb4, 0x98, 0x85, 0x96, 0xc0, 0x99, 0x22, 0x9a, 0x63, 0x96, 0xcb, 0x93, 0xe2, 0x91, 0xc0, 0x96, 0x0f, 0x8d, 0xd0, 0x96, 0x24, 0x9c, 0x00, 0x9a, + 0x2f, 0x9d, 0x9c, 0x95, 0xc3, 0x99, 0x0d, 0x11, 0xce, 0x99, 0xef, 0x94, 0xb8, 0x82, 0x86, 0x99, 0x2a, 0x9c, 0x3c, 0x99, 0x21, 0x95, 0x40, 0x9c, 0x73, 0x95, 0xe4, 0x9b, 0xc6, 0x98, 0x02, 0x9c, + 0x02, 0x98, 0xb0, 0x93, 0x77, 0x98, 0x40, 0x84, 0xbd, 0x97, 0x3c, 0x9b, 0xfa, 0x9a, 0xb8, 0x84, 0xdc, 0x98, 0x46, 0x9b, 0x6c, 0x9c, 0x58, 0x98, 0x7a, 0x9c, 0xda, 0x9a, 0x6d, 0x95, 0x48, 0x99, + 0x8a, 0x9e, 0xe3, 0x92, 0xbe, 0x98, 0xcd, 0x9a, 0xe6, 0x98, 0xb1, 0x96, 0xef, 0x98, 0x5f, 0x9a, 0x4e, 0x14, 0x1b, 0x9d, 0x4e, 0x9a, 0x87, 0x98, 0x86, 0x94, 0xea, 0x93, 0x9d, 0x9a, 0xd3, 0x9a, + 0x3a, 0x9a, 0x9c, 0x95, 0x02, 0x0b, 0x8c, 0x99, 0x32, 0x99, 0xc0, 0x96, 0x2a, 0x9a, 0xaa, 0x9d, 0x06, 0x97, 0x26, 0x98, 0xc0, 0x04, 0xe9, 0x99, 0x08, 0x95, 0xac, 0x96, 0x78, 0x97, 0x38, 0x94, + 0x71, 0x97, 0x56, 0x9c, 0xd0, 0x97, 0x37, 0x9c, 0x85, 0x9a, 0x31, 0x97, 0x6a, 0x9c, 0xf2, 0x80, 0x7c, 0x99, 0x2e, 0x94, 0x24, 0x99, 0x83, 0x94, 0x6b, 0x9c, 0xa2, 0x98, 0x14, 0x96, 0xe8, 0x9b, + 0x44, 
0x9c, 0xfe, 0x97, 0xd3, 0x8d, 0xa1, 0x89, 0x82, 0x9b, 0xf6, 0x9a, 0x20, 0x9a, 0x0a, 0x98, 0x42, 0x99, 0xc9, 0x96, 0xd2, 0x98, 0xa6, 0x98, 0x1a, 0x9c, 0x9a, 0x99, 0x25, 0x9c, 0x97, 0x98, + 0xa0, 0x9a, 0x43, 0x9a, 0xfe, 0x9b, 0x28, 0x98, 0x4a, 0x9c, 0xbc, 0x9c, 0x3b, 0x9d, 0x40, 0x9c, 0x5c, 0x97, 0x0c, 0x98, 0x9e, 0x9d, 0x3d, 0x9d, 0xd0, 0x9a, 0x24, 0x9a, 0x28, 0x93, 0x2e, 0x98, + 0x16, 0x98, 0x12, 0x11, 0x2f, 0x15, 0x26, 0x95, 0x5a, 0x95, 0x26, 0x11, 0x0d, 0x94, 0x17, 0x98, 0xdb, 0x0f, 0x84, 0x8d, 0x30, 0x14, 0xf0, 0x15, 0x75, 0x87, 0x7e, 0x99, 0x58, 0x99, 0x05, 0x99, + 0xd4, 0x96, 0xf6, 0x8d, 0x76, 0x95, 0xee, 0x0f, 0x82, 0x94, 0x77, 0x8e, 0x06, 0x16, 0x0c, 0x91, 0x29, 0x98, 0x68, 0x98, 0xcd, 0x93, 0x02, 0x98, 0xe6, 0x96, 0x89, 0x95, 0x04, 0x0d, 0xd8, 0x0a, + 0x64, 0x94, 0x32, 0x09, 0x46, 0x0b, 0x48, 0x0d, 0x80, 0x9a, 0x02, 0x92, 0x9e, 0x98, 0xd2, 0x14, 0x6e, 0x92, 0x90, 0x03, 0xd6, 0x96, 0x3c, 0x94, 0x86, 0x99, 0xe2, 0x99, 0xe9, 0x0e, 0x16, 0x8c, + 0x87, 0x9c, 0x35, 0x94, 0x66, 0x94, 0xa0, 0x03, 0xcc, 0x8c, 0x02, 0x99, 0xcd, 0x96, 0x4c, 0x92, 0xbe, 0x0d, 0xfd, 0x96, 0x1e, 0x9b, 0x0b, 0x8e, 0x46, 0x97, 0x38, 0x08, 0x2e, 0x98, 0x64, 0x98, + 0xdc, 0x8e, 0xc7, 0x13, 0x2a, 0x95, 0x86, 0x91, 0x95, 0x97, 0x05, 0x93, 0x7e, 0x96, 0x12, 0x9b, 0xe4, 0x93, 0x54, 0x95, 0x00, 0x17, 0xd2, 0x95, 0xda, 0x17, 0xd6, 0x98, 0x84, 0x12, 0xfa, 0x96, + 0xc6, 0x8f, 0x3e, 0x92, 0x70, 0x98, 0xb3, 0x98, 0x98, 0x98, 0xd9, 0x8e, 0x04, 0x96, 0x7a, 0x00, 0x84, 0x8b, 0x06, 0x15, 0x64, 0x09, 0x29, 0x14, 0x2d, 0x99, 0xdc, 0x8f, 0x1c, 0x0a, 0x63, 0x9b, + 0xb2, 0x9b, 0x83, 0x92, 0xd0, 0x82, 0x76, 0x90, 0x50, 0x01, 0x32, 0x91, 0xf2, 0x98, 0x83, 0x92, 0xb6, 0x96, 0x16, 0x98, 0xb6, 0x8f, 0xd0, 0x83, 0x58, 0x8b, 0xf8, 0x86, 0x9b, 0x99, 0xe3, 0x94, + 0xbb, 0x91, 0xec, 0x99, 0x04, 0x90, 0xc7, 0x95, 0xed, 0x98, 0xa4, 0x9b, 0x9c, 0x9a, 0xee, 0x94, 0x08, 0x88, 0x24, 0x96, 0x7e, 0x99, 0x84, 0x94, 0x27, 0x91, 0x90, 0x97, 0x2c, 0x99, 0x26, 0x8c, + 0x26, 0x93, 0x5d, 0x92, 0x56, 0x96, 0xd6, 0x98, 0x84, 0x90, 0xde, 
0x93, 0xa6, 0x93, 0xf6, 0x93, 0x7c, 0x98, 0xa1, 0x90, 0x21, 0x91, 0x9b, 0x96, 0xcb, 0x8d, 0x40, 0x95, 0x22, 0x95, 0x27, 0x95, + 0x96, 0x97, 0x45, 0x91, 0x81, 0x96, 0x5f, 0x96, 0x48, 0x98, 0x59, 0x94, 0x48, 0x92, 0x38, 0x8e, 0x9c, 0x97, 0xaa, 0x94, 0x8e, 0x94, 0x3e, 0x95, 0xf9, 0x95, 0xa7, 0x94, 0x72, 0x8c, 0x52, 0x98, + 0xd7, 0x95, 0x64, 0x8d, 0x43, 0x97, 0x01, 0x96, 0xd6, 0x90, 0x11, 0x98, 0x3e, 0x94, 0xa6, 0x8d, 0xc3, 0x8f, 0x63, 0x98, 0x0c, 0x97, 0x0f, 0x96, 0x07, 0x98, 0x67, 0x91, 0x04, 0x91, 0x85, 0x96, + 0xd9, 0x98, 0x0b, 0x8f, 0x19, 0x96, 0x70, 0x94, 0xb0, 0x94, 0x52, 0x92, 0xd7, 0x95, 0x40, 0x97, 0xda, 0x09, 0x59, 0x98, 0x3a, 0x94, 0xfc, 0x95, 0xa7, 0x90, 0x52, 0x8e, 0x79, 0x96, 0x10, 0x98, + 0x1d, 0x96, 0xe3, 0x95, 0xba, 0x87, 0x24, 0x94, 0xba, 0x92, 0x6d, 0x94, 0xc9, 0x95, 0x59, 0x97, 0x22, 0x95, 0x93, 0x90, 0xf2, 0x92, 0xd3, 0x96, 0x3a, 0x97, 0x36, 0x95, 0x1e, 0x95, 0x28, 0x91, + 0xc2, 0x95, 0xe3, 0x97, 0x5c, 0x94, 0xb6, 0x96, 0xeb, 0x95, 0x25, 0x98, 0x61, 0x96, 0x53, 0x8a, 0xc6, 0x96, 0xc7, 0x94, 0x6b, 0x96, 0x2f, 0x90, 0x2a, 0x96, 0x5d, 0x92, 0x17, 0x90, 0xca, 0x93, + 0xb0, 0x92, 0x27, 0x94, 0x1a, 0x93, 0xa6, 0x93, 0xb4, 0x97, 0xb7, 0x95, 0x5d, 0x90, 0x98, 0x97, 0x4c, 0x8a, 0x97, 0x8e, 0x97, 0x94, 0x5d, 0x94, 0xf2, 0x95, 0x1c, 0x99, 0xc2, 0x96, 0xcd, 0x91, + 0x1c, 0x97, 0x04, 0x92, 0x1f, 0x96, 0xea, 0x92, 0x18, 0x98, 0xb3, 0x96, 0x0a, 0x96, 0xa7, 0x98, 0x6a, 0x94, 0xcf, 0x96, 0x12, 0x98, 0xb6, 0x98, 0xc3, 0x97, 0x20, 0x96, 0x76, 0x08, 0xfe, 0x94, + 0x70, 0x90, 0xdf, 0x8e, 0xf8, 0x0a, 0x84, 0x93, 0xa4, 0x83, 0xe8, 0x08, 0x94, 0x81, 0xc0, 0x8f, 0xca, 0x94, 0x38, 0x8c, 0xfd, 0x0b, 0x33, 0x84, 0x33, 0x8c, 0x56, 0x97, 0x65, 0x90, 0x2c, 0x94, + 0x16, 0x89, 0xa0, 0x88, 0xf2, 0x92, 0xa5, 0x96, 0x03, 0x95, 0x60, 0x91, 0xb3, 0x80, 0x6a, 0x11, 0xe4, 0x92, 0xad, 0x93, 0xe1, 0x93, 0xfe, 0x89, 0xb3, 0x96, 0x80, 0x07, 0x49, 0x14, 0x14, 0x83, + 0x07, 0x94, 0x2f, 0x0b, 0xaf, 0x90, 0x37, 0x95, 0x0c, 0x95, 0xce, 0x90, 0x83, 0x8f, 0x96, 0x0c, 0x3d, 0x0d, 0x8e, 0x8d, 0x7c, 
0x8e, 0xaa, 0x93, 0xa1, 0x94, 0xee, 0x8e, 0x46, 0x0c, 0x02, 0x8f, + 0xab, 0x95, 0x4a, 0x90, 0x19, 0x93, 0xf3, 0x10, 0x36, 0x86, 0xd3, 0x94, 0x4f, 0x94, 0xe9, 0x90, 0xa0, 0x89, 0xe9, 0x8f, 0x0a, 0x95, 0x6c, 0x90, 0x67, 0x93, 0x4c, 0x09, 0x09, 0x94, 0xaf, 0x95, + 0xfd, 0x89, 0x8e, 0x8c, 0x01, 0x93, 0x64, 0x00, 0xd8, 0x8f, 0x9d, 0x91, 0xbe, 0x91, 0x2d, 0x92, 0x30, 0x93, 0x6d, 0x8a, 0xe6, 0x81, 0xa6, 0x93, 0x48, 0x8a, 0xb6, 0x96, 0xf5, 0x03, 0xea, 0x93, + 0x05, 0x92, 0x3e, 0x8a, 0xe4, 0x94, 0xf6, 0x91, 0xfc, 0x93, 0x8d, 0x95, 0x76, 0x88, 0xf9, 0x89, 0x06, 0x8f, 0xd4, 0x82, 0x89, 0x8b, 0x7d, 0x10, 0x03, 0x91, 0x8a, 0x07, 0x82, 0x0d, 0xc0, 0x92, + 0x04, 0x91, 0x25, 0x8f, 0xd2, 0x91, 0x97, 0x94, 0x84, 0x82, 0xc6, 0x80, 0x04, 0x8c, 0x2e, 0x95, 0x75, 0x0c, 0xae, 0x90, 0xd9, 0x89, 0xa6, 0x02, 0x3a, 0x0f, 0xd6, 0x94, 0x14, 0x94, 0x07, 0x8a, + 0xaa, 0x8f, 0x56, 0x91, 0xf0, 0x0a, 0x62, 0x90, 0x88, 0x94, 0xde, 0x94, 0x62, 0x90, 0x8c, 0x92, 0xe3, 0x8b, 0xd5, 0x95, 0xc9, 0x90, 0xd2, 0x8c, 0x7a, 0x90, 0x8a, 0x93, 0xa6, 0x91, 0x53, 0x8d, + 0x0f, 0xaa, 0xe2, 0x22, 0xec, 0x9b, 0xc8, 0xa9, 0x12, 0xa9, 0xdc, 0x9e, 0x19, 0xa9, 0x8f, 0xaa, 0x5e, 0x1f, 0x64, 0xa1, 0x17, 0x1e, 0xe0, 0x1f, 0xc8, 0x95, 0x50, 0xa8, 0x6c, 0xac, 0x84, 0xaa, + 0x66, 0xac, 0x1e, 0xa4, 0x80, 0xa8, 0x22, 0x28, 0xfe, 0xa6, 0x8d, 0xa0, 0x96, 0x25, 0x4f, 0xa9, 0x4a, 0xab, 0xa3, 0xa9, 0x2d, 0xa3, 0x21, 0xac, 0x83, 0xa4, 0x36, 0xab, 0x6a, 0xa7, 0x0a, 0xa8, + 0x99, 0xa5, 0x74, 0xa0, 0xd4, 0xa0, 0x00, 0x25, 0x61, 0xaa, 0x94, 0xa8, 0x8c, 0xab, 0xfe, 0x22, 0xbb, 0xa8, 0xee, 0xa6, 0x92, 0xab, 0xfc, 0xa5, 0x3d, 0xac, 0x5c, 0xac, 0x78, 0xa0, 0x96, 0xa5, + 0xdd, 0xae, 0x46, 0xa4, 0xcc, 0xa6, 0xa6, 0xa8, 0x59, 0xa6, 0xc5, 0xa8, 0x59, 0xa8, 0x24, 0xa8, 0x4c, 0x24, 0x24, 0xac, 0x2a, 0xac, 0xdb, 0xa4, 0x96, 0xa6, 0x75, 0xa0, 0x20, 0xaa, 0xd3, 0xa9, + 0x06, 0xa8, 0x60, 0x1d, 0xe2, 0x9e, 0x6c, 0xa8, 0xbe, 0xa9, 0xfc, 0xa4, 0x6f, 0xa9, 0xed, 0xad, 0xf4, 0xa4, 0x88, 0xa8, 0xd2, 0x26, 0x9f, 0xa8, 0x1f, 0x25, 0xca, 0xa7, 0x50, 0x9d, 0xf2, 
0xa5, + 0x82, 0xa3, 0x0d, 0xaa, 0x86, 0xa8, 0x04, 0xac, 0x7b, 0xaa, 0xf3, 0x9d, 0x93, 0xab, 0x70, 0x18, 0xbc, 0xa5, 0xfd, 0x20, 0x72, 0xa4, 0x30, 0x95, 0x71, 0xac, 0x6a, 0xa7, 0x7a, 0xa3, 0x0e, 0xad, + 0x8a, 0xad, 0x36, 0xa6, 0xcc, 0x1c, 0xa4, 0x1b, 0x20, 0xa8, 0x1e, 0xa9, 0xb6, 0xab, 0x6e, 0xa3, 0x86, 0xaa, 0xdd, 0xa8, 0xb0, 0xa6, 0x94, 0xa5, 0xba, 0xa9, 0x2a, 0xa2, 0x32, 0xac, 0x97, 0xa8, + 0x5a, 0xa8, 0x0a, 0xac, 0xc3, 0xa9, 0x3a, 0xa8, 0xbf, 0xab, 0x55, 0xad, 0xa2, 0xad, 0xdd, 0xa9, 0xbf, 0xa3, 0xf0, 0xa5, 0x49, 0xad, 0x8f, 0xab, 0x33, 0xa8, 0x96, 0xa9, 0xcc, 0xa8, 0xae, 0xa4, + 0x38, 0xbe, 0x37, 0xc0, 0x5e, 0xbe, 0x42, 0xbf, 0x01, 0xbf, 0x28, 0xc2, 0x7f, 0xc1, 0xfc, 0xc2, 0x54, 0xc0, 0x86, 0xc1, 0x72, 0xb4, 0xb4, 0xbd, 0x1f, 0xbd, 0x99, 0xbd, 0x53, 0xbf, 0x41, 0xbd, + 0x39, 0xbf, 0x0e, 0xbd, 0x3f, 0xc2, 0x4d, 0xbf, 0x67, 0xc0, 0x0d, 0xc1, 0x13, 0xc0, 0x86, 0xc1, 0x5b, 0xc1, 0x38, 0xbd, 0x96, 0xbf, 0xd7, 0xbe, 0xad, 0xc3, 0x52, 0xbf, 0xe3, 0xbc, 0xfa, 0xc0, + 0x6d, 0xc2, 0xde, 0xc1, 0x20, 0xc1, 0xb0, 0xbe, 0x10, 0xc0, 0xf2, 0xbc, 0xb6, 0xc1, 0x88, 0xbe, 0x44, 0xbf, 0x91, 0xc0, 0x27, 0xbf, 0xc0, 0xc2, 0x16, 0xc1, 0x2f, 0xc1, 0xbc, 0xc0, 0x15, 0xbe, + 0x83, 0xc1, 0x4e, 0xbf, 0x24, 0xc2, 0x19, 0xbe, 0x83, 0xb7, 0x3b, 0xbd, 0xeb, 0xc2, 0x4f, 0xbe, 0x49, 0xbf, 0x76, 0xc2, 0x4d, 0xbe, 0xe8, 0xc1, 0xa4, 0xc4, 0xb0, 0xbc, 0x8c, 0xbc, 0xd9, 0xbf, + 0x06, 0xb9, 0x7a, 0xbe, 0x4f, 0xc0, 0x7f, 0xc0, 0x91, 0xba, 0xbe, 0xc0, 0xac, 0xc2, 0x43, 0xc0, 0x28, 0xbe, 0x2e, 0xc3, 0x22, 0xc0, 0x1f, 0xc1, 0xd4, 0x33, 0x60, 0xba, 0xb5, 0xbd, 0xfa, 0xbd, + 0x98, 0xc1, 0xa6, 0xc3, 0x4c, 0xc0, 0x10, 0xbd, 0x73, 0xc0, 0x6b, 0xbe, 0x4e, 0xc2, 0x2d, 0xbc, 0x20, 0xb9, 0xfb, 0xbf, 0x07, 0xc2, 0x52, 0xbe, 0x41, 0x32, 0x02, 0xc1, 0x16, 0xc1, 0x88, 0xbd, + 0xe0, 0xc0, 0x74, 0xbd, 0x02, 0xc1, 0xc6, 0xc1, 0x44, 0xb9, 0xdc, 0xc4, 0x58, 0xb7, 0x58, 0xc0, 0xa1, 0xbe, 0xab, 0xc2, 0xb6, 0xbe, 0x63, 0xc0, 0x1e, 0xc0, 0x87, 0xc1, 0x61, 0xc3, 0x73, 0xbe, + 0x63, 0xc3, 0x37, 0xc1, 0xaf, 0xc0, 0xff, 0xc2, 
0x58, 0xbf, 0x47, 0xb9, 0x88, 0xba, 0x4d, 0xc1, 0x9f, 0xbc, 0x28, 0xc0, 0x3c, 0xc0, 0x99, 0xb4, 0x8f, 0xb6, 0xd3, 0xbc, 0x59, 0xbc, 0x80, 0xc0, + 0xdc, 0x3d, 0x11, 0x3d, 0x60, 0x3d, 0x72, 0x3e, 0x96, 0x3b, 0x74, 0x3d, 0x90, 0x3e, 0x99, 0x3e, 0x65, 0x39, 0x68, 0x3e, 0x16, 0x38, 0x16, 0x3f, 0x4c, 0x3c, 0x5a, 0x3e, 0xf5, 0x3c, 0xe2, 0x3d, + 0xc3, 0x3f, 0xce, 0x3d, 0x40, 0x3e, 0x30, 0x3e, 0xc3, 0x3d, 0x37, 0x3f, 0x1d, 0x3d, 0xba, 0x3e, 0x84, 0x3e, 0xba, 0x3a, 0x36, 0x3c, 0x82, 0x3c, 0x78, 0x3f, 0x8d, 0x3e, 0x56, 0x3d, 0x16, 0x3f, + 0xc6, 0x3c, 0x0d, 0x3f, 0x52, 0x3c, 0xa2, 0x39, 0x4c, 0x3d, 0x00, 0x3e, 0xcc, 0x3e, 0xcb, 0x3d, 0x74, 0x3e, 0xec, 0x3c, 0x58, 0x3e, 0xe3, 0x3c, 0x0d, 0x40, 0x07, 0x3e, 0xb6, 0x3e, 0x61, 0x3c, + 0xc6, 0x3f, 0x1b, 0x3d, 0x3a, 0x3c, 0xaa, 0x3c, 0x86, 0x3c, 0xce, 0x3d, 0xb6, 0x3c, 0x69, 0x3c, 0xf7, 0x3b, 0x9e, 0x3f, 0x2a, 0x3d, 0xfc, 0x3e, 0x75, 0x3f, 0x2f, 0x3d, 0x4a, 0x3d, 0x05, 0x3c, + 0x66, 0x3c, 0xa2, 0x3c, 0x42, 0x3e, 0x2c, 0x37, 0x1a, 0x3d, 0x6e, 0x3c, 0x87, 0x3e, 0x39, 0x3b, 0xe0, 0x3c, 0xc0, 0x3d, 0xb6, 0x3c, 0x50, 0x3d, 0x08, 0x39, 0x6c, 0x3d, 0xb8, 0x3c, 0x1f, 0x3d, + 0x4b, 0x40, 0x9e, 0x3e, 0x90, 0x3e, 0xd6, 0x3d, 0x0a, 0x3f, 0xd2, 0x3b, 0x9e, 0x3d, 0x62, 0x3a, 0xae, 0x3e, 0x7e, 0x3c, 0x8c, 0x3e, 0x24, 0x3e, 0x90, 0x35, 0xa8, 0x3d, 0x54, 0x3d, 0xfc, 0x3e, + 0x87, 0x3e, 0x46, 0x3d, 0xad, 0x3e, 0x99, 0x3c, 0xa1, 0x38, 0xe2, 0x3d, 0xb2, 0x38, 0xc0, 0x3f, 0x5d, 0x3d, 0xb6, 0x3d, 0x49, 0x3e, 0x97, 0x3c, 0xc0, 0x3e, 0x94, 0x3c, 0x8c, 0x3e, 0x15, 0x3b, + 0x38, 0x40, 0x6a, 0x3e, 0xb2, 0x3d, 0x45, 0x40, 0x34, 0x40, 0xea, 0x37, 0x08, 0x3a, 0x35, 0x3d, 0x04, 0x3c, 0x6a, 0x3e, 0x9c, 0x3c, 0x9d, 0x3b, 0x22, 0x3c, 0x6e, 0x3d, 0x13, 0x39, 0x30, 0x3d, + 0x15, 0x35, 0x68, 0x38, 0xc0, 0x38, 0x08, 0xaa, 0x39, 0x3b, 0xca, 0x38, 0x88, 0x36, 0xef, 0x35, 0x28, 0x34, 0xfe, 0x39, 0xe0, 0xaf, 0x88, 0x2d, 0x7c, 0xad, 0xd6, 0x38, 0x9a, 0x3b, 0x2a, 0x38, + 0x80, 0x29, 0x4e, 0x34, 0x09, 0x38, 0x88, 0xb1, 0x60, 0x35, 0x2c, 0x38, 0x4b, 0x36, 0xcc, 0x37, 0xc0, 0x3b, 
0xc2, 0x39, 0x12, 0x39, 0x5a, 0x2d, 0xc8, 0x3b, 0xb9, 0x39, 0x4c, 0x32, 0x18, 0x36, + 0x13, 0x3c, 0x91, 0x37, 0x50, 0x3a, 0x9c, 0x39, 0x8d, 0x39, 0x25, 0x37, 0xf8, 0x2e, 0xd8, 0xae, 0xdb, 0x37, 0x92, 0x38, 0xa2, 0x39, 0x46, 0x38, 0xfc, 0xaf, 0xcc, 0x39, 0x00, 0x38, 0x94, 0x35, + 0xce, 0x3a, 0x23, 0x34, 0x9f, 0x3c, 0xf4, 0x38, 0x0c, 0xb0, 0xa4, 0x31, 0xc2, 0x38, 0xfa, 0x33, 0x0f, 0xb4, 0x53, 0x3d, 0xa6, 0x32, 0xa9, 0x3b, 0x46, 0x3a, 0xc4, 0x2d, 0x07, 0x3c, 0xbb, 0x38, + 0x5e, 0xb0, 0xec, 0x3a, 0x95, 0x38, 0x3c, 0x2e, 0x80, 0x25, 0x4a, 0x36, 0xb5, 0x35, 0xf8, 0x36, 0x76, 0x38, 0xae, 0x3d, 0x1a, 0x3b, 0x86, 0x36, 0x68, 0x29, 0x7c, 0x2f, 0xbb, 0x35, 0x45, 0x37, + 0x25, 0x38, 0x12, 0x3c, 0xcd, 0x37, 0x2c, 0x38, 0x46, 0x36, 0x7c, 0x34, 0x56, 0x39, 0x8e, 0x35, 0xa8, 0xb0, 0x86, 0x39, 0x49, 0x3b, 0xae, 0x3a, 0x8c, 0xa8, 0x78, 0x36, 0xed, 0x38, 0x8a, 0x36, + 0x2e, 0x3a, 0x40, 0x39, 0x64, 0x39, 0x86, 0x38, 0x09, 0xb5, 0x84, 0x3e, 0xfe, 0x3b, 0xb9, 0x39, 0x40, 0x25, 0x40, 0x3d, 0xe8, 0x2d, 0x9f, 0xb0, 0x82, 0x32, 0x3f, 0x38, 0x72, 0x3c, 0x3f, 0x33, + 0x00, 0x3b, 0x20, 0x35, 0x96, 0x3a, 0xc2, 0x3c, 0x31, 0x37, 0xd8, 0x39, 0xb0, 0x27, 0xd8, 0x3a, 0x09, 0x34, 0x87, 0x3d, 0x7d, 0xaf, 0x28, 0xae, 0xa0, 0x2d, 0x00, 0x36, 0x80, 0x97, 0x0c, 0x39, + 0xe2, 0xb1, 0xb6, 0xb0, 0xb4, 0xb1, 0xb9, 0xb1, 0x22, 0xb0, 0x38, 0xb0, 0x9a, 0xb1, 0xc8, 0xb0, 0x61, 0xa9, 0xf0, 0xb1, 0xd5, 0xab, 0x20, 0xb3, 0x50, 0xaf, 0x18, 0xb3, 0x85, 0xb1, 0x7e, 0xb2, + 0x76, 0xb3, 0x0e, 0xb2, 0xfa, 0xb0, 0x34, 0xb1, 0x1b, 0xb1, 0xcc, 0xb2, 0x98, 0xb0, 0xe8, 0xb1, 0x85, 0xb2, 0x99, 0xaf, 0x0b, 0xb0, 0x70, 0xaf, 0x6e, 0xb2, 0x0e, 0xb3, 0x6b, 0xb1, 0x76, 0xb2, + 0xdc, 0xaf, 0x16, 0xb2, 0x6e, 0xaf, 0x70, 0xad, 0x4e, 0xb1, 0x99, 0xb2, 0x48, 0xb1, 0x09, 0xb1, 0x97, 0xb2, 0x63, 0xb0, 0xd6, 0xb2, 0x3b, 0xae, 0xd0, 0xb2, 0xa1, 0xb1, 0x54, 0xb2, 0x31, 0xb0, + 0xbf, 0xb3, 0x98, 0xb0, 0x4c, 0xaf, 0xf6, 0xb0, 0xba, 0xb0, 0xdb, 0xb1, 0xd4, 0xad, 0x02, 0xb0, 0xdc, 0xac, 0xd2, 0xb3, 0xda, 0xb0, 0xc2, 0xb2, 0x40, 0xb1, 0x1e, 0xb1, 
0xbf, 0xb2, 0x5a, 0xaf, + 0x64, 0xb0, 0x37, 0xb1, 0x22, 0xb2, 0xa0, 0x19, 0x48, 0xb1, 0xd4, 0xae, 0xd9, 0xb0, 0x84, 0xad, 0x1a, 0xb1, 0x58, 0xb1, 0xdf, 0xb0, 0x4c, 0xb0, 0xb2, 0xae, 0xd8, 0xb1, 0xb6, 0xb0, 0x47, 0xb1, + 0x13, 0xb4, 0x7d, 0xb1, 0x5e, 0xb2, 0x7d, 0xb2, 0xb4, 0xb2, 0xdd, 0xae, 0x73, 0xb0, 0x8b, 0xae, 0x1a, 0xb3, 0x5e, 0xb0, 0x19, 0xb2, 0x04, 0xb3, 0x48, 0xab, 0xc4, 0xb0, 0xab, 0xb0, 0x90, 0xb3, + 0x78, 0xb2, 0xee, 0xb1, 0x6c, 0xb2, 0xb1, 0xae, 0x49, 0xaa, 0x7a, 0xb0, 0xda, 0xaf, 0x10, 0xb4, 0xb3, 0xb0, 0x66, 0xb1, 0xe6, 0xb1, 0xe1, 0xad, 0x3a, 0xb2, 0xcc, 0xae, 0xb4, 0xb1, 0xca, 0xad, + 0x98, 0xb3, 0x6c, 0xb1, 0xa7, 0xb1, 0x36, 0xb4, 0x6c, 0xb4, 0xae, 0xad, 0x98, 0xad, 0xca, 0xb0, 0xfb, 0xaf, 0xb1, 0xb3, 0x2c, 0xae, 0x1b, 0xb0, 0xa9, 0xb0, 0xd4, 0xb1, 0x88, 0xab, 0xd5, 0xb0, + 0x6c, 0xac, 0xd7, 0xac, 0x9b, 0xae, 0xa0, 0xa2, 0xf8, 0xaf, 0x17, 0xab, 0xa2, 0xaa, 0x8e, 0xa5, 0x6c, 0x90, 0x6c, 0xae, 0x90, 0x18, 0xec, 0xaa, 0x60, 0x17, 0xe5, 0xaf, 0x98, 0xb0, 0xe6, 0xae, + 0x90, 0xa9, 0x89, 0xac, 0x5c, 0xaa, 0xd8, 0x20, 0x47, 0xaa, 0x64, 0xad, 0x0c, 0xab, 0x2c, 0xac, 0x68, 0xb0, 0xd2, 0xae, 0x67, 0xad, 0x18, 0xa2, 0x22, 0xaf, 0x1b, 0xb0, 0x1d, 0xab, 0x05, 0xac, + 0xbd, 0xae, 0xd8, 0xab, 0x91, 0xad, 0x6b, 0xad, 0x92, 0xae, 0x74, 0xae, 0x48, 0x9e, 0x00, 0x98, 0x00, 0xae, 0x8d, 0xac, 0x04, 0xb0, 0x5a, 0xa7, 0x80, 0x18, 0x3f, 0xae, 0x20, 0xad, 0x36, 0xab, + 0x40, 0xb0, 0x56, 0xa9, 0xee, 0xaf, 0x6c, 0xae, 0x9e, 0xa5, 0x18, 0xab, 0x32, 0xa8, 0x1c, 0xa9, 0x43, 0x2a, 0xfd, 0xb1, 0x8a, 0xa9, 0x3d, 0xb0, 0x2e, 0xab, 0x06, 0xa9, 0xbc, 0xb1, 0xb6, 0xac, + 0xf4, 0xa2, 0x48, 0xb0, 0xea, 0xad, 0xd8, 0x28, 0x13, 0xa9, 0x84, 0xa8, 0x1e, 0xa6, 0x0d, 0xa9, 0xf8, 0xad, 0x49, 0xb1, 0xe2, 0xaf, 0x67, 0xa9, 0xe2, 0xa9, 0x87, 0xab, 0x2b, 0xac, 0x44, 0xad, + 0xe1, 0xad, 0xe6, 0xae, 0x5a, 0xad, 0xfa, 0xae, 0xa7, 0xac, 0xcc, 0xa8, 0x2e, 0xac, 0x52, 0xab, 0x2c, 0xa9, 0xf8, 0xad, 0x8d, 0xaf, 0xc7, 0xb0, 0xc0, 0xa4, 0x24, 0xaa, 0xc0, 0xac, 0x90, 0xae, + 0x70, 0xaf, 0x9d, 0xaf, 0x78, 
0xae, 0xe8, 0xa9, 0x25, 0x29, 0xfc, 0xb0, 0x3f, 0xb1, 0x38, 0xb0, 0xa4, 0xa3, 0x07, 0xb1, 0xaa, 0xa8, 0xcb, 0x28, 0xec, 0xa9, 0xbc, 0xa9, 0x05, 0xb0, 0x48, 0xa6, + 0x48, 0xaf, 0x4b, 0xa9, 0x77, 0xaf, 0x65, 0xb1, 0x1d, 0xaf, 0x30, 0xaf, 0xac, 0xa3, 0xb9, 0xae, 0x58, 0xaa, 0xfb, 0xb2, 0xd0, 0x27, 0x0a, 0xa6, 0x8d, 0xaa, 0x5e, 0xad, 0xdc, 0x1c, 0x67, 0xad, + 0xec, 0xac, 0xd8, 0xac, 0x72, 0xac, 0xd7, 0xad, 0xd8, 0xaa, 0x13, 0xae, 0x8c, 0xae, 0x62, 0xaf, 0x70, 0xab, 0x31, 0xae, 0x7c, 0xa6, 0xb5, 0xad, 0x03, 0xac, 0xd2, 0xac, 0x2b, 0xac, 0x83, 0xac, + 0x8c, 0xae, 0x9e, 0xac, 0xad, 0xae, 0xcc, 0xad, 0x8a, 0xad, 0x9a, 0xae, 0xed, 0xac, 0x97, 0xae, 0xfe, 0xad, 0xab, 0xa9, 0x0b, 0xac, 0x76, 0xac, 0xcc, 0xaf, 0x50, 0xad, 0x5a, 0xac, 0x99, 0xae, + 0x7a, 0xad, 0xfb, 0xae, 0xb2, 0xac, 0xd2, 0xa9, 0xcb, 0xac, 0x90, 0xac, 0x08, 0xaf, 0x4d, 0xad, 0x6a, 0xad, 0xee, 0xac, 0x26, 0xad, 0x19, 0xae, 0xac, 0xaf, 0xcd, 0xad, 0x27, 0xae, 0xf0, 0xab, + 0xeb, 0xae, 0xce, 0xac, 0xe4, 0xac, 0xdc, 0xab, 0xa2, 0xaa, 0xbd, 0xac, 0x0a, 0xae, 0x21, 0xac, 0x88, 0xac, 0xf4, 0xae, 0x9d, 0xac, 0x93, 0xae, 0x66, 0xb0, 0x4a, 0xac, 0x29, 0xab, 0x07, 0xac, + 0xd6, 0xaa, 0xa1, 0xab, 0x98, 0xad, 0xca, 0xaa, 0xd6, 0xab, 0xdf, 0xac, 0x30, 0xaf, 0x17, 0xac, 0x1f, 0xac, 0x26, 0xae, 0x52, 0xac, 0x97, 0xad, 0xdc, 0xa4, 0x02, 0xac, 0x11, 0xac, 0x4d, 0xac, + 0xb8, 0xaf, 0x3c, 0xaf, 0xd6, 0xad, 0x70, 0xac, 0x48, 0xae, 0xab, 0xab, 0x32, 0xae, 0x80, 0xa9, 0xcf, 0xac, 0x45, 0xac, 0x69, 0xae, 0xb3, 0xac, 0x3c, 0xa1, 0xbe, 0xad, 0x6a, 0xad, 0x56, 0xad, + 0xe6, 0xad, 0x18, 0xac, 0x23, 0xae, 0x63, 0xad, 0xaf, 0xa8, 0x68, 0xaf, 0x65, 0xa4, 0x65, 0xae, 0xf8, 0xac, 0xf0, 0xad, 0x7c, 0xad, 0x36, 0xad, 0x15, 0xae, 0x45, 0xad, 0xf9, 0xae, 0x54, 0xab, + 0x27, 0xb0, 0x5f, 0xae, 0x41, 0xad, 0xe6, 0xaf, 0xa4, 0xae, 0x3a, 0xa5, 0x51, 0xa9, 0x44, 0xad, 0xe0, 0xaa, 0xf6, 0xac, 0x1d, 0xad, 0x55, 0xa9, 0xb0, 0xa9, 0x3e, 0xac, 0x5d, 0xa9, 0x02, 0xad, + 0xfe, 0xa4, 0x1c, 0xa9, 0x19, 0xa8, 0x41, 0xa0, 0x8e, 0xaa, 0x42, 0xab, 0x33, 0xa9, 0x97, 
0xaa, 0xc8, 0xa8, 0xec, 0xaa, 0x89, 0x1e, 0xf8, 0x9c, 0xcc, 0x9c, 0xb1, 0xa6, 0x7c, 0xaa, 0xe4, 0xa5, + 0xaa, 0x9f, 0xa8, 0xa2, 0x96, 0xaa, 0xb4, 0x9c, 0x00, 0xa8, 0x01, 0xa9, 0x30, 0xa8, 0x8b, 0xa9, 0xc2, 0xab, 0xe5, 0xa8, 0x67, 0xa9, 0x7e, 0xa4, 0xee, 0xac, 0xb8, 0xa8, 0xaa, 0xa1, 0x4d, 0xa8, + 0xea, 0xac, 0xb6, 0xa9, 0x86, 0xab, 0xce, 0xa9, 0x8b, 0xa9, 0xc3, 0xa4, 0xca, 0xa7, 0x40, 0x9d, 0x4c, 0xa7, 0xa4, 0xa9, 0xa8, 0xa8, 0xcf, 0xab, 0x53, 0xa2, 0x98, 0xaa, 0xc4, 0xa8, 0x54, 0xa6, + 0xda, 0xaa, 0x2e, 0xa6, 0x34, 0xad, 0x5c, 0xa8, 0xa8, 0x21, 0x3f, 0xa1, 0x2c, 0xac, 0x97, 0xa5, 0x4e, 0xa0, 0x1e, 0xad, 0x96, 0xa4, 0x0d, 0xac, 0x57, 0xad, 0xdc, 0x9e, 0x24, 0xa9, 0x6b, 0xa9, + 0x4f, 0x20, 0xc1, 0xa9, 0xd7, 0xa8, 0x8c, 0xa8, 0xf8, 0x15, 0x2c, 0xa9, 0x36, 0xaa, 0x30, 0xa9, 0x02, 0xa8, 0x16, 0xae, 0xc8, 0xaa, 0x4b, 0xa9, 0x4c, 0x21, 0xf0, 0x91, 0xc1, 0xa5, 0xa4, 0xa6, + 0x04, 0xa9, 0x32, 0xad, 0x4c, 0xa8, 0xbd, 0xa5, 0xa0, 0xa7, 0x68, 0xa6, 0xad, 0xab, 0x3c, 0xa5, 0x01, 0x24, 0xc6, 0xa9, 0x17, 0xac, 0xec, 0xa8, 0x94, 0x20, 0x05, 0xa9, 0x3f, 0xaa, 0x3b, 0xa4, + 0x4b, 0xaa, 0xe5, 0xa7, 0xe8, 0xa9, 0x09, 0xab, 0x20, 0x20, 0xea, 0xaf, 0xd6, 0xa8, 0xf8, 0xa8, 0xf4, 0xa1, 0x90, 0xad, 0x07, 0xa2, 0x4b, 0xa4, 0x57, 0xa5, 0x9c, 0xaa, 0x4b, 0xad, 0x38, 0xa6, + 0x5a, 0xac, 0x89, 0xa8, 0xac, 0xaa, 0xe0, 0xac, 0x6c, 0xa5, 0x1f, 0xa8, 0x6d, 0x9d, 0xad, 0xab, 0x24, 0xa4, 0x40, 0xac, 0x38, 0xa4, 0x7c, 0x21, 0xb0, 0x19, 0x66, 0xa4, 0x2f, 0xa1, 0xc0, 0xa9, + 0xea, 0xc0, 0xed, 0xbf, 0x89, 0xc1, 0xd2, 0xbe, 0xd5, 0xc0, 0x5a, 0xbd, 0x77, 0xbf, 0x37, 0xbc, 0xf0, 0x2b, 0x0e, 0xc1, 0xc1, 0xb8, 0x84, 0xc1, 0x90, 0xbb, 0x11, 0xc3, 0x32, 0xc2, 0x56, 0xc2, + 0x52, 0xc1, 0x38, 0xc1, 0x1c, 0xbe, 0x11, 0xbd, 0x30, 0xbf, 0x6a, 0xc1, 0xe6, 0xbe, 0x3d, 0xc0, 0x58, 0xc2, 0x6b, 0xc0, 0x97, 0xbf, 0x0e, 0xbc, 0x1c, 0xc1, 0xef, 0xc2, 0x73, 0xc0, 0xb6, 0xc0, + 0x0e, 0xbf, 0x31, 0xc0, 0x94, 0xbe, 0xf6, 0xbd, 0x06, 0xc1, 0x4a, 0xc2, 0xda, 0xbc, 0x8a, 0xbd, 0xd6, 0xc1, 0x17, 0xbf, 0xbc, 0xc2, 0x9b, 0xb8, 0xf1, 
0xbe, 0xdc, 0xc0, 0x12, 0xc1, 0xea, 0xbe, + 0x10, 0xc3, 0x83, 0xbe, 0xa3, 0xbf, 0xfc, 0xc0, 0x2a, 0xbf, 0xb6, 0xc0, 0x3c, 0xb8, 0xdc, 0xbd, 0xd8, 0x30, 0x0d, 0xc4, 0x31, 0xbf, 0x42, 0xc2, 0x45, 0xbd, 0xcb, 0xbf, 0x1b, 0xc4, 0x85, 0xbe, + 0x29, 0xbe, 0xde, 0xc1, 0x51, 0xc1, 0x14, 0x3c, 0x32, 0xc0, 0x05, 0xbc, 0x8f, 0xbc, 0x0c, 0xbb, 0xeb, 0xc0, 0xaf, 0xc1, 0x2a, 0xc1, 0x61, 0xbd, 0x05, 0xbf, 0x09, 0xc1, 0x11, 0xc0, 0xd1, 0xc0, + 0x66, 0xc2, 0x60, 0xc0, 0x49, 0xc1, 0x62, 0xc2, 0x3d, 0xc1, 0xe9, 0xbc, 0x0b, 0xbe, 0x22, 0xbe, 0x95, 0xc1, 0x2c, 0xc0, 0x70, 0xc1, 0x82, 0xc3, 0x2f, 0xbb, 0x5b, 0xbe, 0x5d, 0xbf, 0xef, 0xc2, + 0xf2, 0xc1, 0x2a, 0xc2, 0x8a, 0xc1, 0xcb, 0xbb, 0x25, 0x2c, 0x3a, 0xc0, 0x38, 0xc2, 0x99, 0xc3, 0xa0, 0xbd, 0xaa, 0xc1, 0x21, 0xc0, 0x1a, 0xac, 0x64, 0xc0, 0x08, 0xbc, 0xfe, 0xc0, 0x20, 0xbb, + 0x0e, 0xc2, 0xe2, 0xbe, 0x6e, 0xc1, 0xf7, 0xc3, 0xce, 0xc3, 0x32, 0xc0, 0x96, 0xbb, 0x68, 0xc0, 0xb0, 0xbe, 0xa4, 0xc4, 0xd0, 0xb2, 0x8d, 0xbe, 0x3a, 0xc0, 0x62, 0xc1, 0x64, 0xb5, 0x2c, 0xc0}; unsigned char conv2d_winograd_fp16_bias[] = { 0xf6, 0x3e, 0x80, 0x3f, 0x7f, 0x44, 0xde, 0x3e, 0x90, 0x47, 0x25, 0x4b, 0xa4, 0xc4, 0x00, 0x42, diff --git a/tests/unit_test/valid_data/fullyconnected.dat b/tests/unit_test/valid_data/fullyconnected.dat index 28e69609..4a1d7bd0 100644 --- a/tests/unit_test/valid_data/fullyconnected.dat +++ b/tests/unit_test/valid_data/fullyconnected.dat @@ -261,19 +261,19 @@ unsigned char fc_fp32_weight_ref[] = { 0x91, 0x51, 0x76, 0xbf, 0x46, 0x9e, 0x13, 0x3e, 0x9f, 0x56, 0x1e, 0x3f, 0xbc, 0x63, 0x15, 0x3e, 0xed, 0xe3, 0x56, 0xbf, 0xc4, 0x5c, 0xd1, 0x3e, 0xff, 0x0e, 0x62, 0x3f, 0xcf, 0x03, 0xfd, 0x3d, 0x78, 0x49, 0x74, 0x3f, 0x11, 0x16, 0x7f, 0xbf, 0xe6, 0x8b, 0x5c, 0xbf, 0xd4, 0xdb, 0x3a, 0x3f, - 0x28, 0x29, 0x14, 0x3f, 0x34, 0x6f, 0x25, 0xbf, 0xc5, 0xab, 0x6c, 0x3f, 0x13, 0xd3, 0xf6, 0x3e, - 0x7d, 0x2c, 0x21, 0x3f, 0xca, 0x50, 0x30, 0xbd, 0x81, 0x4b, 0x3d, 0xbd, 0xdf, 0xa0, 0x34, 0x3f, - 0xf5, 0xfb, 0xa8, 0x3e, 0xc8, 0xec, 0x98, 0x3e, 0x29, 
0x1b, 0x3f, 0xbe, 0x0d, 0x96, 0x1c, 0x3f, - 0x35, 0xf6, 0x3c, 0x3f, 0x02, 0x45, 0xef, 0xbe, 0x93, 0x3d, 0x47, 0xbf, 0x63, 0x99, 0x65, 0xbf, - 0xcd, 0xbc, 0x1f, 0xbf, 0xb0, 0x59, 0x18, 0xbf, 0x1a, 0x16, 0x55, 0xbf, 0xf5, 0x8e, 0x54, 0x3f, - 0xf1, 0x41, 0x0c, 0xbf, 0xff, 0xd4, 0x0c, 0x3f, 0x72, 0xed, 0x15, 0x3e, 0x8b, 0x2e, 0x6f, 0xbe, - 0xce, 0x46, 0x5d, 0xbf, 0xcd, 0xa9, 0x7d, 0xbe, 0x76, 0x06, 0x5b, 0xbf, 0xad, 0xce, 0x74, 0x3d, - 0x4c, 0x40, 0x4d, 0xbf, 0xd3, 0xdc, 0xc2, 0x3d, 0x41, 0x80, 0x56, 0x3f, 0x18, 0x2f, 0x46, 0x3f, - 0xdd, 0x44, 0xc1, 0xbe, 0xa7, 0xa5, 0x88, 0xbe, 0x6d, 0x52, 0x46, 0xbe, 0xc5, 0x68, 0x22, 0xbf, - 0x72, 0x67, 0x80, 0x3d, 0xa3, 0xab, 0x85, 0x3d, 0xcf, 0x99, 0x33, 0x3f, 0x19, 0x7a, 0x08, 0x3f, - 0x3a, 0xed, 0x9d, 0x3d, 0x43, 0x56, 0xca, 0x3d, 0x5d, 0x59, 0x66, 0xbf, 0x2f, 0xfc, 0x52, 0xbf, - 0x8d, 0xc9, 0x12, 0xbf, 0x61, 0x31, 0xbb, 0xbe, 0x12, 0x67, 0x75, 0x3f, 0x5a, 0xe5, 0xae, 0xbe, - 0xcd, 0xe4, 0x4b, 0xbe, 0x3c, 0x5b, 0x43, 0x3f, 0xa8, 0x1e, 0xda, 0xbe}; + 0x28, 0x29, 0x14, 0x3f, 0xb0, 0x59, 0x18, 0xbf, 0x6d, 0x52, 0x46, 0xbe, 0x34, 0x6f, 0x25, 0xbf, + 0x1a, 0x16, 0x55, 0xbf, 0xc5, 0x68, 0x22, 0xbf, 0xc5, 0xab, 0x6c, 0x3f, 0xf5, 0x8e, 0x54, 0x3f, + 0x72, 0x67, 0x80, 0x3d, 0x13, 0xd3, 0xf6, 0x3e, 0xf1, 0x41, 0x0c, 0xbf, 0xa3, 0xab, 0x85, 0x3d, + 0x7d, 0x2c, 0x21, 0x3f, 0xff, 0xd4, 0x0c, 0x3f, 0xcf, 0x99, 0x33, 0x3f, 0xca, 0x50, 0x30, 0xbd, + 0x72, 0xed, 0x15, 0x3e, 0x19, 0x7a, 0x08, 0x3f, 0x81, 0x4b, 0x3d, 0xbd, 0x8b, 0x2e, 0x6f, 0xbe, + 0x3a, 0xed, 0x9d, 0x3d, 0xdf, 0xa0, 0x34, 0x3f, 0xce, 0x46, 0x5d, 0xbf, 0x43, 0x56, 0xca, 0x3d, + 0xf5, 0xfb, 0xa8, 0x3e, 0xcd, 0xa9, 0x7d, 0xbe, 0x5d, 0x59, 0x66, 0xbf, 0xc8, 0xec, 0x98, 0x3e, + 0x76, 0x06, 0x5b, 0xbf, 0x2f, 0xfc, 0x52, 0xbf, 0x29, 0x1b, 0x3f, 0xbe, 0xad, 0xce, 0x74, 0x3d, + 0x8d, 0xc9, 0x12, 0xbf, 0x0d, 0x96, 0x1c, 0x3f, 0x4c, 0x40, 0x4d, 0xbf, 0x61, 0x31, 0xbb, 0xbe, + 0x35, 0xf6, 0x3c, 0x3f, 0xd3, 0xdc, 0xc2, 0x3d, 0x12, 0x67, 0x75, 0x3f, 0x02, 0x45, 0xef, 0xbe, + 0x41, 0x80, 0x56, 
0x3f, 0x5a, 0xe5, 0xae, 0xbe, 0x93, 0x3d, 0x47, 0xbf, 0x18, 0x2f, 0x46, 0x3f, + 0xcd, 0xe4, 0x4b, 0xbe, 0x63, 0x99, 0x65, 0xbf, 0xdd, 0x44, 0xc1, 0xbe, 0x3c, 0x5b, 0x43, 0x3f, + 0xcd, 0xbc, 0x1f, 0xbf, 0xa7, 0xa5, 0x88, 0xbe, 0xa8, 0x1e, 0xda, 0xbe}; unsigned char fc_fp32_bias[] = { 0x87, 0x61, 0xbb, 0x3f, 0xde, 0x60, 0xaa, 0x40, 0xe2, 0x91, 0xe9, 0x3e, 0xec, 0x1f, 0xed, 0x3d, 0x98, 0x43, 0x7d, 0x40, 0x2a, 0x12, 0x40, 0x40, 0x55, 0x39, 0xa7, 0x40, 0x20, 0x4e, 0x24, 0xc0, @@ -416,21 +416,21 @@ unsigned char fc_fp16_weight_ref[] = { 0x21, 0xbb, 0x8c, 0x38, 0xbc, 0xba, 0x35, 0x2c, 0x0e, 0x38, 0xef, 0x39, 0x5b, 0xad, 0x12, 0xa3, 0x79, 0x38, 0x64, 0x33, 0x6f, 0xbb, 0x32, 0x3b, 0xae, 0xbb, 0x8d, 0xb8, 0xc6, 0xa7, 0x00, 0x29, 0xb7, 0x3a, 0xea, 0x27, 0xc2, 0x33, 0x90, 0x33, 0x56, 0xbb, 0x12, 0xbb, 0x9d, 0x37, 0xc3, 0xb9, - 0x44, 0xb1, 0x8c, 0xb7, 0x65, 0xb6, 0x2e, 0x37, 0x4c, 0xbb, 0xa7, 0x36, 0xa3, 0xb1, 0x84, 0x28, - 0x5d, 0xb9, 0x00, 0x34, 0x7e, 0x37, 0x5b, 0xb9, 0xdf, 0xb8, 0x6b, 0x39, 0xb2, 0xbb, 0xb7, 0xba, - 0xa2, 0x3b, 0xe4, 0xb8, 0xb7, 0x32, 0xbe, 0x38, 0xe7, 0x35, 0x32, 0x38, 0x78, 0xb4, 0xa3, 0xb9, - 0xa8, 0xba, 0x55, 0xad, 0x9e, 0xb6, 0x80, 0x39, 0x4c, 0x38, 0x98, 0xbb, 0x1f, 0xb8, 0x9c, 0x30, - 0x8a, 0x36, 0xf8, 0xbb, 0x3d, 0x38, 0xb9, 0xb8, 0x8d, 0x3a, 0xca, 0x34, 0x26, 0xb4, 0xfb, 0xb5, - 0x4e, 0x39, 0xa9, 0x30, 0xa5, 0x27, 0x6a, 0xb7, 0xd6, 0xbb, 0xae, 0x38, 0xe7, 0x2a, 0xe8, 0xb9, - 0xf2, 0x38, 0x10, 0x3b, 0xe4, 0xba, 0x22, 0x37, 0x4d, 0x3b, 0xd5, 0xb8, 0xee, 0x31, 0xc3, 0x33, - 0x54, 0xb2, 0xa3, 0x32, 0x8a, 0xb5, 0xd1, 0x36, 0xfa, 0x3a, 0xe8, 0x38, 0x7a, 0xb8, 0x5a, 0x9e, - 0x23, 0xba, 0xab, 0x30, 0xe8, 0x2f, 0xd6, 0x39, 0xa1, 0x38, 0x2b, 0xb9, 0x65, 0x3b, 0xb6, 0x37, - 0x09, 0x39, 0x82, 0xa9, 0xea, 0xa9, 0xa5, 0x39, 0x47, 0x35, 0xc7, 0x34, 0xf8, 0xb1, 0xe4, 0x38, - 0xe7, 0x39, 0x7a, 0xb7, 0x39, 0xba, 0x2c, 0xbb, 0xfd, 0xb8, 0xc2, 0xb8, 0xa8, 0xba, 0xa4, 0x3a, - 0x62, 0xb8, 0x66, 0x38, 0xaf, 0x30, 0x79, 0xb3, 0xea, 0xba, 0xed, 0xb3, 0xd8, 0xba, 0xa6, 
0x2b, - 0x6a, 0xba, 0x16, 0x2e, 0xb4, 0x3a, 0x31, 0x3a, 0x0a, 0xb6, 0x45, 0xb4, 0x32, 0xb2, 0x13, 0xb9, - 0x03, 0x2c, 0x2d, 0x2c, 0x9c, 0x39, 0x43, 0x38, 0xef, 0x2c, 0x52, 0x2e, 0x32, 0xbb, 0x97, 0xba, - 0x96, 0xb8, 0xd9, 0xb5, 0xab, 0x3b, 0x77, 0xb5, 0x5f, 0xb2, 0x1a, 0x3a, 0xd0, 0xb6}; + 0x44, 0xb1, 0xe4, 0xb8, 0x3d, 0x38, 0x22, 0x37, 0xa1, 0x38, 0xc2, 0xb8, 0x32, 0xb2, 0x8c, 0xb7, + 0xb7, 0x32, 0xb9, 0xb8, 0x4d, 0x3b, 0x2b, 0xb9, 0xa8, 0xba, 0x13, 0xb9, 0x65, 0xb6, 0xbe, 0x38, + 0x8d, 0x3a, 0xd5, 0xb8, 0x65, 0x3b, 0xa4, 0x3a, 0x03, 0x2c, 0x2e, 0x37, 0xe7, 0x35, 0xca, 0x34, + 0xee, 0x31, 0xb6, 0x37, 0x62, 0xb8, 0x2d, 0x2c, 0x4c, 0xbb, 0x32, 0x38, 0x26, 0xb4, 0xc3, 0x33, + 0x09, 0x39, 0x66, 0x38, 0x9c, 0x39, 0xa7, 0x36, 0x78, 0xb4, 0xfb, 0xb5, 0x54, 0xb2, 0x82, 0xa9, + 0xaf, 0x30, 0x43, 0x38, 0xa3, 0xb1, 0xa3, 0xb9, 0x4e, 0x39, 0xa3, 0x32, 0xea, 0xa9, 0x79, 0xb3, + 0xef, 0x2c, 0x84, 0x28, 0xa8, 0xba, 0xa9, 0x30, 0x8a, 0xb5, 0xa5, 0x39, 0xea, 0xba, 0x52, 0x2e, + 0x5d, 0xb9, 0x55, 0xad, 0xa5, 0x27, 0xd1, 0x36, 0x47, 0x35, 0xed, 0xb3, 0x32, 0xbb, 0x00, 0x34, + 0x9e, 0xb6, 0x6a, 0xb7, 0xfa, 0x3a, 0xc7, 0x34, 0xd8, 0xba, 0x97, 0xba, 0x7e, 0x37, 0x80, 0x39, + 0xd6, 0xbb, 0xe8, 0x38, 0xf8, 0xb1, 0xa6, 0x2b, 0x96, 0xb8, 0x5b, 0xb9, 0x4c, 0x38, 0xae, 0x38, + 0x7a, 0xb8, 0xe4, 0x38, 0x6a, 0xba, 0xd9, 0xb5, 0xdf, 0xb8, 0x98, 0xbb, 0xe7, 0x2a, 0x5a, 0x9e, + 0xe7, 0x39, 0x16, 0x2e, 0xab, 0x3b, 0x6b, 0x39, 0x1f, 0xb8, 0xe8, 0xb9, 0x23, 0xba, 0x7a, 0xb7, + 0xb4, 0x3a, 0x77, 0xb5, 0xb2, 0xbb, 0x9c, 0x30, 0xf2, 0x38, 0xab, 0x30, 0x39, 0xba, 0x31, 0x3a, + 0x5f, 0xb2, 0xb7, 0xba, 0x8a, 0x36, 0x10, 0x3b, 0xe8, 0x2f, 0x2c, 0xbb, 0x0a, 0xb6, 0x1a, 0x3a, + 0xa2, 0x3b, 0xf8, 0xbb, 0xe4, 0xba, 0xd6, 0x39, 0xfd, 0xb8, 0x45, 0xb4, 0xd0, 0xb6}; unsigned char fc_fp16_bias[] = { 0xdb, 0x3d, 0x53, 0x45, 0x4c, 0x37, 0x68, 0x2f, 0xea, 0x43, 0x00, 0x42, 0x39, 0x45, 0x22, 0xc1, 0xd1, 0x2a, 0x1f, 0x42, 0xdd, 0xba, 0xd6, 0x42, 0x1b, 0x3d, 0xeb, 0x44, 0x13, 0x3a, 0xa3, 0x43, diff --git 
a/tests/utils/math_snr.c b/tests/utils/math_snr.c index a2ec95f9..ec3e752e 100644 --- a/tests/utils/math_snr.c +++ b/tests/utils/math_snr.c @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /* ---------------------------------------------------------------------- * Include project header files diff --git a/tests/utils/math_snr.h b/tests/utils/math_snr.h index 474c4ca1..4cc87d68 100644 --- a/tests/utils/math_snr.h +++ b/tests/utils/math_snr.h @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include #include diff --git a/tests/utils/test_utils.c b/tests/utils/test_utils.c index d191537e..243ff08f 100644 --- a/tests/utils/test_utils.c +++ b/tests/utils/test_utils.c @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "test_utils.h" @@ -266,7 +266,7 @@ void result_verify_bool(bool *reference, bool *output, float *input, float gap, } } -void result_verify_8(float *reference, struct csi_tensor *output, int8_t *input, float gap, +void result_verify_8(float *reference, struct csinn_tensor *output, int8_t *input, float gap, int size, bool save) { int i; @@ -279,10 +279,10 @@ void result_verify_8(float *reference, struct csi_tensor *output, int8_t *input, for (i = 0; i < size; i++) { if (output->dtype == CSINN_DTYPE_UINT8) { output_tmp[i] = - csi_ref_dequantize_u8_to_f32(*((uint8_t *)output_data + i), output->qinfo); + shl_ref_dequantize_u8_to_f32(*((uint8_t *)output_data + i), output->qinfo); } else if (output->dtype == CSINN_DTYPE_INT8) { output_tmp[i] = - csi_ref_dequantize_i8_to_f32(*((int8_t *)output_data + i), output->qinfo); + shl_ref_dequantize_i8_to_f32(*((int8_t *)output_data + i), output->qinfo); } if (isinf(reference[i]) || isnan(reference[i])) { error = 0; @@ -360,7 +360,7 @@ void result_verify_q15(int16_t *reference, int16_t *output, int16_t *input, floa 
printf("/====== total = %6d(size=%5d) || error = %5d =======/\n", test_number, size, failures); } -void get_scale_and_zp(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 255; float scale_tmp, zp_tmp; @@ -383,7 +383,7 @@ void get_scale_and_zp(float max_value, float min_value, float *scale, int *zp) *scale = scale_tmp; } -void get_scale_and_zp_i8_asym(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp_i8_asym(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 255; float scale_tmp, zp_tmp; @@ -404,7 +404,7 @@ void get_scale_and_zp_i8_asym(float max_value, float min_value, float *scale, in *scale = scale_tmp; } -void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 255; float scale_tmp, zp_tmp, max_tmp; @@ -425,7 +425,7 @@ void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int *zp *scale = scale_tmp; } -void get_scale_and_zp_power2_i8(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp_power2_i8(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 255; float abs_max = fmax(fabs(min_value), fabs(max_value)); @@ -437,7 +437,7 @@ void get_scale_and_zp_power2_i8(float max_value, float min_value, float *scale, *scale = 1.0f / pow(2, exponent - 1); } -void get_scale_and_zp_power2_i16(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp_power2_i16(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 65535; float abs_max = fmax(fabs(min_value), fabs(max_value)); @@ -470,14 +470,15 @@ void find_min_max(float *input, float *max_value, float *min_value, int size) *min_value = min_tmp; } -void set_quant_info(struct csi_tensor *tensor, enum 
csinn_quant_enum qtype, enum csinn_api_enum api) +void set_quant_info(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, + enum csinn_api_enum api) { float max, min, scale; - int zp, quantized_multiplier, shift; + int32_t zp, quantized_multiplier, shift; if (tensor->qinfo == NULL) { - tensor->qinfo = malloc(sizeof(struct csi_quant_info)); + tensor->qinfo = malloc(sizeof(struct csinn_quant_info)); } - int size = csi_tensor_size(tensor); + int size = csinn_tensor_size(tensor); find_min_max(tensor->data, &max, &min, size); if (qtype == CSINN_QUANT_INT8_SYM) { @@ -518,21 +519,21 @@ void set_quant_info(struct csi_tensor *tensor, enum csinn_quant_enum qtype, enum tensor->qinfo->max = max; tensor->qinfo->min = min; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); tensor->qinfo->scale = scale; tensor->qinfo->zero_point = zp; tensor->qinfo->multiplier = quantized_multiplier; tensor->qinfo->shift = shift; } -void get_quant_info(struct csi_tensor *tensor) +void get_quant_info(struct csinn_tensor *tensor) { float max, min, scale; - int zp, quantized_multiplier, shift; + int32_t zp, quantized_multiplier, shift; if (tensor->qinfo == NULL) { - tensor->qinfo = malloc(sizeof(struct csi_quant_info)); + tensor->qinfo = malloc(sizeof(struct csinn_quant_info)); } - int size = csi_tensor_size(tensor); + int size = csinn_tensor_size(tensor); find_min_max(tensor->data, &max, &min, size); if ((tensor->sess != NULL) && (tensor->sess->base_api == CSINN_LIGHT)) { get_scale_and_zp_power2_i8(max, min, &scale, &zp); @@ -552,43 +553,44 @@ void get_quant_info(struct csi_tensor *tensor) tensor->qinfo->min = min; } - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); tensor->qinfo->scale = scale; tensor->qinfo->zero_point = zp; tensor->qinfo->multiplier = quantized_multiplier; tensor->qinfo->shift = shift; } -struct csi_tensor 
*convert_input(struct csi_tensor *tensor, int dtype) +struct csinn_tensor *convert_input(struct csinn_tensor *tensor, int dtype) { - struct csi_tensor *ret = csi_alloc_tensor(tensor->sess); - csi_tensor_copy(ret, tensor); + struct csinn_tensor *ret = csinn_alloc_tensor(tensor->sess); + csinn_tensor_copy(ret, tensor); ret->dtype = dtype; - ret->data = malloc(csi_tensor_byte_size(ret)); - csi_tensor_data_convert(ret, tensor); + ret->data = shl_mem_alloc(csinn_tensor_byte_size(ret)); + csinn_tensor_data_convert(ret, tensor); return ret; } -struct csi_tensor *convert_f32_input(struct csi_tensor *tensor, int dtype, struct csi_session *sess) +struct csinn_tensor *convert_f32_input(struct csinn_tensor *tensor, int dtype, + struct csinn_session *sess) { set_quant_info(tensor, sess->base_quant_type, sess->base_api); - struct csi_tensor *ret = csi_alloc_tensor(sess); - csi_tensor_copy(ret, tensor); + struct csinn_tensor *ret = csinn_alloc_tensor(sess); + csinn_tensor_copy(ret, tensor); ret->sess = sess; ret->dtype = dtype; - ret->data = malloc(csi_tensor_byte_size(ret)); - csi_tensor_data_convert(ret, tensor); + ret->data = shl_mem_alloc(csinn_tensor_byte_size(ret)); + csinn_tensor_data_convert(ret, tensor); return ret; } -struct csi_tensor *convert_f32_layer(struct csi_tensor *tensor, enum csinn_quant_enum qtype, - enum csinn_api_enum api) +struct csinn_tensor *convert_f32_layer(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, + enum csinn_api_enum api) { set_quant_info(tensor, qtype, api); - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, tensor); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, tensor); if ((qtype == CSINN_QUANT_INT8_SYM) || (qtype == CSINN_QUANT_INT8_ASYM)) { ret->dtype = CSINN_DTYPE_INT8; } else if (qtype == CSINN_QUANT_UINT8_ASYM) { @@ -603,30 +605,54 @@ struct csi_tensor *convert_f32_layer(struct csi_tensor *tensor, enum csinn_quant printf("unsupport qinfo\n"); } - ret->data = 
malloc(csi_tensor_byte_size(ret)); - csi_tensor_data_convert(ret, tensor); + ret->data = malloc(csinn_tensor_byte_size(ret)); + csinn_tensor_data_convert(ret, tensor); return ret; } -void free_input(struct csi_tensor *tensor) +void free_input(struct csinn_tensor *tensor) { - csi_mem_free(tensor->data); - csi_free_tensor(tensor); + shl_mem_free(tensor->data); + csinn_free_tensor(tensor); } -struct csi_tensor *fuse_zp_to_bias(struct csi_tensor *input, struct csi_tensor *weight, - struct csi_tensor *bias, enum csinn_api_enum api) +struct csinn_tensor *convert_f32_bias(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *bias, enum csinn_api_enum api) { set_quant_info(input, CSINN_QUANT_INT8_ASYM, api); set_quant_info(weight, CSINN_QUANT_INT8_SYM, api); - int b_size = csi_tensor_size(bias); - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, bias); + int b_size = csinn_tensor_size(bias); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, bias); ret->qinfo->scale = input->qinfo->scale * weight->qinfo->scale; ret->qinfo->zero_point = 0; ret->dtype = CSINN_DTYPE_INT32; - ret->data = malloc(csi_tensor_byte_size(ret)); + ret->data = malloc(csinn_tensor_byte_size(ret)); + int32_t *ret_data = ret->data; + float new_b = 0.0; + float *bias_data = (float *)bias->data; + int b_length = b_size ? bias->dim[0] : weight->dim[0]; + for (int i = 0; i < b_length; i++) { + new_b = b_size ? 
bias_data[i] : 0.0; + ret_data[i] = new_b / ret->qinfo->scale; + } + + return ret; +} + +struct csinn_tensor *fuse_zp_to_bias(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *bias, enum csinn_api_enum api) +{ + set_quant_info(input, CSINN_QUANT_INT8_ASYM, api); + set_quant_info(weight, CSINN_QUANT_INT8_SYM, api); + int b_size = csinn_tensor_size(bias); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, bias); + ret->qinfo->scale = input->qinfo->scale * weight->qinfo->scale; + ret->qinfo->zero_point = 0; + ret->dtype = CSINN_DTYPE_INT32; + ret->data = malloc(csinn_tensor_byte_size(ret)); int32_t *ret_data = ret->data; int b_length = b_size ? bias->dim[0] : weight->dim[0]; @@ -655,8 +681,8 @@ struct csi_tensor *fuse_zp_to_bias(struct csi_tensor *input, struct csi_tensor * void evaluate_error(void *out, void *ref, int size, enum csinn_dtype_enum dtype) { - float *output = csi_mem_alloc(size * sizeof(float)); - float *reference = csi_mem_alloc(size * sizeof(float)); + float *output = shl_mem_alloc(size * sizeof(float)); + float *reference = shl_mem_alloc(size * sizeof(float)); if (dtype == CSINN_DTYPE_FLOAT32) { memcpy(output, out, size * sizeof(float)); memcpy(reference, ref, size * sizeof(float)); @@ -680,6 +706,6 @@ void evaluate_error(void *out, void *ref, int size, enum csinn_dtype_enum dtype) if (kl > 0.01f || cs < 0.99f) { failures++; } - csi_mem_free(output); - csi_mem_free(reference); + shl_mem_free(output); + shl_mem_free(reference); } diff --git a/tests/utils/test_utils.h b/tests/utils/test_utils.h index b397f2b9..af4f48e2 100644 --- a/tests/utils/test_utils.h +++ b/tests/utils/test_utils.h @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #ifndef TEST_UTILS_H #define TEST_UTILS_H @@ -27,7 +27,7 @@ #include #include "csi_nn.h" -#include "csi_ref.h" +#include "shl_ref.h" #ifdef __cplusplus extern "C" { @@ -42,26 +42,28 @@ void result_verify_f32(float *reference, float *output, float *input, float gap, bool save); void result_verify_bool(bool *reference, bool *output, float *input, float gap, int size, bool save); -void result_verify_8(float *reference, struct csi_tensor *output, int8_t *input, float gap, +void result_verify_8(float *reference, struct csinn_tensor *output, int8_t *input, float gap, int size, bool save); void result_verify_q7(int8_t *reference, int8_t *output, int8_t *input, float gap, int size, bool save); void result_verify_q15(int16_t *reference, int16_t *output, int16_t *input, float gap, int size, bool save); -void get_scale_and_zp(float max_value, float min_value, float *scale, int *zp); -void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int *zp); +void get_scale_and_zp(float max_value, float min_value, float *scale, int32_t *zp); +void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int32_t *zp); void find_min_max(float *input, float *max_value, float *min_value, int size); -void get_quant_info(struct csi_tensor *tensor); -void set_quant_info(struct csi_tensor *tensor, enum csinn_quant_enum qtype, +void get_quant_info(struct csinn_tensor *tensor); +void set_quant_info(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, enum csinn_api_enum api); -struct csi_tensor *convert_input(struct csi_tensor *tensor, int dtype); -struct csi_tensor *convert_f32_input(struct csi_tensor *tensor, int dtype, - struct csi_session *sess); -struct csi_tensor *convert_f32_layer(struct csi_tensor *tensor, enum csinn_quant_enum qtype, - enum csinn_api_enum api); -struct csi_tensor *fuse_zp_to_bias(struct csi_tensor *input, struct csi_tensor *weight, - struct csi_tensor *bias, enum 
csinn_api_enum api); -void free_input(struct csi_tensor *tensor); +struct csinn_tensor *convert_input(struct csinn_tensor *tensor, int dtype); +struct csinn_tensor *convert_f32_input(struct csinn_tensor *tensor, int dtype, + struct csinn_session *sess); +struct csinn_tensor *convert_f32_layer(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, + enum csinn_api_enum api); +struct csinn_tensor *fuse_zp_to_bias(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *bias, enum csinn_api_enum api); +struct csinn_tensor *convert_f32_bias(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *bias, enum csinn_api_enum api); +void free_input(struct csinn_tensor *tensor); extern void init_testsuite(const char *testname); extern int done_testing(void); #ifdef RISCV_TEST diff --git a/tests/validation/Makefile.c860 b/tests/validation/Makefile.c860 index d7df7900..10757d41 100644 --- a/tests/validation/Makefile.c860 +++ b/tests/validation/Makefile.c860 @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mhard-float -mcpu=ck860fv CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=0 # params.api = CSINN_API = CSINN_C860 = 0 -LIB_NAME = csi_nn2_c860 +CFLAGS += -DCSINN_API=0 # params->api = CSINN_API = CSINN_C860 = 0 +LIB_NAME = shl_c860 CC = csky-abiv2-linux-gcc test_objs = diff --git a/tests/validation/Makefile.c906 b/tests/validation/Makefile.c906 index 2e2c8a32..901fe901 100644 --- a/tests/validation/Makefile.c906 +++ b/tests/validation/Makefile.c906 @@ -4,7 +4,7 @@ CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcvxthead -mabi=lp64dv CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections CFLAGS += -DCSINN_API=3 -LIB_NAME = csi_nn2_c906 +LIB_NAME = shl_c906 CC = riscv64-unknown-linux-gnu-gcc test_objs = diff --git a/tests/validation/Makefile.ref b/tests/validation/Makefile.ref index a7cb01b1..dd592a5c 100644 --- a/tests/validation/Makefile.ref 
+++ b/tests/validation/Makefile.ref @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mhard-float -mcpu=ck860fv CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=0 # params.api = CSINN_API = CSINN_REF = 0 -LIB_NAME = csi_nn2_ref +CFLAGS += -DCSINN_API=0 # params->api = CSINN_API = CSINN_REF = 0 +LIB_NAME = shl_ref CC = csky-abiv2-linux-gcc test_objs = diff --git a/tests/validation/Makefile.ref_x86 b/tests/validation/Makefile.ref_x86 index 5c0818af..1101c3b0 100644 --- a/tests/validation/Makefile.ref_x86 +++ b/tests/validation/Makefile.ref_x86 @@ -2,8 +2,8 @@ LIB_DIR = ../../lib INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -fopenmp CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=0 # params.api = CSINN_API = CSINN_REF = 0 -LIB_NAME = csi_nn2_ref_x86 +CFLAGS += -DCSINN_API=0 # params->api = CSINN_API = CSINN_REF = 0 +LIB_NAME = shl_ref_x86 CC = gcc test_objs = diff --git a/tests/validation/abs_f32.c b/tests/validation/abs_f32.c index 3181eb0c..8db2d224 100644 --- a/tests/validation/abs_f32.c +++ b/tests/validation/abs_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of abs f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,17 +49,16 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_abs_init(input, output, &params) == CSINN_TRUE) { - csi_abs(input, output, &params); - } + if (csinn_abs_init(input, output, params) == CSINN_TRUE) { + csinn_abs(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/abs_i8.c b/tests/validation/abs_i8.c index 9f3a8b01..275635b1 100644 --- a/tests/validation/abs_i8.c +++ b/tests/validation/abs_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of abs i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,33 +57,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float 
*)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -91,16 +90,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_abs_init(input, output, &params) == CSINN_TRUE) { - csi_abs(input, output, &params); - } + if (csinn_abs_init(input, output, params) == CSINN_TRUE) { + csinn_abs(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/abs_u8.c b/tests/validation/abs_u8.c index 57d9c949..475bf1d7 100644 --- a/tests/validation/abs_u8.c +++ b/tests/validation/abs_u8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of abs u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,33 +57,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer 
+ 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -91,16 +90,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_abs_init(input, output, &params) == CSINN_TRUE) { - csi_abs(input, output, &params); - } + if (csinn_abs_init(input, output, params) == CSINN_TRUE) { + csinn_abs(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/acos_f32.c b/tests/validation/acos_f32.c index d7bd5557..753cbf70 100644 --- a/tests/validation/acos_f32.c +++ b/tests/validation/acos_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acos f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_acos_init(input, output, &params) == CSINN_TRUE) { - csi_acos(input, output, &params); + if (csinn_acos_init(input, output, params) == CSINN_TRUE) { + csinn_acos(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/acos_i8.c b/tests/validation/acos_i8.c index d077513e..c5ff7b91 100644 --- a/tests/validation/acos_i8.c +++ b/tests/validation/acos_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acos i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = 
(float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_acos_init(input, output, &params) == CSINN_TRUE) { - csi_acos(input, output, &params); + if (csinn_acos_init(input, output, params) == CSINN_TRUE) { + csinn_acos(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/acos_u8.c b/tests/validation/acos_u8.c index 4c5e83c5..ef44a1a5 100644 --- a/tests/validation/acos_u8.c +++ b/tests/validation/acos_u8.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acos u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_acos_init(input, output, &params) == CSINN_TRUE) { - csi_acos(input, output, &params); + if (csinn_acos_init(input, output, params) == CSINN_TRUE) { + csinn_acos(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/acosh_f32.c b/tests/validation/acosh_f32.c index a6ee3a28..0205d095 100644 --- a/tests/validation/acosh_f32.c +++ b/tests/validation/acosh_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acosh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_acosh_init(input, output, &params) == CSINN_TRUE) { - csi_acosh(input, output, &params); + if (csinn_acosh_init(input, output, params) == CSINN_TRUE) { + csinn_acosh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/acosh_i8.c b/tests/validation/acosh_i8.c index dffb1000..6a409734 100644 --- a/tests/validation/acosh_i8.c +++ b/tests/validation/acosh_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acosh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,8 +54,7 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + 
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -65,23 +64,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -94,9 +94,8 @@ int main(int argc, char** argv) // max error: 0.2 for input [1, 20] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_acosh_init(input, output, &params) == CSINN_TRUE) { - csi_acosh(input, output, &params); + if (csinn_acosh_init(input, output, params) == CSINN_TRUE) { + csinn_acosh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/acosh_u8.c b/tests/validation/acosh_u8.c index dec25e2b..67704ef0 100644 --- a/tests/validation/acosh_u8.c +++ b/tests/validation/acosh_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acosh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,9 +95,8 @@ int main(int argc, char** argv) // max error: 0.2 for input [1, 20] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_acosh_init(input, output, &params) == CSINN_TRUE) { - csi_acosh(input, output, &params); + if (csinn_acosh_init(input, output, params) == CSINN_TRUE) { + csinn_acosh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/add_f32.c b/tests/validation/add_f32.c index a9932df1..f21edbaf 100644 --- a/tests/validation/add_f32.c +++ b/tests/validation/add_f32.c @@ -16,33 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of add f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -64,17 +64,16 @@ int main(int argc, char** argv) out_size = in_size0; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - input1->data = (float *)(buffer + 5 + in_size0); + input0->data = (float *)(buffer + 5); + input1->data = (float *)(buffer + 5 + in_size0); reference->data = 
(float *)(buffer + 5 + in_size0 + in_size1); - output->data = malloc(in_size0 * sizeof(float)); + output->data = malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_add_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_add(input0, input1, output, &params); + if (csinn_add_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_add(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/add_i8.c b/tests/validation/add_i8.c index 24207a1b..4b6af88f 100644 --- a/tests/validation/add_i8.c +++ b/tests/validation/add_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of add i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,17 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = 
buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_INT8; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -74,58 +74,57 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); uint8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 
> 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,16 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_add_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_add(input0, input1, output, ¶ms); + if (csinn_add_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_add(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/add_u8.c b/tests/validation/add_u8.c index 43fca61b..06f2af8d 100644 --- a/tests/validation/add_u8.c +++ b/tests/validation/add_u8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of add u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,17 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel in_size0 = 
input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_UINT8; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -74,58 +74,57 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); uint8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - 
output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,16 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_add_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_add(input0, input1, output, ¶ms); + if (csinn_add_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_add(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/and_u32.c b/tests/validation/and_u32.c index b00a3210..5fda4b6a 100644 --- a/tests/validation/and_u32.c +++ b/tests/validation/and_u32.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of and u32.\n"); - struct csi_tensor *input_0 = csi_alloc_tensor(NULL); - struct csi_tensor *input_1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input_0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input_1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input_0->dim_count = buffer[0]; input_1->dim_count = buffer[0]; output->dim_count = input_0->dim_count; - for(int i = 0; i < input_0->dim_count; i++) { + for (int i = 0; i < input_0->dim_count; i++) { input_0->dim[i] = buffer[i + 1]; input_1->dim[i] = buffer[i + 1]; output->dim[i] = input_0->dim[i]; @@ -48,17 +48,16 @@ int main(int argc, char** argv) input_0->dtype = CSINN_DTYPE_UINT32; input_1->dtype = CSINN_DTYPE_UINT32; output->dtype = CSINN_DTYPE_UINT32; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; + params->base.api = CSINN_API; - input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); - input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); + input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); + input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input_0->dim_count + 2 * in_size); - output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); + output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_and_init(input_0, input_1, output, ¶ms) == CSINN_TRUE) { - csi_and(input_0, input_1, output, ¶ms); + if (csinn_and_init(input_0, input_1, output, params) == CSINN_TRUE) { + csinn_and(input_0, input_1, output, params); } result_verify_int32(reference->data, output->data, input_0->data, difference, out_size, false); diff --git a/tests/validation/arange_f32.c b/tests/validation/arange_f32.c index d14f0e3c..9d5ec79b 100644 --- a/tests/validation/arange_f32.c +++ b/tests/validation/arange_f32.c @@ -16,42 +16,41 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of arange f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct arange_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_arange_params *params = + csinn_alloc_params(sizeof(struct csinn_arange_params), NULL); int out_size = 1; int *buffer = read_input_data_f32(argv[1]); out_size = buffer[3]; - params.start = buffer[0]; - params.stop = buffer[1]; - params.step = buffer[2]; + params->start = buffer[0]; + params->stop = buffer[1]; + params->step = buffer[2]; output->dim_count = 1; output->dim[0] = out_size; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = 0; - reference->data = (float *)(buffer + 4); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_arange_init(output, ¶ms) == CSINN_TRUE) { - csi_arange(output, ¶ms); + if (csinn_arange_init(output, params) == CSINN_TRUE) { + csinn_arange(output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/arange_i8.c b/tests/validation/arange_i8.c index 42842c52..883daaeb 100644 --- a/tests/validation/arange_i8.c +++ b/tests/validation/arange_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of arange i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct arange_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_arange_params *params = + csinn_alloc_params(sizeof(struct csinn_arange_params), NULL); int out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,32 +39,30 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); out_size = buffer[3]; - params.start = buffer[0]; - params.stop = buffer[1]; - params.step = buffer[2]; + params->start = buffer[0]; + params->stop = buffer[1]; + params->step = buffer[2]; output->dim_count = 1; output->dim[0] = out_size; output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; float *ref_data = (float *)(buffer + 4); - csi_quantize_multiplier(params.start, &multiplier, &shift); - params.start_multiplier = multiplier; - params.start_shift = shift; + shl_quantize_multiplier(params->start, &multiplier, &shift); + params->start_multiplier = multiplier; + params->start_shift = shift; - csi_quantize_multiplier(params.stop, &multiplier, &shift); - params.stop_multiplier = multiplier; - params.stop_shift = shift; + shl_quantize_multiplier(params->stop, &multiplier, &shift); + params->stop_multiplier = multiplier; + params->stop_shift = shift; - 
csi_quantize_multiplier(params.step, &multiplier, &shift); - params.step_multiplier = multiplier; - params.step_shift = shift; + shl_quantize_multiplier(params->step, &multiplier, &shift); + params->step_multiplier = multiplier; + params->step_shift = shift; output->data = ref_data; get_quant_info(output); @@ -71,11 +70,10 @@ int main(int argc, char** argv) reference->data = ref_data; output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); - float difference = argc > 2 ? atof(argv[2]) : 1e-3; - if (csi_arange_init(output, ¶ms) == CSINN_TRUE) { - csi_arange(output, ¶ms); + if (csinn_arange_init(output, params) == CSINN_TRUE) { + csinn_arange(output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/arange_u8.c b/tests/validation/arange_u8.c index 09651ea2..22900518 100644 --- a/tests/validation/arange_u8.c +++ b/tests/validation/arange_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of arange u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct arange_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_arange_params *params = + csinn_alloc_params(sizeof(struct csinn_arange_params), NULL); int out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,32 +39,30 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); out_size = buffer[3]; - params.start = buffer[0]; - params.stop = buffer[1]; - 
params.step = buffer[2]; + params->start = buffer[0]; + params->stop = buffer[1]; + params->step = buffer[2]; output->dim_count = 1; output->dim[0] = out_size; output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; float *ref_data = (float *)(buffer + 4); - csi_quantize_multiplier(params.start, &multiplier, &shift); - params.start_multiplier = multiplier; - params.start_shift = shift; + shl_quantize_multiplier(params->start, &multiplier, &shift); + params->start_multiplier = multiplier; + params->start_shift = shift; - csi_quantize_multiplier(params.stop, &multiplier, &shift); - params.stop_multiplier = multiplier; - params.stop_shift = shift; + shl_quantize_multiplier(params->stop, &multiplier, &shift); + params->stop_multiplier = multiplier; + params->stop_shift = shift; - csi_quantize_multiplier(params.step, &multiplier, &shift); - params.step_multiplier = multiplier; - params.step_shift = shift; + shl_quantize_multiplier(params->step, &multiplier, &shift); + params->step_multiplier = multiplier; + params->step_shift = shift; output->data = ref_data; get_quant_info(output); @@ -71,11 +70,10 @@ int main(int argc, char** argv) reference->data = ref_data; output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); - float difference = argc > 2 ? atof(argv[2]) : 1e-3; - if (csi_arange_init(output, ¶ms) == CSINN_TRUE) { - csi_arange(output, ¶ms); + if (csinn_arange_init(output, params) == CSINN_TRUE) { + csinn_arange(output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/argmax_stride_f32.c b/tests/validation/argmax_stride_f32.c index d5152a10..55e7faaf 100644 --- a/tests/validation/argmax_stride_f32.c +++ b/tests/validation/argmax_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmax f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_argmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_argmax(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_argmax_init(input, output, params) == CSINN_TRUE) { + csinn_argmax(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/argmax_stride_u8.c b/tests/validation/argmax_stride_u8.c index db9c9f6d..e7498125 100644 --- a/tests/validation/argmax_stride_u8.c +++ 
b/tests/validation/argmax_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmax u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -73,7 +71,6 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - float *src_in = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); @@ -85,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_argmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_argmax(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_argmax_init(input, output, params) == CSINN_TRUE) { + csinn_argmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/argmin_stride_f32.c b/tests/validation/argmin_stride_f32.c index f7413fd8..db0d2a94 100644 --- a/tests/validation/argmin_stride_f32.c +++ b/tests/validation/argmin_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmin f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_argmin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_argmin(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_argmin_init(input, output, params) == CSINN_TRUE) { + csinn_argmin(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/argmin_stride_u8.c b/tests/validation/argmin_stride_u8.c index de82f431..51f4e460 100644 --- a/tests/validation/argmin_stride_u8.c +++ 
b/tests/validation/argmin_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmin u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -67,13 +65,12 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - float *src_in = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); @@ -85,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_argmin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_argmin(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_argmin_init(input, output, params) == CSINN_TRUE) { + csinn_argmin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/asin_f32.c b/tests/validation/asin_f32.c index cbd02916..b0bc9071 100644 --- a/tests/validation/asin_f32.c +++ b/tests/validation/asin_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asin f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_asin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asin(input, output, ¶ms); + if (csinn_asin_init(input, output, params) == CSINN_TRUE) { + csinn_asin(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/asin_i8.c b/tests/validation/asin_i8.c index 53275bf0..585957d8 100644 --- a/tests/validation/asin_i8.c +++ b/tests/validation/asin_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asin i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = 
(float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_asin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asin(input, output, ¶ms); + if (csinn_asin_init(input, output, params) == CSINN_TRUE) { + csinn_asin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/asin_u8.c b/tests/validation/asin_u8.c index 486f7cc8..9d6a26f4 100644 --- a/tests/validation/asin_u8.c +++ b/tests/validation/asin_u8.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asin u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_asin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asin(input, output, ¶ms); + if (csinn_asin_init(input, output, params) == CSINN_TRUE) { + csinn_asin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/asinh_f32.c b/tests/validation/asinh_f32.c index 16c3de0c..ae73149d 100644 --- a/tests/validation/asinh_f32.c +++ b/tests/validation/asinh_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asinh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_asinh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asinh(input, output, ¶ms); + if (csinn_asinh_init(input, output, params) == CSINN_TRUE) { + csinn_asinh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/asinh_i8.c b/tests/validation/asinh_i8.c index ee8d9dd0..6c1d389f 100644 --- a/tests/validation/asinh_i8.c +++ b/tests/validation/asinh_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asinh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float 
*src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_asinh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asinh(input, output, ¶ms); + if (csinn_asinh_init(input, output, params) == CSINN_TRUE) { + csinn_asinh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/asinh_u8.c b/tests/validation/asinh_u8.c index 5f989c38..cdb3a770 100644 --- a/tests/validation/asinh_u8.c +++ b/tests/validation/asinh_u8.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asinh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < 
in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_asinh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asinh(input, output, ¶ms); + if (csinn_asinh_init(input, output, params) == CSINN_TRUE) { + csinn_asinh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/atan_f32.c b/tests/validation/atan_f32.c index 0d17d612..f74343d1 100644 --- a/tests/validation/atan_f32.c +++ b/tests/validation/atan_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atan f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_atan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atan(input, output, ¶ms); + if (csinn_atan_init(input, output, params) == CSINN_TRUE) { + csinn_atan(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/atan_i8.c b/tests/validation/atan_i8.c index 72bd32f1..293444d2 100644 --- a/tests/validation/atan_i8.c +++ b/tests/validation/atan_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atan i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + 
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,8 +96,8 @@ int main(int argc, char** argv) // max error: 0.4 for input [-100, 100] float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_atan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atan(input, output, ¶ms); + if (csinn_atan_init(input, output, params) == CSINN_TRUE) { + csinn_atan(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/atan_u8.c b/tests/validation/atan_u8.c index 5c348568..80ca38b8 100644 --- a/tests/validation/atan_u8.c +++ b/tests/validation/atan_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atan u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,8 +96,8 @@ int main(int argc, char** argv) // max error: 0.4 for input [-100, 100] float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_atan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atan(input, output, ¶ms); + if (csinn_atan_init(input, output, params) == CSINN_TRUE) { + csinn_atan(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/atanh_f32.c b/tests/validation/atanh_f32.c index ee0fa35a..74add65e 100644 --- a/tests/validation/atanh_f32.c +++ b/tests/validation/atanh_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atanh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_atanh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atanh(input, output, ¶ms); + if (csinn_atanh_init(input, output, params) == CSINN_TRUE) { + csinn_atanh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/atanh_i8.c b/tests/validation/atanh_i8.c index 36e3e2f0..0c9b8fd1 100644 --- a/tests/validation/atanh_i8.c +++ b/tests/validation/atanh_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atanh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 
+ input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,10 +95,9 @@ int main(int argc, char** argv) output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); // max error: 0.02 for input [-0.9, 0.9] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_atanh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atanh(input, output, ¶ms); + if (csinn_atanh_init(input, output, params) == CSINN_TRUE) { + csinn_atanh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/atanh_u8.c b/tests/validation/atanh_u8.c index 5483eab5..c58a1303 100644 --- a/tests/validation/atanh_u8.c +++ b/tests/validation/atanh_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atanh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,10 +95,9 @@ int main(int argc, char** argv) output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); // max error: 0.02 for input [-0.9, 0.9] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_atanh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atanh(input, output, ¶ms); + if (csinn_atanh_init(input, output, params) == CSINN_TRUE) { + csinn_atanh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool3d_f32.c b/tests/validation/averagepool3d_f32.c index f9b2601b..9ea583c8 100644 --- a/tests/validation/averagepool3d_f32.c +++ b/tests/validation/averagepool3d_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool3d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -46,21 +46,21 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; + params->stride_depth = buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - 
params.count_include_pad = buffer[20]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->count_include_pad = buffer[20]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -69,17 +69,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 21); reference->data = (float *)(buffer + 21 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-5; - if (csi_avgpool3d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool3d(input, output, ¶ms); + if (csinn_avgpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool3d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool3d_i8.c b/tests/validation/averagepool3d_i8.c index 381b55b9..71086029 100644 --- a/tests/validation/averagepool3d_i8.c +++ b/tests/validation/averagepool3d_i8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool3d i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,11 +37,11 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -49,21 +49,21 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; - - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.count_include_pad = buffer[20]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->stride_depth = 
buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; + + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->count_include_pad = buffer[20]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCDHW; @@ -74,39 +74,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCDHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 5; output->dim_count = 5; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 20); - float *ref = (float *)(buffer + 20 + in_size); + float *src_in = (float *)(buffer + 20); + float *ref = (float *)(buffer + 20 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -114,17 +113,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool3d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool3d(input, output, ¶ms); + if (csinn_avgpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool3d(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/averagepool3d_u8.c b/tests/validation/averagepool3d_u8.c index 7692de6b..75624502 100644 --- a/tests/validation/averagepool3d_u8.c +++ b/tests/validation/averagepool3d_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool3d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,11 +37,11 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -49,21 +49,21 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; - - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.count_include_pad = buffer[20]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->stride_depth = 
buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; + + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->count_include_pad = buffer[20]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCDHW; @@ -80,33 +80,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 20); - float *ref = (float *)(buffer + 20 + in_size); + float *src_in = (float *)(buffer + 20); + float *ref = (float *)(buffer + 20 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - 
output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -114,17 +113,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool3d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool3d(input, output, ¶ms); + if (csinn_avgpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool3d(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/averagepool_f32.c b/tests/validation/averagepool_f32.c index 44f7cc9a..552914d0 100644 --- a/tests/validation/averagepool_f32.c +++ b/tests/validation/averagepool_f32.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +62,16 @@ int main(int argc, 
char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 14); reference->data = (float *)(buffer + 14 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool2d(input, output, ¶ms); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_i8.c b/tests/validation/averagepool_i8.c index 2d3762eb..7cf91f0e 100644 --- a/tests/validation/averagepool_i8.c +++ b/tests/validation/averagepool_i8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,26 +37,26 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; 
input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; @@ -67,58 +67,53 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool2d(input, output, ¶ms); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_nchw_f32.c b/tests/validation/averagepool_nchw_f32.c index 4fe417ee..6ce00061 100644 --- a/tests/validation/averagepool_nchw_f32.c +++ b/tests/validation/averagepool_nchw_f32.c @@ -16,44 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + 
params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +62,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 14); reference->data = (float *)(buffer + 14 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool2d(input, output, ¶ms); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_nchw_i8.c b/tests/validation/averagepool_nchw_i8.c index d9389fdd..c5ab6751 100644 --- a/tests/validation/averagepool_nchw_i8.c +++ b/tests/validation/averagepool_nchw_i8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,26 +37,26 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; 
input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; @@ -67,40 +67,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -108,14 +106,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); 
float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_avgpool2d(input, output, &params); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_nchw_u8.c b/tests/validation/averagepool_nchw_u8.c index 4d6b11d6..7c2ef450 100644 --- a/tests/validation/averagepool_nchw_u8.c +++ b/tests/validation/averagepool_nchw_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,26 +37,26 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; -
params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; @@ -67,40 +67,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { 
continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -108,14 +106,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_avgpool2d(input, output, &params); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_u8.c b/tests/validation/averagepool_u8.c index 41df8b2e..36b1a402 100644 --- a/tests/validation/averagepool_u8.c +++ b/tests/validation/averagepool_u8.c @@ -16,20 +16,20 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,26 +37,26 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; 
input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; @@ -73,52 +73,47 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_avgpool2d(input, output, &params); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/batch_norm_f32.c b/tests/validation/batch_norm_f32.c index 9629b576..549e1383 100644 --- a/tests/validation/batch_norm_f32.c +++ b/tests/validation/batch_norm_f32.c @@ -16,24 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *mean = csi_alloc_tensor(NULL); - struct csi_tensor *variance = csi_alloc_tensor(NULL); - struct csi_tensor *beta = csi_alloc_tensor(NULL); - struct csi_tensor *gamma = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct bn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *mean = csinn_alloc_tensor(NULL); + struct csinn_tensor *variance = csinn_alloc_tensor(NULL); + struct csinn_tensor *beta = csinn_alloc_tensor(NULL); + struct csinn_tensor *gamma = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), NULL); int size = 1; int *buffer = read_input_data_f32(argv[1]); @@ -49,25 +49,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; - params.epsilon =
*((float *)buffer + 1 + input->dim_count); - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + params->epsilon = *((float *)buffer + 1 + input->dim_count); + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); - mean->data = (float *)(buffer + 2 + input->dim_count + size); - variance->data = (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); - gamma->data = (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); - beta->data = (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); - reference->data = (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); - output->data = malloc(size * sizeof(float)); + input->data = (float *)(buffer + 2 + input->dim_count); + mean->data = (float *)(buffer + 2 + input->dim_count + size); + variance->data = + (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); + gamma->data = + (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); + beta->data = + (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); + reference->data = + (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); + output->data = malloc(size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-1; - if (csi_batch_normalization_init(input, mean, variance, gamma, beta, output, &params) == CSINN_TRUE) { - csi_batch_normalization(input, mean, variance, gamma, beta, output, &params); + if (csinn_batch_normalization_init(input, mean, variance, gamma, beta, output, params) == + CSINN_TRUE) { + csinn_batch_normalization(input, mean, variance, gamma, beta, output, params); } - result_verify_f32(reference->data, output->data, input->data, difference, size, false); free(buffer); diff --git a/tests/validation/batch_norm_i8.c b/tests/validation/batch_norm_i8.c index bc8a9f0c..e0c7ad91 100644 --- a/tests/validation/batch_norm_i8.c +++ b/tests/validation/batch_norm_i8.c @@ -16,24 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *mean = csi_alloc_tensor(NULL); - struct csi_tensor *variance = csi_alloc_tensor(NULL); - struct csi_tensor *beta = csi_alloc_tensor(NULL); - struct csi_tensor *gamma = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct bn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *mean = csinn_alloc_tensor(NULL); + struct csinn_tensor *variance = csinn_alloc_tensor(NULL); + struct csinn_tensor *beta = csinn_alloc_tensor(NULL); + struct csinn_tensor *gamma = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), NULL); int size = 1; int zp, quantized_multiplier, shift; float max_value,
min_value, scale; @@ -51,15 +51,15 @@ int main(int argc, char** argv) size *= input->dim[i]; } - mean->dim_count = 1; + mean->dim_count = 1; variance->dim_count = 1; - gamma->dim_count = 1; - beta->dim_count = 1; + gamma->dim_count = 1; + beta->dim_count = 1; - mean->dim[0] = input->dim[input->dim_count - 1]; + mean->dim[0] = input->dim[input->dim_count - 1]; variance->dim[0] = input->dim[input->dim_count - 1]; - gamma->dim[0] = input->dim[input->dim_count - 1]; - beta->dim[0] = input->dim[input->dim_count - 1]; + gamma->dim[0] = input->dim[input->dim_count - 1]; + beta->dim[0] = input->dim[input->dim_count - 1]; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; @@ -91,100 +91,103 @@ int main(int argc, char** argv) beta->is_const = 0; beta->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.epsilon = *(float *)&buffer[1 + input->dim_count]; - csi_quantize_multiplier(params.epsilon, &quantized_multiplier, &shift); - params.epsilon_multiplier = quantized_multiplier; - params.epsilon_shift = shift; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *mean_in = (float *)(buffer + 2 + input->dim_count + size); - float *var_in = (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); - float *gamma_in = (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); - float *beta_in = (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); - float *ref = (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); + params->base.layout = CSINN_LAYOUT_NHWC; + params->epsilon = *(float *)&buffer[1 + input->dim_count]; + shl_quantize_multiplier(params->epsilon, &quantized_multiplier, &shift); + params->epsilon_multiplier = quantized_multiplier; + params->epsilon_shift = shift; + + params->base.api = CSINN_API; + + float *src_in = (float 
*)(buffer + 2 + input->dim_count); + float *mean_in = (float *)(buffer + 2 + input->dim_count + size); + float *var_in = + (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); + float *gamma_in = + (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); + float *beta_in = + (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); + float *ref = + (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); int8_t *input_tmp = malloc(size * sizeof(char)); - int8_t *mean_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); - int8_t *var_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + int8_t *mean_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + int8_t *var_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); int8_t *gamma_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); - int8_t *beta_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + int8_t *beta_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / 
fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } mean->data = mean_in; get_quant_info(mean); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - mean_tmp[i] = csi_ref_quantize_f32_to_i8(mean_in[i], mean->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + mean_tmp[i] = shl_ref_quantize_f32_to_i8(mean_in[i], mean->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(mean_in[i], mean->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(mean_in[i], mean->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } variance->data = var_in; get_quant_info(variance); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - var_tmp[i] = csi_ref_quantize_f32_to_i8(var_in[i], variance->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + var_tmp[i] = shl_ref_quantize_f32_to_i8(var_in[i], variance->qinfo); } gamma->data = gamma_in; get_quant_info(gamma); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - gamma_tmp[i] = csi_ref_quantize_f32_to_i8(gamma_in[i], gamma->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + gamma_tmp[i] = shl_ref_quantize_f32_to_i8(gamma_in[i], gamma->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { 
float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(mean_in[i], gamma->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(mean_in[i], gamma->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[2]) { + if (error1 > error[2]) { error[2] = error1; } } @@ -193,23 +196,23 @@ int main(int argc, char** argv) beta->data = beta_in; get_quant_info(beta); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - beta_tmp[i] = csi_ref_quantize_f32_to_i8(beta_in[i], beta->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + beta_tmp[i] = shl_ref_quantize_f32_to_i8(beta_in[i], beta->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(mean_in[i], beta->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(mean_in[i], beta->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[3]) { + if (error1 > error[3]) { error[3] = error1; } } @@ -218,17 +221,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - mean->data = mean_tmp; - variance->data = var_tmp; - gamma->data = gamma_tmp; - beta->data = beta_tmp; 
+ input->data = input_tmp; + mean->data = mean_tmp; + variance->data = var_tmp; + gamma->data = gamma_tmp; + beta->data = beta_tmp; reference->data = ref; - output->data = malloc(size * sizeof(char)); + output->data = malloc(size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_normalization_init(input, mean, variance, gamma, beta, output, &params) == CSINN_TRUE) { - csi_batch_normalization(input, mean, variance, gamma, beta, output, &params); + if (csinn_batch_normalization_init(input, mean, variance, gamma, beta, output, params) == + CSINN_TRUE) { + csinn_batch_normalization(input, mean, variance, gamma, beta, output, params); } result_verify_8(reference->data, output, input->data, difference, size, false); diff --git a/tests/validation/batch_norm_u8.c b/tests/validation/batch_norm_u8.c index 14e0361d..2364ec87 100644 --- a/tests/validation/batch_norm_u8.c +++ b/tests/validation/batch_norm_u8.c @@ -16,24 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *mean = csi_alloc_tensor(NULL); - struct csi_tensor *variance = csi_alloc_tensor(NULL); - struct csi_tensor *beta = csi_alloc_tensor(NULL); - struct csi_tensor *gamma = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct bn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *mean = csinn_alloc_tensor(NULL); + struct csinn_tensor *variance = csinn_alloc_tensor(NULL); + struct csinn_tensor *beta = csinn_alloc_tensor(NULL); + struct csinn_tensor *gamma = csinn_alloc_tensor(NULL); + struct csinn_tensor *output =
csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), NULL); int size = 1; int zp, quantized_multiplier, shift; float max_value, min_value, scale; @@ -51,15 +51,15 @@ int main(int argc, char** argv) size *= input->dim[i]; } - mean->dim_count = 1; + mean->dim_count = 1; variance->dim_count = 1; - gamma->dim_count = 1; - beta->dim_count = 1; + gamma->dim_count = 1; + beta->dim_count = 1; - mean->dim[0] = input->dim[input->dim_count - 1]; + mean->dim[0] = input->dim[input->dim_count - 1]; variance->dim[0] = input->dim[input->dim_count - 1]; - gamma->dim[0] = input->dim[input->dim_count - 1]; - beta->dim[0] = input->dim[input->dim_count - 1]; + gamma->dim[0] = input->dim[input->dim_count - 1]; + beta->dim[0] = input->dim[input->dim_count - 1]; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; @@ -91,100 +91,103 @@ int main(int argc, char** argv) beta->is_const = 0; beta->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.epsilon = *(float *)&buffer[1 + input->dim_count]; - csi_quantize_multiplier(params.epsilon, &quantized_multiplier, &shift); - params.epsilon_multiplier = quantized_multiplier; - params.epsilon_shift = shift; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *mean_in = (float *)(buffer + 2 + input->dim_count + size); - float *var_in = (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); - float *gamma_in = (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); - float *beta_in = (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); - float *ref = (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); + params->base.layout = CSINN_LAYOUT_NHWC; + params->epsilon = *(float 
*)&buffer[1 + input->dim_count]; + shl_quantize_multiplier(params->epsilon, &quantized_multiplier, &shift); + params->epsilon_multiplier = quantized_multiplier; + params->epsilon_shift = shift; + + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *mean_in = (float *)(buffer + 2 + input->dim_count + size); + float *var_in = + (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); + float *gamma_in = + (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); + float *beta_in = + (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); + float *ref = + (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); uint8_t *input_tmp = malloc(size * sizeof(char)); - uint8_t *mean_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); - uint8_t *var_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + uint8_t *mean_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + uint8_t *var_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); uint8_t *gamma_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); - uint8_t *beta_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + uint8_t *beta_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (isinf(src_in[i]) || 
isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } mean->data = mean_in; get_quant_info(mean); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - mean_tmp[i] = csi_ref_quantize_f32_to_u8(mean_in[i], mean->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + mean_tmp[i] = shl_ref_quantize_f32_to_u8(mean_in[i], mean->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(mean_in[i], mean->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(mean_in[i], mean->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } variance->data = var_in; get_quant_info(variance); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - var_tmp[i] = csi_ref_quantize_f32_to_u8(var_in[i], variance->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + var_tmp[i] = shl_ref_quantize_f32_to_u8(var_in[i], variance->qinfo); } gamma->data = gamma_in; get_quant_info(gamma); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - gamma_tmp[i] = csi_ref_quantize_f32_to_u8(gamma_in[i], gamma->qinfo); + for (int i = 0; i < 
input->dim[input->dim_count - 1]; i++) { + gamma_tmp[i] = shl_ref_quantize_f32_to_u8(gamma_in[i], gamma->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(mean_in[i], gamma->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(mean_in[i], gamma->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[2]) { + if (error1 > error[2]) { error[2] = error1; } } @@ -193,23 +196,23 @@ int main(int argc, char** argv) beta->data = beta_in; get_quant_info(beta); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - beta_tmp[i] = csi_ref_quantize_f32_to_u8(beta_in[i], beta->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + beta_tmp[i] = shl_ref_quantize_f32_to_u8(beta_in[i], beta->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(mean_in[i], beta->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(mean_in[i], beta->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[3]) { + 
if (error1 > error[3]) { error[3] = error1; } } @@ -218,17 +221,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - mean->data = mean_tmp; - variance->data = var_tmp; - gamma->data = gamma_tmp; - beta->data = beta_tmp; + input->data = input_tmp; + mean->data = mean_tmp; + variance->data = var_tmp; + gamma->data = gamma_tmp; + beta->data = beta_tmp; reference->data = ref; - output->data = malloc(size * sizeof(char)); + output->data = malloc(size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_normalization_init(input, mean, variance, gamma, beta, output, &params) == CSINN_TRUE) { - csi_batch_normalization(input, mean, variance, gamma, beta, output, &params); + if (csinn_batch_normalization_init(input, mean, variance, gamma, beta, output, params) == + CSINN_TRUE) { + csinn_batch_normalization(input, mean, variance, gamma, beta, output, params); } result_verify_8(reference->data, output, input->data, difference, size, false); @@ -242,4 +246,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/batch_to_space_f32.c b/tests/validation/batch_to_space_f32.c index 1d0be8df..7a0bc511 100644 --- a/tests/validation/batch_to_space_f32.c +++ b/tests/validation/batch_to_space_f32.c @@ -16,39 +16,40 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct batch_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_batch_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_batch_to_space_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //in_batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width - params.block_size = buffer[4]; - params.crop_top = buffer[5]; - params.crop_bottom = buffer[6]; - params.crop_left = buffer[7]; - params.crop_right = buffer[8]; + input->dim[0] = buffer[0]; // in_batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width + params->block_size = buffer[4]; + params->crop_top = buffer[5]; + params->crop_bottom = buffer[6]; + params->crop_left = buffer[7]; + params->crop_right = buffer[8]; - output->dim[0] = input->dim[0] / (params.block_size * params.block_size); + output->dim[0] = input->dim[0] / (params->block_size * params->block_size); output->dim[1] = input->dim[1]; - output->dim[2] = input->dim[2] * params.block_size - params.crop_top - params.crop_bottom; - output->dim[3] = input->dim[3] * params.block_size - params.crop_left - params.crop_right; + output->dim[2] = input->dim[2] * params->block_size - params->crop_top - params->crop_bottom; + 
output->dim[3] = input->dim[3] * params->block_size - params->crop_left - params->crop_right; input->dim_count = 4; output->dim_count = 4; @@ -57,16 +58,15 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 9); reference->data = (float *)(buffer + 9 + in_size); output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_to_space_init(input, output, &params) == CSINN_TRUE) { - csi_batch_to_space(input, output, &params); + if (csinn_batch_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_batch_to_space(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/batch_to_space_i8.c b/tests/validation/batch_to_space_i8.c index b7fb96f9..e2e8df33 100644 --- a/tests/validation/batch_to_space_i8.c +++ b/tests/validation/batch_to_space_i8.c @@ -16,40 +16,41 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct batch_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_batch_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_batch_to_space_params), NULL); int in_size = 0; int out_size = 0; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //in_batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width - params.block_size = buffer[4]; - params.crop_top = buffer[5]; - params.crop_bottom = buffer[6]; - params.crop_left = buffer[7]; - params.crop_right = buffer[8]; - - output->dim[0] = input->dim[0] / (params.block_size * params.block_size); + input->dim[0] = buffer[0]; // in_batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width + params->block_size = buffer[4]; + params->crop_top = buffer[5]; + params->crop_bottom = buffer[6]; + params->crop_left = buffer[7]; + params->crop_right = buffer[8]; + + output->dim[0] = input->dim[0] / (params->block_size * params->block_size); output->dim[1] = input->dim[1]; - output->dim[2] = input->dim[2] * params.block_size - params.crop_top - params.crop_bottom; - output->dim[3] = input->dim[3] * params.block_size - params.crop_left - params.crop_right; + output->dim[2] = input->dim[2] * params->block_size - params->crop_top - 
params->crop_bottom; + output->dim[3] = input->dim[3] * params->block_size - params->crop_left - params->crop_right; input->dim_count = 4; output->dim_count = 4; @@ -58,40 +59,39 @@ int main(int argc, char** argv) input->is_const = 0; input->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 9); - float *ref = (float *)(buffer + 9 + in_size); + float *src_in = (float *)(buffer + 9); + float *ref = (float *)(buffer + 9 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,14 +99,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - 
input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_to_space_init(input, output, ¶ms) == CSINN_TRUE) { - csi_batch_to_space(input, output, ¶ms); + if (csinn_batch_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_batch_to_space(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/batch_to_space_u8.c b/tests/validation/batch_to_space_u8.c index e3f7ebbb..8a8c3d10 100644 --- a/tests/validation/batch_to_space_u8.c +++ b/tests/validation/batch_to_space_u8.c @@ -16,40 +16,41 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct batch_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_batch_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_batch_to_space_params), NULL); int in_size = 0; int out_size = 0; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //in_batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width - params.block_size = buffer[4]; - params.crop_top = buffer[5]; - params.crop_bottom = buffer[6]; - params.crop_left = buffer[7]; - params.crop_right = 
buffer[8]; - - output->dim[0] = input->dim[0] / (params.block_size * params.block_size); + input->dim[0] = buffer[0]; // in_batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width + params->block_size = buffer[4]; + params->crop_top = buffer[5]; + params->crop_bottom = buffer[6]; + params->crop_left = buffer[7]; + params->crop_right = buffer[8]; + + output->dim[0] = input->dim[0] / (params->block_size * params->block_size); output->dim[1] = input->dim[1]; - output->dim[2] = input->dim[2] * params.block_size - params.crop_top - params.crop_bottom; - output->dim[3] = input->dim[3] * params.block_size - params.crop_left - params.crop_right; + output->dim[2] = input->dim[2] * params->block_size - params->crop_top - params->crop_bottom; + output->dim[3] = input->dim[3] * params->block_size - params->crop_left - params->crop_right; input->dim_count = 4; output->dim_count = 4; @@ -65,33 +66,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 9); - float *ref = (float *)(buffer + 9 + in_size); + float *src_in = (float *)(buffer + 9); + float *ref = (float *)(buffer + 9 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ 
+ float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,14 +99,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_to_space_init(input, output, ¶ms) == CSINN_TRUE) { - csi_batch_to_space(input, output, ¶ms); + if (csinn_batch_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_batch_to_space(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/broadcast_to_f32.c b/tests/validation/broadcast_to_f32.c index 7bd718f2..08bb5b72 100644 --- a/tests/validation/broadcast_to_f32.c +++ b/tests/validation/broadcast_to_f32.c @@ -16,60 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of broadcast_to f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct broadcast_to_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_broadcast_to_params *params = + csinn_alloc_params(sizeof(struct csinn_broadcast_to_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - params.shape_count = buffer[1]; + params->shape_count = buffer[1]; output->dim_count = buffer[1]; - for(int i=0; idim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size = in_size * input->dim[i]; } - params.shape = (int *)malloc(params.shape_count * sizeof(int)); + params->shape = (int *)malloc(params->shape_count * sizeof(int)); - for(int i=0; idim[i] = buffer[2 + input->dim_count +i]; + for (int i = 0; i < params->shape_count; i++) { + output->dim[i] = buffer[2 + input->dim_count + i]; out_size = out_size * output->dim[i]; - params.shape[i] = output->dim[i]; + params->shape[i] = output->dim[i]; } input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count + params.shape_count); - reference->data = (float *)(buffer + 2 + input->dim_count + params.shape_count + in_size); + input->data = (float *)(buffer + 2 + input->dim_count + params->shape_count); + reference->data = (float 
*)(buffer + 2 + input->dim_count + params->shape_count + in_size); input->dtype = CSINN_DTYPE_FLOAT32; - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_broadcast_to_init(input, output, ¶ms) == CSINN_TRUE) { - csi_broadcast_to(input, output, ¶ms); + if (csinn_broadcast_to_init(input, output, params) == CSINN_TRUE) { + csinn_broadcast_to(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); free(output->data); - free(params.shape); + free(params->shape); return done_testing(); } diff --git a/tests/validation/broadcast_to_i8.c b/tests/validation/broadcast_to_i8.c index a07fdd49..fe5a5805 100644 --- a/tests/validation/broadcast_to_i8.c +++ b/tests/validation/broadcast_to_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of broadcast_to i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct broadcast_to_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_broadcast_to_params *params = + csinn_alloc_params(sizeof(struct csinn_broadcast_to_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -39,21 +40,20 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - params.shape_count = buffer[1]; + params->shape_count = buffer[1]; 
output->dim_count = buffer[1]; - for(int i=0; idim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size = in_size * input->dim[i]; } - params.shape = (int *)malloc(params.shape_count * sizeof(int)); + params->shape = (int *)malloc(params->shape_count * sizeof(int)); - - for(int i=0; idim[i] = buffer[2 + input->dim_count +i]; + for (int i = 0; i < params->shape_count; i++) { + output->dim[i] = buffer[2 + input->dim_count + i]; out_size = out_size * output->dim[i]; - params.shape[i] = output->dim[i]; + params->shape[i] = output->dim[i]; } input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; @@ -64,33 +64,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count + params.shape_count); - float *ref = (float *)(buffer + 2 + input->dim_count + params.shape_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count + params->shape_count); + float *ref = (float *)(buffer + 2 + input->dim_count + params->shape_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - 
error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,16 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_broadcast_to_init(input, output, ¶ms) == CSINN_TRUE) { - csi_broadcast_to(input, output, ¶ms); + if (csinn_broadcast_to_init(input, output, params) == CSINN_TRUE) { + csinn_broadcast_to(input, output, params); } - + result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/broadcast_to_u8.c b/tests/validation/broadcast_to_u8.c index 733cfe07..f6dd1c97 100644 --- a/tests/validation/broadcast_to_u8.c +++ b/tests/validation/broadcast_to_u8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of broadcast_to u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct broadcast_to_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_broadcast_to_params *params = + csinn_alloc_params(sizeof(struct csinn_broadcast_to_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -39,21 +40,20 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - params.shape_count = buffer[1]; + params->shape_count = buffer[1]; output->dim_count = buffer[1]; - for(int i=0; idim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size = in_size * input->dim[i]; } - params.shape = (int *)malloc(params.shape_count * sizeof(int)); + params->shape = (int *)malloc(params->shape_count * sizeof(int)); - - for(int i=0; idim[i] = buffer[2 + input->dim_count +i]; + for (int i = 0; i < params->shape_count; i++) { + output->dim[i] = buffer[2 + input->dim_count + i]; out_size = out_size * output->dim[i]; - params.shape[i] = output->dim[i]; + params->shape[i] = output->dim[i]; } input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; @@ -64,33 +64,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count + 
params.shape_count); - float *ref = (float *)(buffer + 2 + input->dim_count + params.shape_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count + params->shape_count); + float *ref = (float *)(buffer + 2 + input->dim_count + params->shape_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,16 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_broadcast_to_init(input, output, &params) == CSINN_TRUE) { - csi_broadcast_to(input, output, &params); + if (csinn_broadcast_to_init(input, output, params) == CSINN_TRUE) { + csinn_broadcast_to(input, output, params); } - + result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/ceil_f32.c b/tests/validation/ceil_f32.c index c2110d45..fb063ecd 100644 --- a/tests/validation/ceil_f32.c +++ b/tests/validation/ceil_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ceil f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,17 +49,16 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_ceil_init(input, output, &params) == CSINN_TRUE) { - csi_ceil(input, output, &params); - } + if (csinn_ceil_init(input, output, params) == CSINN_TRUE) { + csinn_ceil(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/ceil_i8.c b/tests/validation/ceil_i8.c index 5d272e46..84cb4eef 100644 --- a/tests/validation/ceil_i8.c +++ b/tests/validation/ceil_i8.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ceil i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] 
= input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,16 +91,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_ceil_init(input, output, &params) == CSINN_TRUE) { - csi_ceil(input, output, &params); - } + if (csinn_ceil_init(input, output, params) == CSINN_TRUE) { + csinn_ceil(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/ceil_u8.c b/tests/validation/ceil_u8.c index 76d4fc56..afbe0380 100644 --- a/tests/validation/ceil_u8.c +++ b/tests/validation/ceil_u8.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ceil u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float 
*src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,16 +91,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_ceil_init(input, output, &params) == CSINN_TRUE) { - csi_ceil(input, output, &params); - } + if (csinn_ceil_init(input, output, params) == CSINN_TRUE) { + csinn_ceil(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/clip_f32.c b/tests/validation/clip_f32.c index 2932932c..c584ce1f 100644 --- a/tests/validation/clip_f32.c +++ b/tests/validation/clip_f32.c @@ -16,29 +16,29 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of clip f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct clip_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -52,18 +52,17 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.min_value = buffer[4]; - params.max_value = buffer[5]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->min_value = buffer[4]; + params->max_value = buffer[5]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_clip_init(input, output, &params) == CSINN_TRUE) { - csi_clip(input, output, &params); + if (csinn_clip_init(input, output, params) == CSINN_TRUE) { + csinn_clip(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/clip_i8.c b/tests/validation/clip_i8.c index db802698..896e7d13 100644 --- a/tests/validation/clip_i8.c +++ b/tests/validation/clip_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of clip i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct clip_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -62,35 +62,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.min_value = buffer[4]; - params.max_value = buffer[5]; - params.base.api = CSINN_API;
- params.base.run_mode = CSINN_RM_LAYER; + params->min_value = buffer[4]; + params->max_value = buffer[5]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,15 +97,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_clip_init(input, output, &params) == CSINN_TRUE) { - csi_clip(input, output, &params); + if (csinn_clip_init(input, output, params) == CSINN_TRUE) { + csinn_clip(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/clip_u8.c b/tests/validation/clip_u8.c index 24b5dad8..810ed13c 100644 --- a/tests/validation/clip_u8.c +++ b/tests/validation/clip_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of clip u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct clip_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -62,36 +62,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; + params->min_value = buffer[4]; + params->max_value = buffer[5]; + params->base.api = CSINN_API; -
params.min_value = buffer[4]; - params.max_value = buffer[5]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +97,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_clip_init(input, output, &params) == CSINN_TRUE) { - csi_clip(input, output, &params); + if (csinn_clip_init(input, output, params) == CSINN_TRUE) { + csinn_clip(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/concat_f32.c b/tests/validation/concat_f32.c index ffd2a41a..057518e0 100644 --- a/tests/validation/concat_f32.c +++ b/tests/validation/concat_f32.c @@ -16,64 +16,63 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - init_testsuite("Testing function of concat f32.\n"); + init_testsuite("Testing function of concat f32.\n"); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - struct concat_params params; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + + params->inputs_count = buffer[4]; - params.inputs_count = buffer[4]; - - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input[params.inputs_count]; + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); } - params.axis = buffer[5]; + params->axis = buffer[5]; output->dim_count = 4; - for(int i = 0; i < output->dim_count; i++) { - if ( i == params.axis ){ - output->dim[i] = params.inputs_count*buffer[i]; - } - else { + for (int i = 0; i < output->dim_count; i++) { + if (i == params->axis) { + output->dim[i] = params->inputs_count * buffer[i];
+ } else { output->dim[i] = buffer[i]; - } + } out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - for(int i = 0; i < params.inputs_count; i++) { - input[i]->data = (float *)(buffer + 6 + in_size * i); - input[i]->dim[0] = buffer[0]; // batch - input[i]->dim[1] = buffer[1]; // height - input[i]->dim[2] = buffer[2]; // width - input[i]->dim[3] = buffer[3]; // channel - input[i]->dim_count = 4; - input[i]->dtype = CSINN_DTYPE_FLOAT32; + for (int i = 0; i < params->inputs_count; i++) { + input[i]->data = (float *)(buffer + 6 + in_size * i); + input[i]->dim[0] = buffer[0]; // batch + input[i]->dim[1] = buffer[1]; // height + input[i]->dim[2] = buffer[2]; // width + input[i]->dim[3] = buffer[3]; // channel + input[i]->dim_count = 4; + input[i]->dtype = CSINN_DTYPE_FLOAT32; } output->dtype = CSINN_DTYPE_FLOAT32; - reference->data = (float *)(buffer + 6 + in_size * params.inputs_count); - output->data = (float *)malloc(out_size * sizeof(float)); + reference->data = (float *)(buffer + 6 + in_size * params->inputs_count); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_concat_init((struct csi_tensor **)input, output, &params) == CSINN_TRUE) { - csi_concat((struct csi_tensor **)input, output, &params); + if (csinn_concat_init((struct csinn_tensor **)input, output, params) == CSINN_TRUE) { + csinn_concat((struct csinn_tensor **)input, output, params); } result_verify_f32(reference->data, output->data, input[0]->data, difference, out_size, false); diff --git a/tests/validation/concat_i8.c b/tests/validation/concat_i8.c index dcf2f197..ba25b6a0 100644 --- a/tests/validation/concat_i8.c +++ b/tests/validation/concat_i8.c @@ -16,72 +16,70 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - init_testsuite("Testing function of concat i8.\n"); + init_testsuite("Testing function of concat i8.\n"); int in_size = 1; int out_size = 1; float error = 0.2f; int *buffer = read_input_data_f32(argv[1]); - - struct concat_params params; - params.inputs_count = buffer[4]; - - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input[params.inputs_count]; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + + params->inputs_count = buffer[4]; + + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); } - float *src_in[params.inputs_count]; - params.axis = buffer[5]; + float *src_in[params->inputs_count]; + params->axis = buffer[5]; output->dim_count = 4; - - for(int i = 0; i < output->dim_count; i++) { - if ( i == params.axis ){ - output->dim[i] = params.inputs_count*buffer[i]; - } - else { + for (int i = 0; i < output->dim_count; i++) { + if (i == params->axis) { + output->dim[i] = params->inputs_count * buffer[i]; + } else { output->dim[i] = buffer[i]; - } + } out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - int8_t *src_tmp[params.inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - src_in[i] = (float *)(buffer + 6 + in_size * i); + int8_t 
*src_tmp[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + src_in[i] = (float *)(buffer + 6 + in_size * i); src_tmp[i] = malloc(in_size * sizeof(char)); - } + } - float *ref = (float *)(buffer + 6 + in_size * params.inputs_count); + float *ref = (float *)(buffer + 6 + in_size * params->inputs_count); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { input[i]->data = src_in[i]; - input[i]->dim[0] = buffer[0]; - input[i]->dim[1] = buffer[1]; - input[i]->dim[2] = buffer[2]; - input[i]->dim[3] = buffer[3]; + input[i]->dim[0] = buffer[0]; + input[i]->dim[1] = buffer[1]; + input[i]->dim[2] = buffer[2]; + input[i]->dim[3] = buffer[3]; input[i]->dim_count = 4; input[i]->dtype = CSINN_DTYPE_INT8; input[i]->layout = CSINN_LAYOUT_NCHW; input[i]->is_const = 0; input[i]->quant_channel = 1; get_quant_info(input[i]); - for(int j = 0; j < in_size; j++) { - src_tmp[i][j] = csi_ref_quantize_f32_to_i8(src_in[i][j], input[i]->qinfo); + for (int j = 0; j < in_size; j++) { + src_tmp[i][j] = shl_ref_quantize_f32_to_i8(src_in[i][j], input[i]->qinfo); } input[i]->data = src_tmp[i]; } @@ -94,18 +92,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - reference->data = ref; - output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); + reference->data = ref; + output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_concat_init((struct csi_tensor **)input, output, &params) == CSINN_TRUE) { - csi_concat((struct csi_tensor **)input, output, &params); + if (csinn_concat_init((struct csinn_tensor **)input, output, params) == CSINN_TRUE) { + csinn_concat((struct csinn_tensor **)input, output, params); } result_verify_8(reference->data, output, input[0]->data, difference, out_size, false); free(buffer); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { free(src_tmp[i]); } free(output->data); diff --git a/tests/validation/concat_u8.c b/tests/validation/concat_u8.c index 8e413b9e..c8b3cbfc 100644 --- a/tests/validation/concat_u8.c +++ b/tests/validation/concat_u8.c @@ -16,72 +16,70 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - init_testsuite("Testing function of concat u8.\n"); + init_testsuite("Testing function of concat u8.\n"); int in_size = 1; int out_size = 1; float error = 0.2f; int *buffer = read_input_data_f32(argv[1]); - - struct concat_params params; - params.inputs_count = buffer[4]; - - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input[params.inputs_count]; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + + params->inputs_count = buffer[4]; + + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); } - float *src_in[params.inputs_count]; - params.axis = buffer[5]; +
float *src_in[params->inputs_count]; + params->axis = buffer[5]; output->dim_count = 4; - - for(int i = 0; i < output->dim_count; i++) { - if ( i == params.axis ){ - output->dim[i] = params.inputs_count*buffer[i]; - } - else { + for (int i = 0; i < output->dim_count; i++) { + if (i == params->axis) { + output->dim[i] = params->inputs_count * buffer[i]; + } else { output->dim[i] = buffer[i]; - } + } out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - uint8_t *src_tmp[params.inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - src_in[i] = (float *)(buffer + 6 + in_size * i); + uint8_t *src_tmp[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + src_in[i] = (float *)(buffer + 6 + in_size * i); src_tmp[i] = malloc(in_size * sizeof(char)); - } + } - float *ref = (float *)(buffer + 6 + in_size * params.inputs_count); + float *ref = (float *)(buffer + 6 + in_size * params->inputs_count); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { input[i]->data = src_in[i]; - input[i]->dim[0] = buffer[0]; - input[i]->dim[1] = buffer[1]; - input[i]->dim[2] = buffer[2]; - input[i]->dim[3] = buffer[3]; + input[i]->dim[0] = buffer[0]; + input[i]->dim[1] = buffer[1]; + input[i]->dim[2] = buffer[2]; + input[i]->dim[3] = buffer[3]; input[i]->dim_count = 4; input[i]->dtype = CSINN_DTYPE_UINT8; input[i]->layout = CSINN_LAYOUT_NCHW; input[i]->is_const = 0; input[i]->quant_channel = 1; get_quant_info(input[i]); - for(int j = 0; j < in_size; j++) { - src_tmp[i][j] = csi_ref_quantize_f32_to_u8(src_in[i][j], input[i]->qinfo); + for (int j = 0; j < in_size; j++) { + src_tmp[i][j] = shl_ref_quantize_f32_to_u8(src_in[i][j], input[i]->qinfo); } input[i]->data = src_tmp[i]; } @@ -94,18 +92,18 @@ int main(int argc, char** argv) output->data = 
ref; get_quant_info(output); - reference->data = ref; - output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); + reference->data = ref; + output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_concat_init((struct csi_tensor **)input, output, &params) == CSINN_TRUE) { - csi_concat((struct csi_tensor **)input, output, &params); + if (csinn_concat_init((struct csinn_tensor **)input, output, params) == CSINN_TRUE) { + csinn_concat((struct csinn_tensor **)input, output, params); } result_verify_8(reference->data, output, input[0]->data, difference, out_size, false); free(buffer); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { free(src_tmp[i]); } free(output->data); diff --git a/tests/validation/convolution3d_f32.c b/tests/validation/convolution3d_f32.c index c46f9ac6..64dbd2e4 100644 --- a/tests/validation/convolution3d_f32.c +++ b/tests/validation/convolution3d_f32.c @@ -16,23 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution3d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; if (argc == 1) { @@ -41,41 +41,41 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[5]; //out_channel - kernel->dim[1] = buffer[1]; //in_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - 
params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.dilation_depth = buffer[21]; - params.dilation_height = buffer[22]; - params.dilation_width = buffer[23]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[5]; // out_channel + kernel->dim[1] = buffer[1]; // in_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->dilation_depth = buffer[21]; + params->dilation_height = buffer[22]; + params->dilation_width = buffer[23]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -86,23 +86,23 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - bias_size = output->dim[1]; - params.base.api = 
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = output->dim[1]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 24); - kernel->data = (float *)(buffer + 24 + in_size); - bias->data = (float *)(buffer + 24 + in_size + weight_size); + input->data = (float *)(buffer + 24); + kernel->data = (float *)(buffer + 24 + in_size); + bias->data = (float *)(buffer + 24 + in_size + weight_size); reference->data = (float *)(buffer + 24 + in_size + weight_size + bias_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv3d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv3d(input, output, kernel, bias, &params); + if (csinn_conv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv3d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/convolution3d_i8.c b/tests/validation/convolution3d_i8.c index 819e2f5b..125e821c 100644 --- a/tests/validation/convolution3d_i8.c +++ b/tests/validation/convolution3d_i8.c @@ -16,23 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution3d i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,41 +45,41 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[5]; //out_channel - kernel->dim[1] = buffer[1]; //in_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - 
params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.dilation_depth = buffer[21]; - params.dilation_height = buffer[22]; - params.dilation_width = buffer[23]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[5]; // out_channel + kernel->dim[1] = buffer[1]; // in_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->dilation_depth = buffer[21]; + params->dilation_height = buffer[22]; + params->dilation_width = buffer[23]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -105,105 +105,101 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - 
bias_size = output->dim[1]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 24); - float *kernel_in = (float *)(buffer + 24 + in_size); - float *bias_in = (float *)(buffer + 24 + in_size + weight_size); - float *ref = (float *)(buffer + 24 + in_size + weight_size + bias_size); + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = output->dim[1]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 24); + float *kernel_in = (float *)(buffer + 24 + in_size); + float *bias_in = (float *)(buffer + 24 + in_size + weight_size); + float *ref = (float *)(buffer + 24 + in_size + weight_size + bias_size); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 
= fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < bias_size; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < bias_size; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = 
shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv3d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv3d(input, output, kernel, bias, ¶ms); + if (csinn_conv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv3d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution3d_u8.c b/tests/validation/convolution3d_u8.c index fe1b3b2d..9bf099c7 100644 --- a/tests/validation/convolution3d_u8.c +++ b/tests/validation/convolution3d_u8.c @@ -16,23 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution3d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,41 +45,41 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[5]; //out_channel - kernel->dim[1] = buffer[1]; //in_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - 
params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.dilation_depth = buffer[21]; - params.dilation_height = buffer[22]; - params.dilation_width = buffer[23]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[5]; // out_channel + kernel->dim[1] = buffer[1]; // in_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->dilation_depth = buffer[21]; + params->dilation_height = buffer[22]; + params->dilation_width = buffer[23]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -105,105 +105,101 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - 
bias_size = output->dim[1]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 24); - float *kernel_in = (float *)(buffer + 24 + in_size); - float *bias_in = (float *)(buffer + 24 + in_size + weight_size); - float *ref = (float *)(buffer + 24 + in_size + weight_size + bias_size); + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = output->dim[1]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 24); + float *kernel_in = (float *)(buffer + 24 + in_size); + float *bias_in = (float *)(buffer + 24 + in_size + weight_size); + float *ref = (float *)(buffer + 24 + in_size + weight_size + bias_size); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - 
error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < bias_size; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < bias_size; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift 
= shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv3d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv3d(input, output, kernel, bias, ¶ms); + if (csinn_conv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv3d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_channel_nchw_i8.c b/tests/validation/convolution_channel_nchw_i8.c index a1766874..8d642f98 100644 --- a/tests/validation/convolution_channel_nchw_i8.c +++ b/tests/validation/convolution_channel_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution channel nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size, per_weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -43,30 +44,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; struct ScaleZp szp[kernel->dim[0]]; input->dim_count = 4; @@ -77,68 +78,65 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_INT8; bias->dtype = CSINN_DTYPE_INT8; output->dtype = CSINN_DTYPE_INT8; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - per_weight_size = input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * 
kernel->dim[3]; + per_weight_size = input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } - for(int i = 0; i < kernel->dim[0]; i++){ - float *kernel_in = (float *)(buffer + 17 + in_size + i*per_weight_size); + for (int i = 0; i < kernel->dim[0]; i++) { + float *kernel_in = (float *)(buffer + 17 + in_size + i * per_weight_size); kernel->qinfo = get_quant_info_i8(kernel_in, per_weight_size); scale2 = kernel->qinfo->scale; zp = kernel->qinfo->zero_point; - for(int j = 0; j < per_weight_size; j++) { - kernel_tmp[i*per_weight_size + j] = csi_ref_quantize_f32_to_i8(kernel_in[j], kernel->qinfo); + for (int j = 0; j < per_weight_size; j++) { + kernel_tmp[i * per_weight_size + j] = + shl_ref_quantize_f32_to_i8(kernel_in[j], kernel->qinfo); } szp[i].zero_point = zp; szp[i].scale = scale2; - } - params.scale_zp = szp; + params->scale_zp = szp; output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = 
input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_channel_nchw_u8.c b/tests/validation/convolution_channel_nchw_u8.c index 3cb5a0d0..af1e93cc 100644 --- a/tests/validation/convolution_channel_nchw_u8.c +++ b/tests/validation/convolution_channel_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution channel nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size, per_weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -43,31 +44,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; - struct csi_scale_zp szp[kernel->dim[0]]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -77,66 +77,64 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_UINT8; bias->dtype = CSINN_DTYPE_UINT8; output->dtype = CSINN_DTYPE_UINT8; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - per_weight_size = input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * 
input->dim[1] * kernel->dim[2] * kernel->dim[3]; + per_weight_size = input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + uint8_t *input_tmp = malloc(in_size * sizeof(char)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - csi_realloc_quant_info(kernel, kernel->dim[0]); + csinn_realloc_quant_info(kernel, kernel->dim[0]); - for(int i = 0; i < kernel->dim[0]; i++) { - float *kernel_in = (float *)(buffer + 17 + in_size + i*per_weight_size); - struct csi_quant_info *qinfo = get_quant_info(kernel_in, per_weight_size); + for (int i = 0; i < kernel->dim[0]; i++) { + float *kernel_in = (float *)(buffer + 17 + in_size + i * per_weight_size); + struct csinn_quant_info *qinfo = get_quant_info(kernel_in, per_weight_size); kernel->qinfo[i].scale = qinfo->scale; kernel->qinfo[i].zero_point = qinfo->zero_point; - for(int j = 0; j < per_weight_size; j++) { - kernel_tmp[i*per_weight_size + j] = csi_ref_quantize_f32_to_u8(kernel_in[j], kernel->qinfo); + for (int j = 0; j < per_weight_size; j++) { + kernel_tmp[i * per_weight_size + j] = + shl_ref_quantize_f32_to_u8(kernel_in[j], kernel->qinfo); } } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - 
output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_f32.c b/tests/validation/convolution_f32.c index 93762a2a..c045e483 100644 --- a/tests/validation/convolution_f32.c +++ b/tests/validation/convolution_f32.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -40,52 +41,51 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = 
buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; - bias->dim_count = 1; + bias->dim_count = 1; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/convolution_i8.c b/tests/validation/convolution_i8.c index 2ad9ed11..36002390 100644 --- a/tests/validation/convolution_i8.c +++ b/tests/validation/convolution_i8.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width -
input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; 
- params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,59 
+135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = 
malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_nchw_f32.c b/tests/validation/convolution_nchw_f32.c index ea4f5cb6..efe8bd6a 100644 --- a/tests/validation/convolution_nchw_f32.c +++ b/tests/validation/convolution_nchw_f32.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -40,30 +41,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = 
buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -75,23 +76,21 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = malloc(out_size * sizeof(float)); - + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/convolution_nchw_i8.c b/tests/validation/convolution_nchw_i8.c index 66c143c3..cf2e681f 100644 --- a/tests/validation/convolution_nchw_i8.c +++ b/tests/validation/convolution_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; //
width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,95 +93,90 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = 
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > 
error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/convolution_nchw_u8.c b/tests/validation/convolution_nchw_u8.c index 132332cb..1b5c87a5 100644 --- a/tests/validation/convolution_nchw_u8.c +++ b/tests/validation/convolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width -
kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,94 +94,89 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + uint8_t *input_tmp = malloc(in_size * sizeof(char)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { 
error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/convolution_relu6_i8.c b/tests/validation/convolution_relu6_i8.c index befadf58..07ab15b0 100644 --- a/tests/validation/convolution_relu6_i8.c +++ b/tests/validation/convolution_relu6_i8.c @@ -16,57 +16,57 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] =
buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,105 +92,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { 
error[0] = error1; } } - - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - 
output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu6_nchw_i8.c b/tests/validation/convolution_relu6_nchw_i8.c index db52ae8d..cb02a008 100644 --- a/tests/validation/convolution_relu6_nchw_i8.c +++ b/tests/validation/convolution_relu6_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu6 nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,104 +94,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in 
= (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, &params); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu6_nchw_u8.c b/tests/validation/convolution_relu6_nchw_u8.c index d6d15bf8..8c849c5d 100644 --- a/tests/validation/convolution_relu6_nchw_u8.c +++ b/tests/validation/convolution_relu6_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu6 nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float 
max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,41 +94,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - 
error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +136,56 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = 
quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, &params); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu6_u8.c b/tests/validation/convolution_relu6_u8.c index febbbcf8..8b2dba58 100644 --- a/tests/validation/convolution_relu6_u8.c +++ b/tests/validation/convolution_relu6_u8.c @@ -16,57 +16,57 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu6 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,43 +92,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp 
= malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); - + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,59 +134,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, &params); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu_i8.c b/tests/validation/convolution_relu_i8.c index 92d80e65..3075d5a3 100644 --- a/tests/validation/convolution_relu_i8.c +++ b/tests/validation/convolution_relu_i8.c @@ -16,57 +16,57 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, 
scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -91,105 +91,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + 
output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu_nchw_i8.c b/tests/validation/convolution_relu_nchw_i8.c index c715d944..6e617449 100644 --- a/tests/validation/convolution_relu_nchw_i8.c +++ b/tests/validation/convolution_relu_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,104 +94,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in 
= (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu_nchw_u8.c b/tests/validation/convolution_relu_nchw_u8.c index 72f6465a..3c48baa9 100644 --- a/tests/validation/convolution_relu_nchw_u8.c +++ b/tests/validation/convolution_relu_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value,
min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,104 +94,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + uint8_t *input_tmp = malloc(in_size * sizeof(char)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] 
-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); 
output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu_u8.c b/tests/validation/convolution_relu_u8.c index a6e0a0c5..f9f8a33a 100644 --- a/tests/validation/convolution_relu_u8.c +++ b/tests/validation/convolution_relu_u8.c @@ -16,57 +16,57 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -91,105 +91,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float 
*)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + uint8_t *input_tmp = malloc(in_size * sizeof(char)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_u8.c b/tests/validation/convolution_u8.c index bb2fc183..d06f6758 100644 --- a/tests/validation/convolution_u8.c +++ b/tests/validation/convolution_u8.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float
error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,43 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - 
error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -138,59 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = 
quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/cos_f32.c b/tests/validation/cos_f32.c index c8dcede9..c0e64534 100644 --- a/tests/validation/cos_f32.c +++ b/tests/validation/cos_f32.c @@ -16,26 +16,26 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cos f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_cos_init(input, output, &params) == CSINN_TRUE) { - csi_cos(input, output, &params); + if (csinn_cos_init(input, output, params) == CSINN_TRUE) { + csinn_cos(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/cos_i8.c b/tests/validation/cos_i8.c index 941a640b..68f8f96f 100644 --- a/tests/validation/cos_i8.c +++ b/tests/validation/cos_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cos i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 +
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,9 +96,8 @@ int main(int argc, char** argv) // max error:0.018 for input [-3.14, 3.14] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_cos_init(input, output, &params) == CSINN_TRUE) { - csi_cos(input, output, &params); + if (csinn_cos_init(input, output, params) == CSINN_TRUE) { + csinn_cos(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cos_u8.c b/tests/validation/cos_u8.c index de00bd6d..04e5a37b 100644 --- a/tests/validation/cos_u8.c +++ b/tests/validation/cos_u8.c @@ -16,20 +16,20 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cos u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,9 +96,8 @@ int main(int argc, char** argv) // max error:0.018 for input [-3.14, 3.14] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_cos_init(input, output, &params) == CSINN_TRUE) { - csi_cos(input, output, &params); + if (csinn_cos_init(input, output, params) == CSINN_TRUE) { + csinn_cos(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cosh_f32.c b/tests/validation/cosh_f32.c index 122de8f1..2563b118 100644 --- a/tests/validation/cosh_f32.c +++ b/tests/validation/cosh_f32.c @@ -16,26 +16,26 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cosh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_cosh_init(input, output, &params) == CSINN_TRUE) { - csi_cosh(input, output, &params); + if (csinn_cosh_init(input, output, params) == CSINN_TRUE) { + csinn_cosh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/cosh_i8.c b/tests/validation/cosh_i8.c index 3ab765ef..d75ea659 100644 --- a/tests/validation/cosh_i8.c +++ b/tests/validation/cosh_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cosh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -55,8 +55,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float
*)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,8 +96,8 @@ int main(int argc, char** argv) // max error: 0.2 for input [-5, 5] float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cosh_init(input, output, &params) == CSINN_TRUE) { - csi_cosh(input, output, &params); + if (csinn_cosh_init(input, output, params) == CSINN_TRUE) { + csinn_cosh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cosh_u8.c b/tests/validation/cosh_u8.c index a63eb9f7..e7e30a29 100644 --- a/tests/validation/cosh_u8.c +++ b/tests/validation/cosh_u8.c @@ -16,20 +16,20 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cosh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,8 +96,8 @@ int main(int argc, char** argv) // max error: 0.2 for input [-5, 5] float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cosh_init(input, output, &params) == CSINN_TRUE) { - csi_cosh(input, output, &params); + if (csinn_cosh_init(input, output, params) == CSINN_TRUE) { + csinn_cosh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cumprod_f32.c b/tests/validation/cumprod_f32.c index 7458b2a9..3b634711 100644 --- a/tests/validation/cumprod_f32.c +++ b/tests/validation/cumprod_f32.c @@ -16,36 +16,37 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumprod f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumprod_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumprod_params *params = + csinn_alloc_params(sizeof(struct csinn_cumprod_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -53,17 +54,16 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 
? atof(argv[2]) : 0.9; - if (csi_cumprod_init(input, output, &params) == CSINN_TRUE) { - csi_cumprod(input, output, &params); + if (csinn_cumprod_init(input, output, params) == CSINN_TRUE) { + csinn_cumprod(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/cumprod_i8.c b/tests/validation/cumprod_i8.c index 424a55a7..937a8739 100644 --- a/tests/validation/cumprod_i8.c +++ b/tests/validation/cumprod_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumprod i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumprod_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumprod_params *params = + csinn_alloc_params(sizeof(struct csinn_cumprod_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,18 +38,18 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] *
input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -60,35 +61,33 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -96,14 +95,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_cumprod_init(input, output, &params) == CSINN_TRUE) { - csi_cumprod(input, output, &params); + if (csinn_cumprod_init(input, output, params) == CSINN_TRUE) { + csinn_cumprod(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cumprod_u8.c b/tests/validation/cumprod_u8.c index 1bd713a1..fc41e805 100644 --- a/tests/validation/cumprod_u8.c +++ b/tests/validation/cumprod_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumprod u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumprod_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumprod_params *params = + csinn_alloc_params(sizeof(struct csinn_cumprod_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,18 +38,18 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis =
buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -60,35 +61,33 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -96,14 +95,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_cumprod_init(input, output, &params) == CSINN_TRUE) { - csi_cumprod(input, output, &params); + if (csinn_cumprod_init(input, output, params) == CSINN_TRUE) { + csinn_cumprod(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cumsum_f32.c b/tests/validation/cumsum_f32.c index fa1a6044..c2662802 100644 --- a/tests/validation/cumsum_f32.c +++ b/tests/validation/cumsum_f32.c @@ -16,36 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumsum f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumsum_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumsum_params *params = + csinn_alloc_params(sizeof(struct csinn_cumsum_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size =
in_size; @@ -53,21 +54,20 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cumsum_init(input, output, &params) == CSINN_TRUE) { - csi_cumsum(input, output, &params); + if (csinn_cumsum_init(input, output, params) == CSINN_TRUE) { + csinn_cumsum(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); - + free(buffer); free(output->data); return done_testing(); diff --git a/tests/validation/cumsum_i8.c b/tests/validation/cumsum_i8.c index 2fddd428..143581f7 100644 --- a/tests/validation/cumsum_i8.c +++ b/tests/validation/cumsum_i8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumsum i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumsum_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumsum_params *params = + csinn_alloc_params(sizeof(struct csinn_cumsum_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,18 +38,18 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -64,34 +65,33 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = 
malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,18 +99,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cumsum_init(input, output, &params) == CSINN_TRUE) { - csi_cumsum(input, output, &params); + if (csinn_cumsum_init(input, output, params) == CSINN_TRUE) { + csinn_cumsum(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); - + free(buffer); free(src_tmp); free(output->data); diff --git a/tests/validation/cumsum_u8.c b/tests/validation/cumsum_u8.c index 22cbd59b..74acfdda 100644 --- a/tests/validation/cumsum_u8.c +++ b/tests/validation/cumsum_u8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumsum u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumsum_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumsum_params *params = + csinn_alloc_params(sizeof(struct csinn_cumsum_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,18 +38,18 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -63,35 +64,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + float *src_in = (float *)(buffer + 6); + float *ref = (float 
*)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,18 +99,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cumsum_init(input, output, &params) == CSINN_TRUE) { - csi_cumsum(input, output, &params); + if (csinn_cumsum_init(input, output, params) == CSINN_TRUE) { + csinn_cumsum(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); - + free(buffer); free(src_tmp); free(output->data); diff --git a/tests/validation/deconvolution3d_f32.c b/tests/validation/deconvolution3d_f32.c index 96b755f6..df0f63cf 100644 --- a/tests/validation/deconvolution3d_f32.c +++ b/tests/validation/deconvolution3d_f32.c @@ -16,23 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution3d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; if (argc == 1) { @@ -41,45 +41,45 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[1]; //in_channel - kernel->dim[1] = buffer[5]; //out_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; // out_channel - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - params.pad_right = 
buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.out_pad_depth = buffer[21]; - params.out_pad_height = buffer[22]; - params.out_pad_width = buffer[23]; - - params.dilation_depth = buffer[24]; - params.dilation_height = buffer[25]; - params.dilation_width = buffer[26]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[1]; // in_channel + kernel->dim[1] = buffer[5]; // out_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; // out_channel + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->out_pad_depth = buffer[21]; + params->out_pad_height = buffer[22]; + params->out_pad_width = buffer[23]; + + params->dilation_depth = buffer[24]; + params->dilation_height = buffer[25]; + params->dilation_width = buffer[26]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -90,25 +90,23 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * 
input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = bias->dim[0]; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - bias_size = bias->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - input->data = (float *)(buffer + 27); - kernel->data = (float *)(buffer + 27 + in_size); - bias->data = (float *)(buffer + 27 + in_size + weight_size); + input->data = (float *)(buffer + 27); + kernel->data = (float *)(buffer + 27 + in_size); + bias->data = (float *)(buffer + 27 + in_size + weight_size); reference->data = (float *)(buffer + 27 + in_size + weight_size + bias_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_deconv3d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_deconv3d(input, output, kernel, bias, &params); + if (csinn_deconv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv3d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/deconvolution3d_u8.c b/tests/validation/deconvolution3d_u8.c index 7cbc7947..ca65e22a 100644 --- a/tests/validation/deconvolution3d_u8.c +++ b/tests/validation/deconvolution3d_u8.c @@ -16,23 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution3d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; float scale, scale1, scale2, scale3; @@ -42,45 +42,45 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[1]; //in_channel - kernel->dim[1] = buffer[5]; //out_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; // out_channel - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - 
params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.out_pad_depth = buffer[21]; - params.out_pad_height = buffer[22]; - params.out_pad_width = buffer[23]; - - params.dilation_depth = buffer[24]; - params.dilation_height = buffer[25]; - params.dilation_width = buffer[26]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[1]; // in_channel + kernel->dim[1] = buffer[5]; // out_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; // out_channel + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->out_pad_depth = buffer[21]; + params->out_pad_height = buffer[22]; + params->out_pad_width = buffer[23]; + + params->dilation_depth = buffer[24]; + params->dilation_height = buffer[25]; + params->dilation_width = buffer[26]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -106,59 +106,55 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * 
input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - bias_size = bias->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - - float *src_in = (float *)(buffer + 27); - float *kernel_in = (float *)(buffer + 27 + in_size); - float *bias_in = (float *)(buffer + 27 + in_size + weight_size); - float *ref = (float *)(buffer + 27 + in_size + weight_size + bias_size); + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = bias->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 27); + float *kernel_in = (float *)(buffer + 27 + in_size); + float *bias_in = (float *)(buffer + 27 + in_size + weight_size); + float *ref = (float *)(buffer + 27 + in_size + weight_size + bias_size); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); - + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + 
kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } scale = scale1 * scale2; - for(int i = 0; i < bias_size; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + for (int i = 0; i < bias_size; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_deconv3d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_deconv3d(input, output, kernel, bias, &params); + if (csinn_deconv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv3d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/deconvolution_f32.c b/tests/validation/deconvolution_f32.c index 499333ef..25b06f5a 100644 --- a/tests/validation/deconvolution_f32.c +++ b/tests/validation/deconvolution_f32.c @@ -16,54 +16,54 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nhwc f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[14]; // o - kernel->dim[1] = buffer[6]; // h - kernel->dim[2] = buffer[7]; // w - kernel->dim[3] = buffer[3]; // i - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[14]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // 
batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[14]; // o + kernel->dim[1] = buffer[6]; // h + kernel->dim[2] = buffer[7]; // w + kernel->dim[3] = buffer[3]; // i + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[14]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -73,23 +73,21 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; input->data = (float *)(buffer + 17); kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - if (csi_deconv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, &params); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/deconvolution_i8.c b/tests/validation/deconvolution_i8.c index 1aae0eab..136b20df 100644 --- a/tests/validation/deconvolution_i8.c +++ b/tests/validation/deconvolution_i8.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nhwc i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] =
buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[14]; // o - kernel->dim[1] = buffer[6]; // h - kernel->dim[2] = buffer[7]; // w - kernel->dim[3] = buffer[3]; // i - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[14]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[14]; // o + kernel->dim[1] = buffer[6]; // h + kernel->dim[2] = buffer[7]; // w + kernel->dim[3] = buffer[3]; // i + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[14]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -79,7 +79,7 @@ int main(int argc, char** argv) input->quant_channel = 1; kernel->dtype = CSINN_DTYPE_INT8; - //kernel->layout = CSINN_LAYOUT_OHWI; + // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; @@ -93,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] 
-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,59 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, 
&shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_deconv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, &params); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/deconvolution_nchw_f32.c b/tests/validation/deconvolution_nchw_f32.c index eff93163..4cc86908 100644 --- a/tests/validation/deconvolution_nchw_f32.c +++ b/tests/validation/deconvolution_nchw_f32.c @@ -16,54 +16,54 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // 
batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] = buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -73,23 +73,21 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 
+ in_size + weight_size + output->dim[1]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - if (csi_deconv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, &params); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); @@ -98,4 +96,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/deconvolution_nchw_i8.c b/tests/validation/deconvolution_nchw_i8.c index 45090777..7ba3563b 100644 --- a/tests/validation/deconvolution_nchw_i8.c +++ b/tests/validation/deconvolution_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,29 
+45,29 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] = buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = 
input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { error1 = fabs(src_in[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + 
output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_deconv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, &params); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); @@ -197,4 +195,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/deconvolution_nchw_u8.c b/tests/validation/deconvolution_nchw_u8.c index e9dd2105..91e89352 100644 --- a/tests/validation/deconvolution_nchw_u8.c +++ b/tests/validation/deconvolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,29 +45,29 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout 
= CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] = buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { error1 = fabs(src_in[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], 
kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_deconv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, &params); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); @@ -197,4 +195,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/deconvolution_u8.c b/tests/validation/deconvolution_u8.c index 0e4ac548..6ee6b80f 100644 --- a/tests/validation/deconvolution_u8.c +++ b/tests/validation/deconvolution_u8.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nhwc u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; 
int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[14]; // o - kernel->dim[1] = buffer[6]; // h - kernel->dim[2] = buffer[7]; // w - kernel->dim[3] = buffer[3]; // i - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[14]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[14]; // o + kernel->dim[1] = buffer[6]; // h + kernel->dim[2] = buffer[7]; // w + kernel->dim[3] = buffer[3]; // i + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[14]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -79,7 +79,7 @@ int main(int 
argc, char** argv) input->quant_channel = 1; kernel->dtype = CSINN_DTYPE_UINT8; - //kernel->layout = CSINN_LAYOUT_OHWI; + // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; @@ -93,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i 
= 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,59 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = 
(int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_deconv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, &params); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depth_to_space_f32.c b/tests/validation/depth_to_space_f32.c index 9022ddf1..c269d2cc 100644 --- a/tests/validation/depth_to_space_f32.c +++ b/tests/validation/depth_to_space_f32.c @@ -16,36 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct depth_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_depth_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] / (params.block_size * params.block_size); - output->dim[2] = input->dim[2] * params.block_size; - output->dim[3] = input->dim[3] * params.block_size; + output->dim[1] = input->dim[1] / (params->block_size * params->block_size); + output->dim[2] = input->dim[2] * params->block_size; + output->dim[3] = input->dim[3] * params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -54,16 +55,15 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_depth_to_space_init(input, output, &params) == CSINN_TRUE) { - csi_depth_to_space(input, output, &params); + if (csinn_depth_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_depth_to_space(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/depth_to_space_i8.c b/tests/validation/depth_to_space_i8.c index 00c28046..1726be71 100644 --- a/tests/validation/depth_to_space_i8.c +++ b/tests/validation/depth_to_space_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct depth_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_depth_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; 
// in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] / (params.block_size * params.block_size); - output->dim[2] = input->dim[2] * params.block_size; - output->dim[3] = input->dim[3] * params.block_size; + output->dim[1] = input->dim[1] / (params->block_size * params->block_size); + output->dim[2] = input->dim[2] * params->block_size; + output->dim[3] = input->dim[3] * params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -64,33 +65,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] 
+ 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,15 +98,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_depth_to_space_init(input, output, &params) == CSINN_TRUE) { - csi_depth_to_space(input, output, &params); + if (csinn_depth_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_depth_to_space(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/depth_to_space_u8.c b/tests/validation/depth_to_space_u8.c index cddaaebc..09670e49 100644 --- a/tests/validation/depth_to_space_u8.c +++ b/tests/validation/depth_to_space_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct depth_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_depth_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = 
buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] / (params.block_size * params.block_size); - output->dim[2] = input->dim[2] * params.block_size; - output->dim[3] = input->dim[3] * params.block_size; + output->dim[1] = input->dim[1] / (params->block_size * params->block_size); + output->dim[2] = input->dim[2] * params->block_size; + output->dim[3] = input->dim[3] * params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -64,33 +65,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] 
-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,15 +98,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_depth_to_space_init(input, output, &params) == CSINN_TRUE) { - csi_depth_to_space(input, output, &params); + if (csinn_depth_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_depth_to_space(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_convolution_f32.c b/tests/validation/depthwise_convolution_f32.c index 8fbd1a00..4cbd5261 100644 --- a/tests/validation/depthwise_convolution_f32.c +++ b/tests/validation/depthwise_convolution_f32.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nhwc f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -41,31 +42,30 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = 1; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + 
input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = 1; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; @@ -76,23 +76,20 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_convolution_i8.c b/tests/validation/depthwise_convolution_i8.c index e7e43b2a..d957c80c 100644 --- a/tests/validation/depthwise_convolution_i8.c +++ b/tests/validation/depthwise_convolution_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - 
input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_INT8; + input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_INT8; + kernel->dtype = CSINN_DTYPE_INT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = 
CSINN_DTYPE_INT8; + bias->dtype = CSINN_DTYPE_INT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { 
float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } 
output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_nchw_f32.c b/tests/validation/depthwise_convolution_nchw_f32.c index 5678dea4..00ed7f0a 100644 --- a/tests/validation/depthwise_convolution_nchw_f32.c +++ b/tests/validation/depthwise_convolution_nchw_f32.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -41,34 +42,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + 
input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -79,24 +79,22 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_nchw_i8.c b/tests/validation/depthwise_convolution_nchw_i8.c index f3089733..0213462b 100644 --- a/tests/validation/depthwise_convolution_nchw_i8.c +++ b/tests/validation/depthwise_convolution_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - 
input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * 
kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > 
error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,52 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_nchw_u8.c b/tests/validation/depthwise_convolution_nchw_u8.c index 1d225355..e287b5bd 100644 --- a/tests/validation/depthwise_convolution_nchw_u8.c +++ b/tests/validation/depthwise_convolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = 
buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * 
kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if 
(error1 > error[0]) { error[0] = error1; } } @@ -141,53 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu6_i8.c b/tests/validation/depthwise_convolution_relu6_i8.c index 5e1f831a..54e8d3fe 100644 --- a/tests/validation/depthwise_convolution_relu6_i8.c +++ b/tests/validation/depthwise_convolution_relu6_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - 
input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_INT8; + input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_INT8; + kernel->dtype = CSINN_DTYPE_INT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; 
kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_INT8; + bias->dtype = CSINN_DTYPE_INT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; 
i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + 
bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu6_nchw_i8.c b/tests/validation/depthwise_convolution_relu6_nchw_i8.c index 2be157b6..02359edf 100644 --- a/tests/validation/depthwise_convolution_relu6_nchw_i8.c +++ b/tests/validation/depthwise_convolution_relu6_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6 nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,51 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu6_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, &params); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu6_nchw_u8.c b/tests/validation/depthwise_convolution_relu6_nchw_u8.c index a8e04516..688c4e4d 100644 --- a/tests/validation/depthwise_convolution_relu6_nchw_u8.c +++ b/tests/validation/depthwise_convolution_relu6_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6 nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,49 +140,46 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu6_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, &params); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_convolution_relu6_u8.c b/tests/validation/depthwise_convolution_relu6_u8.c index 5b8b656f..9a3a5819 100644 --- a/tests/validation/depthwise_convolution_relu6_u8.c +++ b/tests/validation/depthwise_convolution_relu6_u8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_UINT8; + input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_UINT8; + kernel->dtype = CSINN_DTYPE_UINT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_UINT8; + bias->dtype = CSINN_DTYPE_UINT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_UINT8; + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int 
main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * 
sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu6_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, &params); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu_i8.c b/tests/validation/depthwise_convolution_relu_i8.c index d0102fe8..b18854aa 100644 --- a/tests/validation/depthwise_convolution_relu_i8.c +++ b/tests/validation/depthwise_convolution_relu_i8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_INT8; + input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_INT8; + kernel->dtype = CSINN_DTYPE_INT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_INT8; + bias->dtype = CSINN_DTYPE_INT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int main(int 
argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * 
sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu_nchw_i8.c b/tests/validation/depthwise_convolution_relu_nchw_i8.c index 0c4bddec..586d66b0 100644 --- a/tests/validation/depthwise_convolution_relu_nchw_i8.c +++ b/tests/validation/depthwise_convolution_relu_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,52 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu_nchw_u8.c b/tests/validation/depthwise_convolution_relu_nchw_u8.c index b1609629..2905fa2c 100644 --- a/tests/validation/depthwise_convolution_relu_nchw_u8.c +++ b/tests/validation/depthwise_convolution_relu_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,52 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu_u8.c b/tests/validation/depthwise_convolution_relu_u8.c index c8e9e832..9c66068f 100644 --- a/tests/validation/depthwise_convolution_relu_u8.c +++ b/tests/validation/depthwise_convolution_relu_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_UINT8; + input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_UINT8; + kernel->dtype = CSINN_DTYPE_UINT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_UINT8; + bias->dtype = CSINN_DTYPE_UINT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_UINT8; + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int 
main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * 
sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_u8.c b/tests/validation/depthwise_convolution_u8.c index 96241f47..eca0a9f5 100644 --- a/tests/validation/depthwise_convolution_u8.c +++ b/tests/validation/depthwise_convolution_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,90 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout 
= CSINN_LAYOUT_NHWC; - params.group = buffer[3]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_UINT8; + input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_UINT8; + kernel->dtype = CSINN_DTYPE_UINT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_UINT8; + bias->dtype = CSINN_DTYPE_UINT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_UINT8; + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float 
*src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +136,57 @@ int main(int argc, char** argv) 
get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data 
= malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_deconvolution_f32.c b/tests/validation/depthwise_deconvolution_f32.c index edff7e40..fb960f48 100644 --- a/tests/validation/depthwise_deconvolution_f32.c +++ b/tests/validation/depthwise_deconvolution_f32.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, 
weight_size; if (argc == 1) { @@ -40,57 +41,55 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = 1; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = 1; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_FLOAT32; + input->dtype = CSINN_DTYPE_FLOAT32; kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; - 
output->dtype = CSINN_DTYPE_FLOAT32; + output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_deconvolution_i8.c b/tests/validation/depthwise_deconvolution_i8.c index b06cf6fd..7c88b60d 100644 --- a/tests/validation/depthwise_deconvolution_i8.c +++ b/tests/validation/depthwise_deconvolution_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,36 +45,36 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = 1; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - 
params.group = buffer[3]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = 1; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_INT8; + input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; @@ -88,46 +89,45 @@ int main(int argc, char** argv) bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = 
kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +136,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = 
shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_deconvolution_nchw_f32.c b/tests/validation/depthwise_deconvolution_nchw_f32.c index 5811f3b8..dd801bd3 100644 --- a/tests/validation/depthwise_deconvolution_nchw_f32.c +++ b/tests/validation/depthwise_deconvolution_nchw_f32.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ 
-41,34 +42,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -79,24 +79,22 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = 
input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_deconvolution_nchw_u8.c b/tests/validation/depthwise_deconvolution_nchw_u8.c index 8f7eee18..2334017c 100644 --- a/tests/validation/depthwise_deconvolution_nchw_u8.c +++ b/tests/validation/depthwise_deconvolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,56 +98,52 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } - scale = scale1 * scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_deconvolution_u8.c b/tests/validation/depthwise_deconvolution_u8.c index 884655a4..d3b42200 100644 --- a/tests/validation/depthwise_deconvolution_u8.c +++ b/tests/validation/depthwise_deconvolution_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,36 +45,36 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - 
input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = 1; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = 1; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_UINT8; + input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; @@ -88,46 +89,45 @@ int main(int argc, char** argv) bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_UINT8; + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; 
output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || 
isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +136,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; 
+ shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/dequantize_f32.c b/tests/validation/dequantize_f32.c index e946e95d..cb85f5d3 100644 --- a/tests/validation/dequantize_f32.c +++ b/tests/validation/dequantize_f32.c @@ -16,43 +16,43 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" -#include "csi_c860.h" #include "math_snr.h" +#include "shl_c860.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of dequantize f32.\n"); - struct csi_tensor *it = csi_alloc_tensor(NULL); + struct csinn_tensor *it = csinn_alloc_tensor(NULL); float *input, *output, *reference; int in_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; int *buffer = read_input_data_f32(argv[1]); - in_size = buffer[0]; + in_size = buffer[0]; - input = (float *)(buffer + 1); - reference = malloc(in_size * sizeof(float)); - output = malloc(in_size * sizeof(float)); + input = (float *)(buffer + 1); + reference = malloc(in_size * sizeof(float)); + output = malloc(in_size * sizeof(float)); uint8_t *input_tmp = malloc(in_size * sizeof(char)); find_min_max(input, &max_value, &min_value, in_size); get_scale_and_zp(max_value, min_value, &scale, &zp); - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); it->data = input; get_quant_info(it); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input[i], it->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input[i], it->qinfo); } - for(int i = 0; i < in_size; i++) { - reference[i] = csi_ref_dequantize_u8_to_f32(input_tmp[i], it->qinfo); + for (int i = 0; i < in_size; i++) { + reference[i] = shl_ref_dequantize_u8_to_f32(input_tmp[i], it->qinfo); } - csi_dequantize_f32_c860(input_tmp, output, -it->qinfo->zero_point, it->qinfo->multiplier, + shl_c860_dequantize_f32(input_tmp, output, -it->qinfo->zero_point, it->qinfo->multiplier, it->qinfo->shift, in_size); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; diff --git a/tests/validation/div_f32.c b/tests/validation/div_f32.c index eac3b2f4..461a7d76 100644 --- a/tests/validation/div_f32.c +++ b/tests/validation/div_f32.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of div f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -53,17 +53,17 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - 
input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(out_size * sizeof(float)); - float difference = argc > 2 ? atof(argv[2]) : 0.9;; + output->data = malloc(out_size * sizeof(float)); + float difference = argc > 2 ? atof(argv[2]) : 0.9; + ; - if (csi_div_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_div(input0, input1, output, ¶ms); + if (csinn_div_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_div(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/div_i8.c b/tests/validation/div_i8.c index eaeaab41..c6148d19 100644 --- a/tests/validation/div_i8.c +++ b/tests/validation/div_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of div i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,11 +38,11 @@ int main(int argc, char** argv) float max_error; int 
*buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,36 +68,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - 
if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -105,23 +104,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -130,17 +129,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_div_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_div(input0, input1, output, ¶ms); + if (csinn_div_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_div(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/div_u8.c b/tests/validation/div_u8.c index a0b374dc..4ef3c12a 100644 --- a/tests/validation/div_u8.c +++ b/tests/validation/div_u8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of div u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,11 +38,11 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + 
input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,36 +68,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -105,23 +104,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; 
i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -130,17 +129,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_div_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_div(input0, input1, output, ¶ms); + if (csinn_div_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_div(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/elu_f32.c b/tests/validation/elu_f32.c index d8b19a9e..988853b0 100644 --- a/tests/validation/elu_f32.c +++ b/tests/validation/elu_f32.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -50,16 +50,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_elu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_elu(input, output, ¶ms); + if (csinn_elu_init(input, output, params) == CSINN_TRUE) { + csinn_elu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/elu_i8.c b/tests/validation/elu_i8.c index cba90917..9519cff1 100644 --- a/tests/validation/elu_i8.c +++ b/tests/validation/elu_i8.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -56,35 +56,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_elu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_elu(input, output, ¶ms); + if (csinn_elu_init(input, output, params) == CSINN_TRUE) { + csinn_elu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/elu_u8.c b/tests/validation/elu_u8.c index eb545673..35e2f20f 100644 --- a/tests/validation/elu_u8.c +++ b/tests/validation/elu_u8.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -59,33 +59,30 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; 
reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_elu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_elu(input, output, ¶ms); + if (csinn_elu_init(input, output, params) == CSINN_TRUE) { + csinn_elu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/equal_f32.c b/tests/validation/equal_f32.c index 0acea4af..e549d57d 100644 --- a/tests/validation/equal_f32.c +++ b/tests/validation/equal_f32.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of equal f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1; int out_size = 1; @@ -38,7 +38,7 @@ int main(int argc, char** argv) input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for (int i = 0; i < input0->dim_count; i++ ) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; input1->dim[i] = input0->dim[i]; output->dim[i] = input0->dim[i]; @@ -49,17 +49,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; 
output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - output->data = malloc(out_size * sizeof(float)); + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_equal(input0, input1, output, ¶ms); + if (csinn_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_equal(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/equal_i8.c b/tests/validation/equal_i8.c index 29527fef..e950b47f 100644 --- a/tests/validation/equal_i8.c +++ b/tests/validation/equal_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of equal i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -42,7 +42,7 @@ int main(int argc, char** argv) input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for (int i = 0; i < input0->dim_count; i++ ) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; input1->dim[i] = input0->dim[i]; output->dim[i] = input0->dim[i]; @@ -65,35 +65,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; int8_t *input_tmp0 = malloc(in_size * sizeof(char)); int8_t *input_tmp1 = malloc(in_size * sizeof(char)); - float *src_in0 = (float *)(buffer + 1 + input0->dim_count); - float *src_in1 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in0 = (float *)(buffer + 1 + input0->dim_count); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + 
input0->dim_count + 2 * in_size); input0->data = src_in0; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - input_tmp0[i] = csi_ref_quantize_f32_to_i8(src_in0[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp0[i] = shl_ref_quantize_f32_to_i8(src_in0[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp0[i], input0->qinfo); - if(src_in0[i] == INFINITY && output_tmp == INFINITY || src_in0[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp0[i], input0->qinfo); + if (src_in0[i] == INFINITY && output_tmp == INFINITY || + src_in0[i] == NAN && output_tmp == NAN) { continue; } else { error1 = fabs(src_in0[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in0[i] - output_tmp)/fabs(src_in0[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in0[i] - output_tmp) / fabs(src_in0[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -101,23 +100,24 @@ int main(int argc, char** argv) input1->data = src_in1; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - input_tmp1[i] = csi_ref_quantize_f32_to_i8(src_in1[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp1[i] = shl_ref_quantize_f32_to_i8(src_in1[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp1[i], input1->qinfo); - if(src_in1[i] == INFINITY && output_tmp == INFINITY || input_tmp1[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp1[i], input1->qinfo); + if (src_in1[i] == INFINITY && output_tmp == INFINITY || + input_tmp1[i] == NAN && output_tmp == NAN) { continue; } else { error1 = fabs(src_in1[i] - output_tmp); - 
if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -127,16 +127,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = input_tmp0; - input1->data = input_tmp1; + input0->data = input_tmp0; + input1->data = input_tmp1; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_equal(input0, input1, output, ¶ms); + if (csinn_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/equal_u8.c b/tests/validation/equal_u8.c index 57c6b54d..ee57794d 100644 --- a/tests/validation/equal_u8.c +++ b/tests/validation/equal_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of equal u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -42,7 +42,7 @@ int main(int argc, char** argv) input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for (int i = 0; i < input0->dim_count; i++ ) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; input1->dim[i] = input0->dim[i]; output->dim[i] = input0->dim[i]; @@ -56,7 +56,6 @@ int main(int argc, char** argv) input0->is_const = 0; input0->quant_channel = 1; - input1->dtype = CSINN_DTYPE_UINT8; input1->layout = CSINN_LAYOUT_NCHW; input1->is_const = 0; @@ -66,36 +65,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; uint8_t *input_tmp0 = malloc(in_size * sizeof(char)); uint8_t *input_tmp1 = malloc(in_size * sizeof(char)); - float *src_in0 = (float *)(buffer + 1 + input0->dim_count); - float *src_in1 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 
2*in_size); + float *src_in0 = (float *)(buffer + 1 + input0->dim_count); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); input0->data = src_in0; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - input_tmp0[i] = csi_ref_quantize_f32_to_u8(src_in0[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp0[i] = shl_ref_quantize_f32_to_u8(src_in0[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp0[i], input0->qinfo); - if(src_in0[i] == INFINITY && output_tmp == INFINITY || src_in0[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp0[i], input0->qinfo); + if (src_in0[i] == INFINITY && output_tmp == INFINITY || + src_in0[i] == NAN && output_tmp == NAN) { continue; } else { error1 = fabs(src_in0[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in0[i] - output_tmp)/fabs(src_in0[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in0[i] - output_tmp) / fabs(src_in0[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -103,23 +101,24 @@ int main(int argc, char** argv) input1->data = src_in1; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - input_tmp1[i] = csi_ref_quantize_f32_to_u8(src_in1[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp1[i] = shl_ref_quantize_f32_to_u8(src_in1[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp1[i], input1->qinfo); - if(src_in1[i] == INFINITY && output_tmp == INFINITY || input_tmp1[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp1[i], 
input1->qinfo); + if (src_in1[i] == INFINITY && output_tmp == INFINITY || + input_tmp1[i] == NAN && output_tmp == NAN) { continue; } else { error1 = fabs(src_in1[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -129,16 +128,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = input_tmp0; - input1->data = input_tmp1; + input0->data = input_tmp0; + input1->data = input_tmp1; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_equal_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_equal(input0, input1, output, &params); + if (csinn_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/erf_f32.c b/tests/validation/erf_f32.c index 8d216e12..8abfe16c 100644 --- a/tests/validation/erf_f32.c +++ b/tests/validation/erf_f32.c @@ -16,26 +16,26 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of erf f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_erf_init(input, output, &params) == CSINN_TRUE) { - csi_erf(input, output, &params); + if (csinn_erf_init(input, output, params) == CSINN_TRUE) { + csinn_erf(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/erf_i8.c b/tests/validation/erf_i8.c index 6c85ea58..d219f1d2 100644 --- a/tests/validation/erf_i8.c +++ b/tests/validation/erf_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of erf i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -55,34 +55,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer +
1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,14 +88,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_erf_init(input, output, &params) == CSINN_TRUE) { - csi_erf(input, output, &params); + if (csinn_erf_init(input, output, params) == CSINN_TRUE) { + csinn_erf(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/erf_u8.c b/tests/validation/erf_u8.c index eab7ef2f..af05edfd 100644 --- a/tests/validation/erf_u8.c +++ b/tests/validation/erf_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of erf u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -55,34 +55,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 +
input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,14 +88,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_erf_init(input, output, &params) == CSINN_TRUE) { - csi_erf(input, output, &params); + if (csinn_erf_init(input, output, params) == CSINN_TRUE) { + csinn_erf(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/exp_f32.c b/tests/validation/exp_f32.c index ea32e20f..f606730d 100644 --- a/tests/validation/exp_f32.c +++ b/tests/validation/exp_f32.c @@ -16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of exp f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count +
in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_exp_init(input, output, &params) == CSINN_TRUE) { - csi_exp(input, output, &params); + if (csinn_exp_init(input, output, params) == CSINN_TRUE) { + csinn_exp(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/exp_i8.c b/tests/validation/exp_i8.c index 6e4a0806..a5275e56 100644 --- a/tests/validation/exp_i8.c +++ b/tests/validation/exp_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of exp i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,35 +54,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - -
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,14 +88,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_exp_init(input, output, &params) == CSINN_TRUE) { - csi_exp(input, output, &params); + if (csinn_exp_init(input, output, params) == CSINN_TRUE) { + csinn_exp(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/exp_u8.c b/tests/validation/exp_u8.c index b34219d3..18b0f69b 100644 --- a/tests/validation/exp_u8.c +++ b/tests/validation/exp_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of exp u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,35 +54,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); -
float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,14 +88,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_exp_init(input, output, &params) == CSINN_TRUE) { - csi_exp(input, output, &params); + if (csinn_exp_init(input, output, params) == CSINN_TRUE) { + csinn_exp(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/expand_dims_f32.c b/tests/validation/expand_dims_f32.c index ab9804ee..1e0f7f78 100644 --- a/tests/validation/expand_dims_f32.c +++ b/tests/validation/expand_dims_f32.c @@ -16,37 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expand_dims f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct expand_dims_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_expand_dims_params *params = + csinn_alloc_params(sizeof(struct csinn_expand_dims_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); int dim_count = buffer[0]; int axis = buffer[1]; - for(int i = 0; i < dim_count; i++) { + for (int i = 0; i < dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } input->dim_count = dim_count; - output->dim_count = input->dim_count + 1; // axis is 0-D scalar + output->dim_count = input->dim_count + 1; // axis is 0-D scalar - for(int i = 0; i < output->dim_count; i++) { - if(i < axis) { + for (int i = 0; i < output->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } else if(i == axis) { + } else if (i == axis) { output->dim[i] = 1; } else { output->dim[i] =
input->dim[i - 1]; @@ -56,16 +57,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 2 + dim_count); reference->data = (float *)(buffer + 2 + dim_count + in_size); output->data = (float *)malloc(sizeof(float) * out_size); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expand_dims_init(input, output, &params) == CSINN_TRUE) { - csi_expand_dims(input, output, &params); + if (csinn_expand_dims_init(input, output, params) == CSINN_TRUE) { + csinn_expand_dims(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/expand_dims_i8.c b/tests/validation/expand_dims_i8.c index b1380387..bb0e7480 100644 --- a/tests/validation/expand_dims_i8.c +++ b/tests/validation/expand_dims_i8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expand_dims i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct expand_dims_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_expand_dims_params *params = + csinn_alloc_params(sizeof(struct csinn_expand_dims_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -39,17 +40,17 @@ int main(int argc, char** argv) int dim_count = buffer[0]; int axis = buffer[1]; - for(int i = 0; i < dim_count; i++) { + for (int i = 0; i < dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } input->dim_count = dim_count; output->dim_count = input->dim_count + 1; - for(int i = 0; i < output->dim_count; i++) { - if(i < axis) { + for (int i = 0; i < output->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } else if(i == axis) { + } else if (i == axis) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i - 1]; @@ -65,52 +66,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + dim_count); - float *ref = (float *)(buffer + 2 + dim_count + in_size); + float *src_in = (float *)(buffer + 2 + dim_count); + float *ref = (float *)(buffer + 2 + dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); 
input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expand_dims_init(input, output, &params) == CSINN_TRUE) { - csi_expand_dims(input, output, &params); + if (csinn_expand_dims_init(input, output, params) == CSINN_TRUE) { + csinn_expand_dims(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/expand_dims_u8.c b/tests/validation/expand_dims_u8.c index 98410380..165a25a1 100644 --- a/tests/validation/expand_dims_u8.c +++ b/tests/validation/expand_dims_u8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expand_dims u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct expand_dims_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_expand_dims_params *params = + csinn_alloc_params(sizeof(struct csinn_expand_dims_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -39,17 +40,17 @@ int main(int argc, char** argv) int dim_count = buffer[0]; int axis = buffer[1]; - for(int i = 0; i < dim_count; i++) { + for (int i = 0; i < dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } input->dim_count = dim_count; output->dim_count = input->dim_count + 1; - for(int i = 0; i < output->dim_count; i++) { - if(i < axis) { + for (int i = 0; i < output->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } else if(i == axis) { + } else if (i == axis) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i - 1]; @@ -65,52 +66,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + dim_count); - float *ref = (float *)(buffer + 2 + dim_count + in_size); + float *src_in = (float *)(buffer + 2 + dim_count); + float *ref = (float *)(buffer + 2 + dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); 
input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expand_dims_init(input, output, &params) == CSINN_TRUE) { - csi_expand_dims(input, output, &params); + if (csinn_expand_dims_init(input, output, params) == CSINN_TRUE) { + csinn_expand_dims(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/expm1_f32.c b/tests/validation/expm1_f32.c index bfde48b4..58406d73 100644 --- a/tests/validation/expm1_f32.c +++ b/tests/validation/expm1_f32.c @@ -16,27 +16,26 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expm1 f32. \n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -45,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_expm1_init(input, output, &params) == CSINN_TRUE) { - csi_expm1(input, output, &params); + if (csinn_expm1_init(input, output, params) == CSINN_TRUE) { + csinn_expm1(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/expm1_i8.c b/tests/validation/expm1_i8.c index b53f8dea..1d7cced9 100644 --- a/tests/validation/expm1_i8.c +++ b/tests/validation/expm1_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expm1 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,54 +52,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float
*src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expm1_init(input, output, &params) == CSINN_TRUE) { - csi_expm1(input, output, &params); + if (csinn_expm1_init(input, output, params) == CSINN_TRUE) { + csinn_expm1(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/expm1_u8.c b/tests/validation/expm1_u8.c index da5dcf7e..c50fe140 100644 --- a/tests/validation/expm1_u8.c +++ b/tests/validation/expm1_u8.c @@ -16,27 +16,27 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expm1 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,54 +52,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < 
in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expm1_init(input, output, &params) == CSINN_TRUE) { - csi_expm1(input, output, &params); + if (csinn_expm1_init(input, output, params) == CSINN_TRUE) { + csinn_expm1(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/flatten_f32.c b/tests/validation/flatten_f32.c index a77c4c1b..a5605509 100644 --- a/tests/validation/flatten_f32.c +++ b/tests/validation/flatten_f32.c @@ -16,25 +16,26 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct flatten_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -44,16 +45,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_flatten_init(input, output, &params) == CSINN_TRUE) { - csi_flatten(input, output, &params); + if (csinn_flatten_init(input, output, params) == CSINN_TRUE) { + csinn_flatten(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/flatten_i8.c b/tests/validation/flatten_i8.c index db7d1d48..e9c12cf6 100644 --- a/tests/validation/flatten_i8.c +++ b/tests/validation/flatten_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct flatten_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -54,54 +55,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref =
(float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_flatten_init(input, output, &params) == CSINN_TRUE) { - csi_flatten(input, output, &params); + if (csinn_flatten_init(input, output, params) == CSINN_TRUE) { + csinn_flatten(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/flatten_u8.c b/tests/validation/flatten_u8.c index a6ce4266..9468231b 100644 --- a/tests/validation/flatten_u8.c +++ b/tests/validation/flatten_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct flatten_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -54,54 +55,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float
*ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_flatten_init(input, output, &params) == CSINN_TRUE) { - csi_flatten(input, output, &params); + if (csinn_flatten_init(input, output, params) == CSINN_TRUE) { + csinn_flatten(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/floor_div_f32.c b/tests/validation/floor_div_f32.c index 70390391..99dae89b 100644 --- a/tests/validation/floor_div_f32.c +++ b/tests/validation/floor_div_f32.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor div f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = input0->dim[0];
output->dim[1] = input0->dim[1]; @@ -54,17 +54,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_floor_divide_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_floor_divide(input0, input1, output, &params); + if (csinn_floor_divide_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_divide(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/floor_div_i8.c b/tests/validation/floor_div_i8.c index c23f86e6..acbcddda 100644 --- a/tests/validation/floor_div_i8.c +++ b/tests/validation/floor_div_i8.c @@ -16,21 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor div i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -40,10 +40,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -69,58 +69,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float 
*)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = (int8_t *)malloc(in_size * sizeof(int8_t)); int8_t *src1_tmp = (int8_t *)malloc(in_size * sizeof(int8_t)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 
1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -130,15 +129,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); + output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_floor_divide_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_floor_divide(input0, input1, output, &params); + if (csinn_floor_divide_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_divide(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_div_u8.c b/tests/validation/floor_div_u8.c index 20f6831b..b30e0255 100644 --- a/tests/validation/floor_div_u8.c +++ b/tests/validation/floor_div_u8.c @@ -16,21 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor div u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -40,10 +40,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -69,59 +69,58 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = 
(float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = (uint8_t *)malloc(in_size * sizeof(uint8_t)); uint8_t *src1_tmp = (uint8_t *)malloc(in_size * sizeof(uint8_t)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - 
output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -131,15 +130,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); + output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_floor_divide_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_floor_divide(input0, input1, output, &params); + if (csinn_floor_divide_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_divide(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_f32.c b/tests/validation/floor_f32.c index 43afe982..7e5fa2e4 100644 --- a/tests/validation/floor_f32.c +++ b/tests/validation/floor_f32.c @@ -16,27 +16,27 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,17 +49,16 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_floor_init(input, output, &params) == CSINN_TRUE) { - csi_floor(input, output, &params); - } + if (csinn_floor_init(input, output, params) == CSINN_TRUE) { + csinn_floor(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/floor_i8.c b/tests/validation/floor_i8.c index 833f376f..e5ccc635 100644 --- a/tests/validation/floor_i8.c +++ b/tests/validation/floor_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,56 +57,51 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * 
input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_floor_init(input, output, &params) == CSINN_TRUE) { - csi_floor(input, output, &params); - } + if (csinn_floor_init(input, output, params) == CSINN_TRUE) { + csinn_floor(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/floor_mod_f32.c b/tests/validation/floor_mod_f32.c index 7bf814c9..a359a0e6 100644 --- a/tests/validation/floor_mod_f32.c +++ b/tests/validation/floor_mod_f32.c @@ -16,37 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor mod f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width - + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width - - input1->dim[0] = input1->dim[0] = 
buffer[0]; // batch - input1->dim[1] = input1->dim[1] = buffer[1]; // channel - input1->dim[2] = input1->dim[2] = buffer[2]; // height - input1->dim[3] = input1->dim[3] = buffer[3]; // width + input1->dim[0] = input1->dim[0] = buffer[0]; // batch + input1->dim[1] = input1->dim[1] = buffer[1]; // channel + input1->dim[2] = input1->dim[2] = buffer[2]; // height + input1->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -61,17 +59,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_floor_mod_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_floor_mod(input0, input1, output, &params); + if (csinn_floor_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_mod(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_mod_i8.c b/tests/validation/floor_mod_i8.c index 661f9593..1ffb4b1c 100644 --- a/tests/validation/floor_mod_i8.c +++ b/tests/validation/floor_mod_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor mod i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,20 +38,17 @@ int main(int argc, char** argv) float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width - - - input1->dim[0] = input1->dim[0] = buffer[0]; // batch - input1->dim[1] = input1->dim[1] = buffer[1]; // channel - input1->dim[2] = input1->dim[2] = buffer[2]; // height - input1->dim[3] = input1->dim[3] = buffer[3]; // width + input1->dim[0] = input1->dim[0] = buffer[0]; // batch + input1->dim[1] = input1->dim[1] = buffer[1]; // channel + input1->dim[2] = input1->dim[2] = buffer[2]; // height + input1->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = 
input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -77,61 +74,59 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], 
input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -140,18 +135,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_floor_mod_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_floor_mod(input0, input1, output, &params); + if (csinn_floor_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_mod(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_mod_u8.c b/tests/validation/floor_mod_u8.c index fba11778..88765d01 100644 --- a/tests/validation/floor_mod_u8.c +++ b/tests/validation/floor_mod_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor mod u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,20 +38,17 @@ int main(int argc, char** argv) float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width - - - input1->dim[0] = input1->dim[0] = buffer[0]; // batch - input1->dim[1] = input1->dim[1] = buffer[1]; // channel - input1->dim[2] = input1->dim[2] = buffer[2]; // height - input1->dim[3] = input1->dim[3] = buffer[3]; // width + input1->dim[0] = input1->dim[0] = buffer[0]; // batch + input1->dim[1] = input1->dim[1] = buffer[1]; // channel + input1->dim[2] = input1->dim[2] = buffer[2]; // height + input1->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = 
input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -77,61 +74,59 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = 
shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -140,18 +135,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_floor_mod_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_floor_mod(input0, input1, output, &params); + if (csinn_floor_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_mod(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_u8.c b/tests/validation/floor_u8.c index 4258448c..18ea1bf6 100644 --- a/tests/validation/floor_u8.c +++ b/tests/validation/floor_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,57 +57,52 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + 
for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; + if (csinn_floor_init(input, output, params) == CSINN_TRUE) { + csinn_floor(input, output, params); + } - if (csi_floor_init(input, output, &params) == CSINN_TRUE) { - csi_floor(input, output, &params); - } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); free(src_tmp); diff --git a/tests/validation/fullyconnected_f32.c b/tests/validation/fullyconnected_f32.c index 0b1e1737..4351dd9a 100644 --- a/tests/validation/fullyconnected_f32.c +++ b/tests/validation/fullyconnected_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *weight = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct fc_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_size - weight->dim[0] = buffer[2]; // out_size - weight->dim[1] = buffer[1]; // in_size - bias->dim[0] = buffer[2]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_size + weight->dim[0] = buffer[2]; // out_size + weight->dim[1] = buffer[1]; // in_size + bias->dim[0] = buffer[2]; output->dim[0] = buffer[0]; output->dim[1] = buffer[2]; - input->dim_count = 2; + input->dim_count = 2; weight->dim_count = 2; - bias->dim_count = 1; + bias->dim_count = 1; output->dim_count = 2; in_size0 = input->dim[0] * input->dim[1]; in_size1 = weight->dim[0] * weight->dim[1]; out_size = output->dim[0] * output->dim[1]; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3); - weight->data = (float *)(buffer + 3 
+ in_size0); - bias->data = (float *)(buffer + 3 + in_size0 + in_size1); + input->data = (float *)(buffer + 3); + weight->data = (float *)(buffer + 3 + in_size0); + bias->data = (float *)(buffer + 3 + in_size0 + in_size1); reference->data = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); - output->data = malloc(out_size * sizeof(float)); - float difference = argc > 2 ? atof(argv[2]) : 0.9;; + output->data = malloc(out_size * sizeof(float)); + float difference = argc > 2 ? atof(argv[2]) : 0.9; + ; - if (csi_fullyconnected_init(input, output, weight, bias, &params) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, &params); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/fullyconnected_i8.c b/tests/validation/fullyconnected_i8.c index 26036b08..112db978 100644 --- a/tests/validation/fullyconnected_i8.c +++ b/tests/validation/fullyconnected_i8.c @@ -16,38 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *weight = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct fc_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); int in_size0, in_size1, out_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; - int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_size - weight->dim[0] = buffer[2]; // out_size - weight->dim[1] = buffer[1]; // in_size - bias->dim[0] = buffer[2]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_size + weight->dim[0] = buffer[2]; // out_size + weight->dim[1] = buffer[1]; // in_size + bias->dim[0] = buffer[2]; output->dim[0] = buffer[0]; output->dim[1] = buffer[2]; - input->dim_count = 2; + input->dim_count = 2; weight->dim_count = 2; - bias->dim_count = 1; + bias->dim_count = 1; output->dim_count = 2; in_size0 = input->dim[0] * input->dim[1]; in_size1 = weight->dim[0] * weight->dim[1]; @@ -71,14 +70,12 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NC; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 3); - float *weight_in = (float *)(buffer + 3 + in_size0); - float *bias_in = (float *)(buffer + 3 + in_size0 + in_size1); - float *ref = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); + float *src_in = (float *)(buffer + 3); + float *weight_in = (float *)(buffer + 3 + in_size0); + float *bias_in = (float *)(buffer + 3 + in_size0 + in_size1); + float *ref = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); int8_t *input_tmp = malloc(in_size0 * sizeof(char)); int8_t *weight_tmp = malloc(in_size1 * sizeof(char)); @@ -88,49 +85,47 @@ int main(int argc, char** argv) get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size0; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size0; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } weight->data = weight_in; get_quant_info(weight); scale2 = weight->qinfo->scale; - for(int i = 0; i < in_size1; i++) { - weight_tmp[i] = csi_ref_quantize_f32_to_i8(weight_in[i], weight->qinfo); + for (int i = 0; i < in_size1; i++) { + weight_tmp[i] = shl_ref_quantize_f32_to_i8(weight_in[i], weight->qinfo); } - - - scale=scale1*scale2; - for(int i = 0; i < buffer[2]; i++) { - bias_tmp[i] = (int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < buffer[2]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - weight->data = weight_tmp; - bias->data = bias_tmp; + input->data 
= input_tmp; + weight->data = weight_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-3; - if (csi_fullyconnected_init(input, output, weight, bias, &params) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, &params); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/fullyconnected_u8.c b/tests/validation/fullyconnected_u8.c index 3948d1f3..f65e5c0a 100644 --- a/tests/validation/fullyconnected_u8.c +++ b/tests/validation/fullyconnected_u8.c @@ -16,38 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *weight = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct fc_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); int in_size0, in_size1, out_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; - int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_size - weight->dim[0] = buffer[2]; // out_size - weight->dim[1] = buffer[1]; // in_size - bias->dim[0] = buffer[2]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_size + weight->dim[0] = buffer[2]; // out_size + weight->dim[1] = buffer[1]; // in_size + bias->dim[0] = buffer[2]; output->dim[0] = buffer[0]; output->dim[1] = buffer[2]; - input->dim_count = 2; + input->dim_count = 2; weight->dim_count = 2; - bias->dim_count = 1; + bias->dim_count = 1; output->dim_count = 2; in_size0 = input->dim[0] * input->dim[1]; in_size1 = weight->dim[0] * weight->dim[1]; @@ -56,13 +55,13 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NC; input->is_const = 0; input->quant_channel = 1; - + weight->dtype = CSINN_DTYPE_UINT8; weight->layout = 
CSINN_LAYOUT_OI; weight->is_const = 1; weight->quant_channel = 1; - bias->dtype = CSINN_DTYPE_UINT8; + bias->dtype = CSINN_DTYPE_UINT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 1; bias->quant_channel = 1; @@ -71,14 +70,12 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NC; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 3); - float *weight_in = (float *)(buffer + 3 + in_size0); - float *bias_in = (float *)(buffer + 3 + in_size0 + in_size1); - float *ref = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); + float *src_in = (float *)(buffer + 3); + float *weight_in = (float *)(buffer + 3 + in_size0); + float *bias_in = (float *)(buffer + 3 + in_size0 + in_size1); + float *ref = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); uint8_t *input_tmp = malloc(in_size0 * sizeof(char)); uint8_t *weight_tmp = malloc(in_size1 * sizeof(char)); @@ -88,41 +85,38 @@ int main(int argc, char** argv) get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size0; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size0; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } weight->data = weight_in; get_quant_info(weight); scale2 = weight->qinfo->scale; - for(int i = 0; i < in_size1; i++) { - weight_tmp[i] = csi_ref_quantize_f32_to_u8(weight_in[i], weight->qinfo); + for (int i = 0; i < in_size1; i++) { + weight_tmp[i] = shl_ref_quantize_f32_to_u8(weight_in[i], weight->qinfo); } - - - scale=scale1*scale2; - for(int i = 0; i < buffer[2]; i++) { - bias_tmp[i] = (int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < buffer[2]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - weight->data = weight_tmp; - bias->data 
= bias_tmp; + input->data = input_tmp; + weight->data = weight_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-3; - if (csi_fullyconnected_init(input, output, weight, bias, ¶ms) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, ¶ms); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/gather_f32.c b/tests/validation/gather_f32.c index 0f65cfe6..95b2a17a 100644 --- a/tests/validation/gather_f32.c +++ b/tests/validation/gather_f32.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_params), NULL); int in_size = 1, indices_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); int axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 
2]; in_size *= input->dim[i]; } indices->dim_count = buffer[2 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[3 + input->dim_count + i]; indices_size *= indices->dim[i]; } @@ -68,18 +69,18 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; indices->dtype = CSINN_DTYPE_INT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; + params->base.api = CSINN_API; + params->axis = axis; - input->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count); - indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); - reference->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); - output->data = (float *)malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count); + indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); + reference->data = + (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_gather_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather(input, indices, output, ¶ms); + if (csinn_gather_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather(input, indices, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/gather_i8.c b/tests/validation/gather_i8.c index a8d44c3b..27e20869 100644 --- a/tests/validation/gather_i8.c +++ b/tests/validation/gather_i8.c @@ -16,21 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_params), NULL); int in_size = 1, indices_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -39,13 +40,13 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } indices->dim_count = buffer[2 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[3 + input->dim_count + i]; indices_size *= indices->dim[i]; } @@ -73,35 +74,35 @@ int main(int argc, char** argv) indices->dtype = CSINN_DTYPE_INT32; output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; + params->base.api = CSINN_API; + params->axis = axis; - float *src_in = (float *)(buffer + 3 + input->dim_count + indices->dim_count); - indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); 
- float *ref = (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); + float *src_in = (float *)(buffer + 3 + input->dim_count + indices->dim_count); + indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); + float *ref = + (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,16 +110,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_gather_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather(input, indices, output, ¶ms); + if (csinn_gather_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather(input, indices, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/gather_nd_f32.c b/tests/validation/gather_nd_f32.c index 57873852..8ef751cb 100644 --- a/tests/validation/gather_nd_f32.c +++ b/tests/validation/gather_nd_f32.c @@ -16,35 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather_nd f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_nd_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_nd_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_nd_params), NULL); int in_size = 1, out_size = 1, indices_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 0; // init output->dim_count = 0 - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } indices->dim_count = buffer[1 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[i + 2 + input->dim_count]; 
indices_size *= indices->dim[i]; - if(i < indices->dim_count - 1) { + if (i < indices->dim_count - 1) { output->dim_count++; output->dim[i] = indices->dim[i]; } @@ -56,7 +57,7 @@ int main(int argc, char** argv) indices_outer_size = indices_size / indices->dim[indices->dim_count - 1]; int input_inner_size = 1; - for(int i = axis; i < input->dim_count; i++) { + for (int i = axis; i < input->dim_count; i++) { input_inner_size *= input->dim[i]; output->dim[output->dim_count] = input->dim[i]; output->dim_count++; @@ -65,17 +66,17 @@ int main(int argc, char** argv) out_size = indices_outer_size * input_inner_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); - input->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); - reference->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); + reference->data = + (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_gather_nd_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather_nd(input, indices, output, ¶ms); + if (csinn_gather_nd_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather_nd(input, indices, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/gather_nd_i8.c b/tests/validation/gather_nd_i8.c index f2775fa7..c3dc2bf0 100644 --- a/tests/validation/gather_nd_i8.c +++ b/tests/validation/gather_nd_i8.c @@ -16,21 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather_nd i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_nd_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_nd_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_nd_params), NULL); int in_size = 1, out_size = 1, indices_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -39,15 +40,15 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 0; // init output->dim_count = 0 - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } indices->dim_count = buffer[1 + input->dim_count]; - for(int i = 0; i < 
indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[i + 2 + input->dim_count]; indices_size *= indices->dim[i]; - if(i < indices->dim_count - 1) { + if (i < indices->dim_count - 1) { output->dim_count++; output->dim[i] = indices->dim[i]; } @@ -59,7 +60,7 @@ int main(int argc, char** argv) indices_outer_size = indices_size / indices->dim[indices->dim_count - 1]; int input_inner_size = 1; - for(int i = axis; i < input->dim_count; i++) { + for (int i = axis; i < input->dim_count; i++) { input_inner_size *= input->dim[i]; output->dim[output->dim_count] = input->dim[i]; output->dim_count++; @@ -70,39 +71,39 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); - float *src_in = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); - float *ref = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); + indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); + float *src_in = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); + float *ref = + (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float 
error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -110,15 +111,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_gather_nd_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather_nd(input, indices, output, ¶ms); + if (csinn_gather_nd_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather_nd(input, indices, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/gather_nd_u8.c b/tests/validation/gather_nd_u8.c index c9d0bce1..bd7080df 100644 --- a/tests/validation/gather_nd_u8.c +++ b/tests/validation/gather_nd_u8.c @@ -16,21 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather_nd u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_nd_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_nd_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_nd_params), NULL); int in_size = 1, out_size = 1, indices_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -39,15 +40,15 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 0; // init output->dim_count = 0 - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } indices->dim_count = buffer[1 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[i + 2 + input->dim_count]; indices_size *= indices->dim[i]; - if(i < indices->dim_count - 1) { + if (i < indices->dim_count - 1) { output->dim_count++; output->dim[i] = indices->dim[i]; } @@ -59,7 +60,7 @@ int main(int argc, char** argv) indices_outer_size = indices_size / indices->dim[indices->dim_count - 1]; int input_inner_size = 1; - for(int i = axis; i < input->dim_count; i++) { + for (int i = axis; i < input->dim_count; i++) { input_inner_size *= input->dim[i]; 
output->dim[output->dim_count] = input->dim[i]; output->dim_count++; @@ -75,34 +76,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); - float *src_in = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); - float *ref = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); + indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); + float *src_in = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); + float *ref = + (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -110,15 +111,14 @@ int main(int argc, char** argv) output->data = ref; 
get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_gather_nd_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather_nd(input, indices, output, ¶ms); + if (csinn_gather_nd_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather_nd(input, indices, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/gather_u8.c b/tests/validation/gather_u8.c index 62349a3b..7bfcf146 100644 --- a/tests/validation/gather_u8.c +++ b/tests/validation/gather_u8.c @@ -16,21 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_params), NULL); int in_size = 1, indices_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -39,13 +40,13 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 
0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } indices->dim_count = buffer[2 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[3 + input->dim_count + i]; indices_size *= indices->dim[i]; } @@ -73,35 +74,35 @@ int main(int argc, char** argv) indices->dtype = CSINN_DTYPE_INT32; output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; + params->base.api = CSINN_API; + params->axis = axis; - float *src_in = (float *)(buffer + 3 + input->dim_count + indices->dim_count); - indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); - float *ref = (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); + float *src_in = (float *)(buffer + 3 + input->dim_count + indices->dim_count); + indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); + float *ref = + (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,16 +110,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_gather_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather(input, indices, output, ¶ms); + if (csinn_gather_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather(input, indices, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_avgpool_i8.c b/tests/validation/global_avgpool_i8.c index 84d48900..6e7b3ccb 100644 --- a/tests/validation/global_avgpool_i8.c +++ b/tests/validation/global_avgpool_i8.c @@ -16,37 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // out_height - output->dim[2] = buffer[5]; // out_width - output->dim[3] = buffer[3]; // in_channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // out_height + output->dim[2] = buffer[5]; // out_width + output->dim[3] = buffer[3]; // in_channel input->dim_count = 4; output->dim_count = 4; @@ -59,39 +59,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = 
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_global_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_global_avgpool2d(input, output, &params); + if (csinn_global_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_avgpool_nchw_i8.c b/tests/validation/global_avgpool_nchw_i8.c index 574bdc92..da7a516e 100644 --- a/tests/validation/global_avgpool_nchw_i8.c +++ b/tests/validation/global_avgpool_nchw_i8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // in_channel - output->dim[2] =
buffer[4]; // out_height - output->dim[3] = buffer[5]; // out_width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // in_channel + output->dim[2] = buffer[4]; // out_height + output->dim[3] = buffer[5]; // out_width input->dim_count = 4; output->dim_count = 4; @@ -55,44 +55,40 @@ int main(int argc, char** argv) input->is_const = 0; input->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > 
max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -100,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_global_avgpool2d(input, output, &params); + if (csinn_global_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_avgpool_nchw_u8.c b/tests/validation/global_avgpool_nchw_u8.c index 50b106a6..fa2b1d77 100644 --- a/tests/validation/global_avgpool_nchw_u8.c +++ b/tests/validation/global_avgpool_nchw_u8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] =
buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // in_channel - output->dim[2] = buffer[4]; // out_height - output->dim[3] = buffer[5]; // out_width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // in_channel + output->dim[2] = buffer[4]; // out_height + output->dim[3] = buffer[5]; // out_width input->dim_count = 4; output->dim_count = 4; @@ -59,39 +59,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } 
else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_global_avgpool2d(input, output, &params); + if (csinn_global_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_avgpool_u8.c b/tests/validation/global_avgpool_u8.c index 8f7f9662..936ea713 100644 --- a/tests/validation/global_avgpool_u8.c +++ b/tests/validation/global_avgpool_u8.c @@ -16,37 +16,37 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // out_height - output->dim[2] = buffer[5]; // out_width - output->dim[3] = buffer[3]; // in_channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // out_height + output->dim[2] = buffer[5]; // out_width + output->dim[3] = buffer[3]; // in_channel input->dim_count = 4; output->dim_count = 4; @@ -62,36 +62,33 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.layout = 
CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_global_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_global_avgpool2d(input, output, &params); + if (csinn_global_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_maxpool_i8.c b/tests/validation/global_maxpool_i8.c index 9989c9a6..e44ebf41 100644 --- a/tests/validation/global_maxpool_i8.c +++ b/tests/validation/global_maxpool_i8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // out_height - output->dim[2] = buffer[5]; // out_width -
output->dim[3] = buffer[3]; // in_channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // out_height + output->dim[2] = buffer[5]; // out_width + output->dim[3] = buffer[3]; // in_channel input->dim_count = 4; output->dim_count = 4; @@ -62,36 +62,33 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + 
input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_maxpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_global_maxpool2d(input, output, &params); + if (csinn_global_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_maxpool_nchw_i8.c b/tests/validation/global_maxpool_nchw_i8.c index 1abb1d25..163c834f 100644 --- a/tests/validation/global_maxpool_nchw_i8.c +++ b/tests/validation/global_maxpool_nchw_i8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel +
input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // in_channel - output->dim[2] = buffer[4]; // out_height - output->dim[3] = buffer[5]; // out_width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // in_channel + output->dim[2] = buffer[4]; // out_height + output->dim[3] = buffer[5]; // out_width input->dim_count = 4; output->dim_count = 4; @@ -59,39 +59,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 
> 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_maxpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_global_maxpool2d(input, output, &params); + if (csinn_global_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_maxpool_nchw_u8.c b/tests/validation/global_maxpool_nchw_u8.c index e86b1950..5f1fc7e3 100644 --- a/tests/validation/global_maxpool_nchw_u8.c +++ b/tests/validation/global_maxpool_nchw_u8.c @@ -16,37 +16,37 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // in_channel - output->dim[2] = buffer[4]; // out_height - output->dim[3] = buffer[5]; // out_width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // in_channel + output->dim[2] = buffer[4]; // out_height + output->dim[3] = buffer[5]; // out_width input->dim_count = 4; output->dim_count = 4; @@ -59,39 +59,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_global_maxpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_global_maxpool2d(input, output, &params); + if (csinn_global_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_maxpool_u8.c b/tests/validation/global_maxpool_u8.c index 2ae18904..1c3e2285 100644 --- a/tests/validation/global_maxpool_u8.c +++ b/tests/validation/global_maxpool_u8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // out_height - output->dim[2] = buffer[5]; // out_width -
output->dim[3] = buffer[3]; // in_channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // out_height + output->dim[2] = buffer[5]; // out_width + output->dim[3] = buffer[3]; // in_channel input->dim_count = 4; output->dim_count = 4; @@ -62,36 +62,33 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + 
input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_maxpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_global_maxpool2d(input, output, &params); + if (csinn_global_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/greater_equal_f32.c b/tests/validation/greater_equal_f32.c index 0345f5e9..355c18a4 100644 --- a/tests/validation/greater_equal_f32.c +++ b/tests/validation/greater_equal_f32.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height +
input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -52,17 +52,16 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_greater_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_greater(input0, input1, output, &params); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_equal_i8.c b/tests/validation/greater_equal_i8.c index 98233020..31c18251 100644 --- a/tests/validation/greater_equal_i8.c +++ b/tests/validation/greater_equal_i8.c @@ -16,37 +16,37 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater equal i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -72,61 +72,58 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 
4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) 
{ - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,17 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_greater_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_greater(input0, input1, output, &params); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_equal_u8.c b/tests/validation/greater_equal_u8.c index a5970369..fddc587b 100644 --- a/tests/validation/greater_equal_u8.c +++ b/tests/validation/greater_equal_u8.c @@ -16,37 +16,37 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater equal u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -72,61 +72,58 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 
4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 
1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,17 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_greater_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_greater(input0, input1, output, &params); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_f32.c b/tests/validation/greater_f32.c index 0345f5e9..355c18a4 100644 --- a/tests/validation/greater_f32.c +++ b/tests/validation/greater_f32.c @@ -16,29 +16,29 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -52,17 +52,16 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_greater_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_greater(input0, input1, output, &params); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_i8.c b/tests/validation/greater_i8.c index 534d0208..8bc5fc71 100644 --- a/tests/validation/greater_i8.c +++ b/tests/validation/greater_i8.c @@ -16,35 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; float max_error = 0; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; +
input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -65,41 +65,38 @@ int main(int argc, char** argv) input1->is_const = 0; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_greater_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_greater(input0, input1, output, &params); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_u8.c b/tests/validation/greater_u8.c index 520b4cd6..15ed641a 100644 --- a/tests/validation/greater_u8.c +++ b/tests/validation/greater_u8.c @@ -16,35 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; float max_error = 0; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; +
input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -65,41 +65,38 @@ int main(int argc, char** argv) input1->is_const = 0; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_greater_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_greater(input0, input1, output, &params); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/group_convolution_f32.c b/tests/validation/group_convolution_f32.c index 502f13fd..3f3bc21b 100644 --- a/tests/validation/group_convolution_f32.c +++ b/tests/validation/group_convolution_f32.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -40,30 +41,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; -
kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -73,21 +74,20 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 18); - kernel->data = (float *)(buffer + 18 + in_size); - bias->data = (float *)(buffer + 18 + in_size + weight_size); + input->data = (float *)(buffer + 18); + kernel->data = (float *)(buffer + 18 + in_size); + bias->data = (float *)(buffer + 18 + in_size + weight_size); reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/group_convolution_i8.c b/tests/validation/group_convolution_i8.c index fbdc106b..e76ff904 100644 --- a/tests/validation/group_convolution_i8.c +++ b/tests/validation/group_convolution_i8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - 
params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size 
+ weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - 
if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_nchw_f32.c b/tests/validation/group_convolution_nchw_f32.c index ee59af16..36f94401 100644 --- a/tests/validation/group_convolution_nchw_f32.c +++ b/tests/validation/group_convolution_nchw_f32.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; if (argc == 1) { @@ -42,28 +43,27 @@
int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 18); input->data = input_data; - kernel->dim[0] = buffer[12]; // o - kernel->dim[1] = buffer[1] / group; // i - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w + kernel->dim[0] = buffer[12]; // o + kernel->dim[1] = buffer[1] / group; // i + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w kernel->dim_count = 4; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; kernel->name = "kernel"; float *kernel_data = (float *)(buffer + 18 + in_size); kernel->data = kernel_data; - bias->dim[0] = buffer[12]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -71,37 +71,34 @@ int main(int argc, char** argv) float *bias_data = (float *)(buffer + 18 + in_size + weight_size); bias->data = bias_data; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 18 + in_size + weight_size + 
output->dim[1]); output->data = reference->data; output->name = "output"; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.group = group; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.base.name = "params"; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->group = group; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.name = "params"; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, &params) != CSINN_TRUE) { + if (csinn_conv2d_init(input, output, kernel, bias, params) != CSINN_TRUE) { printf("group conv2d init fail.\n\t"); return -1; } @@ -110,5 +107,4 @@ int main(int argc, char** argv) free(buffer); return done_testing(); - } diff --git a/tests/validation/group_convolution_nchw_i8.c b/tests/validation/group_convolution_nchw_i8.c index 5f9e136b..aae9877b 100644 --- a/tests/validation/group_convolution_nchw_i8.c +++ b/tests/validation/group_convolution_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[12]; // o - kernel->dim[1] = buffer[1] / group; // i - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[12]; // o + kernel->dim[1] = buffer[1] / group; // i + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w kernel->dim_count = 4; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = 
buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -93,42 +94,41 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float 
*kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,53 +137,49 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { 
float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_nchw_u8.c b/tests/validation/group_convolution_nchw_u8.c index 3da0b7c5..9b9a9e86 100644 --- a/tests/validation/group_convolution_nchw_u8.c +++ b/tests/validation/group_convolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[12]; // o - kernel->dim[1] = buffer[1] / group; // i - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[12]; // o + kernel->dim[1] = buffer[1] / group; // i + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w kernel->dim_count = 4; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = 
buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -93,42 +94,41 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float 
*kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,53 +137,49 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu6_i8.c b/tests/validation/group_convolution_relu6_i8.c index 1654d0db..acc809d6 100644 --- a/tests/validation/group_convolution_relu6_i8.c +++ b/tests/validation/group_convolution_relu6_i8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -92,41 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + 
float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -135,57 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - 
if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, &params); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu6_nchw_i8.c b/tests/validation/group_convolution_relu6_nchw_i8.c index 12c61035..06842d6d 100644 --- a/tests/validation/group_convolution_relu6_nchw_i8.c +++ b/tests/validation/group_convolution_relu6_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6 nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp,
quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, 
char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], 
input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = 
output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu6_nchw_u8.c b/tests/validation/group_convolution_relu6_nchw_u8.c index 774b5657..96077837 100644 --- a/tests/validation/group_convolution_relu6_nchw_u8.c +++ b/tests/validation/group_convolution_relu6_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6 nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = 
buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 
18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], 
kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu6_u8.c b/tests/validation/group_convolution_relu6_u8.c index 0fbcfefa..f482d704 100644 --- a/tests/validation/group_convolution_relu6_u8.c +++ b/tests/validation/group_convolution_relu6_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; 
float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -92,103 +93,98 @@ int main(int argc, char** argv) output->is_const = 0; 
output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { 
continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, 
&quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu_i8.c b/tests/validation/group_convolution_relu_i8.c index 52e626a0..dd019e7f 100644 --- a/tests/validation/group_convolution_relu_i8.c +++ b/tests/validation/group_convolution_relu_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -92,41 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + 
float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -135,57 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - 
if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu_nchw_i8.c b/tests/validation/group_convolution_relu_nchw_i8.c index bbc75957..7027a4c5 100644 --- a/tests/validation/group_convolution_relu_nchw_i8.c +++ b/tests/validation/group_convolution_relu_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, 
quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, 
char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], 
input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = 
output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu_nchw_u8.c b/tests/validation/group_convolution_relu_nchw_u8.c index 688285d5..d1add96e 100644 --- a/tests/validation/group_convolution_relu_nchw_u8.c +++ b/tests/validation/group_convolution_relu_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = 
buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 
18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], 
kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu_u8.c b/tests/validation/group_convolution_relu_u8.c index cde197e7..a0eed6d3 100644 --- a/tests/validation/group_convolution_relu_u8.c +++ b/tests/validation/group_convolution_relu_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float
max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -92,41 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 
1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { 
continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -135,60 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + 
shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, &params); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_u8.c b/tests/validation/group_convolution_u8.c index 209f7b35..0c2a5ad3 100644 --- a/tests/validation/group_convolution_u8.c +++ b/tests/validation/group_convolution_u8.c @@ -16,22 +16,23 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - 
params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -93,42 +94,41 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float 
*bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,50 +137,47 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = 
csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, &params) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, &params); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/hard_sigmoid_f32.c b/tests/validation/hard_sigmoid_f32.c index 2d3c685a..49139b05 100644 --- a/tests/validation/hard_sigmoid_f32.c +++ b/tests/validation/hard_sigmoid_f32.c @@ -16,26 +16,27 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of hard_sigmoid f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +45,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_hard_sigmoid_init(input, output, &params) == CSINN_TRUE) { - csi_hard_sigmoid(input, output, &params); + if (csinn_hard_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_hard_sigmoid(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/hard_sigmoid_i8.c b/tests/validation/hard_sigmoid_i8.c index 2e8ec6be..4de96d7d 100644 --- a/tests/validation/hard_sigmoid_i8.c +++ b/tests/validation/hard_sigmoid_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of hard_sigmoid i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +39,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,34 +55,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode =
CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -89,15 +89,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_hard_sigmoid_init(input, output, &params) == CSINN_TRUE) { - csi_hard_sigmoid(input, output, &params); + if (csinn_hard_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_hard_sigmoid(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/hard_sigmoid_u8.c b/tests/validation/hard_sigmoid_u8.c index 38035f4c..18d241e7 100644 --- a/tests/validation/hard_sigmoid_u8.c +++ b/tests/validation/hard_sigmoid_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of hard_sigmoid u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +39,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -55,33 +56,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float
*src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -89,15 +89,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_hard_sigmoid_init(input, output, &params) == CSINN_TRUE) { - csi_hard_sigmoid(input, output, &params); + if (csinn_hard_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_hard_sigmoid(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/im2col_f32.c b/tests/validation/im2col_f32.c index 28116902..352be4c5 100644 --- a/tests/validation/im2col_f32.c +++ b/tests/validation/im2col_f32.c @@ -16,64 +16,68 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of im2col f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct im2col_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_im2col_params *params = + csinn_alloc_params(sizeof(struct csinn_im2col_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width input->dim_count = 4; - params.kernel_h = buffer[4]; - params.kernel_w = buffer[5]; - params.stride_h = buffer[6]; - params.stride_w = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; + params->kernel_h
= buffer[4]; + params->kernel_w = buffer[5]; + params->stride_h = buffer[6]; + params->stride_w = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { in_size *= input->dim[i]; } - int out_h = (input->dim[2] + params.pad_top + params.pad_down - params.kernel_h) / params.stride_h + 1; - int out_w = (input->dim[3] + params.pad_left + params.pad_right - params.kernel_w) / params.stride_w + 1; + int out_h = + (input->dim[2] + params->pad_top + params->pad_down - params->kernel_h) / params->stride_h + + 1; + int out_w = (input->dim[3] + params->pad_left + params->pad_right - params->kernel_w) / + params->stride_w + + 1; - output->dim[0] = input->dim[1] * params.kernel_h * params.kernel_w; + output->dim[0] = input->dim[1] * params->kernel_h * params->kernel_w; output->dim[1] = input->dim[0] * out_h * out_w; output->dim_count = 2; - out_size = input->dim[0] * input->dim[1] * params.kernel_h * params.kernel_w * out_h * out_w; + out_size = input->dim[0] * input->dim[1] * params->kernel_h * params->kernel_w * out_h * out_w; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 12); + input->data = (float *)(buffer + 12); reference->data = (float *)(buffer + 12 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_im2col_init(input, output, &params) == CSINN_TRUE) { - csi_im2col(input, output, &params); + if (csinn_im2col_init(input, output, params) == CSINN_TRUE) { + csinn_im2col(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); @@ -82,4 +86,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/im2col_i8.c b/tests/validation/im2col_i8.c index 32f4078d..9f35a898 100644 --- a/tests/validation/im2col_i8.c +++ b/tests/validation/im2col_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of im2col nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct im2col_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_im2col_params *params = + csinn_alloc_params(sizeof(struct csinn_im2col_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,33 +39,37 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width input->dim_count = 4; - params.kernel_h = buffer[4]; - params.kernel_w = buffer[5]; -
params.stride_h = buffer[6]; - params.stride_w = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; + params->kernel_h = buffer[4]; + params->kernel_w = buffer[5]; + params->stride_h = buffer[6]; + params->stride_w = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { in_size *= input->dim[i]; } - int out_h = (input->dim[2] + params.pad_top + params.pad_down - params.kernel_h) / params.stride_h + 1; - int out_w = (input->dim[3] + params.pad_left + params.pad_right - params.kernel_w) / params.stride_w + 1; + int out_h = + (input->dim[2] + params->pad_top + params->pad_down - params->kernel_h) / params->stride_h + + 1; + int out_w = (input->dim[3] + params->pad_left + params->pad_right - params->kernel_w) / + params->stride_w + + 1; - output->dim[0] = input->dim[1] * params.kernel_h * params.kernel_w; + output->dim[0] = input->dim[1] * params->kernel_h * params->kernel_w; output->dim[1] = input->dim[0] * out_h * out_w; output->dim_count = 2; - out_size = input->dim[0] * input->dim[1] * params.kernel_h * params.kernel_w * out_h * out_w; + out_size = input->dim[0] * input->dim[1] * params->kernel_h * params->kernel_w * out_h * out_w; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -74,75 +79,72 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 12); - float *ref = (float *)(buffer + 12 + in_size); + float *src_in = (float *)(buffer + 12); + float *ref = (float *)(buffer + 
12 + in_size); int8_t *src_tmp = (int8_t *)malloc(in_size * sizeof(int8_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the input's max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - output->data = ref; get_quant_info(output); int8_t *dst_tmp = (int8_t *)malloc(out_size * sizeof(int8_t)); - for(int i = 0; i < out_size; i++) { - dst_tmp[i] = csi_ref_quantize_f32_to_i8(ref[i], output->qinfo); + for (int i = 0; i < out_size; i++) { + dst_tmp[i] = shl_ref_quantize_f32_to_i8(ref[i], output->qinfo); } /* compute the output's max quantize error */ - for(int i = 0; i < out_size; i++) { + for (int i = 0; i < out_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(dst_tmp[i], output->qinfo); - if(isinf(ref[i]) || isnan(ref[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(dst_tmp[i], output->qinfo); + if (isinf(ref[i]) || isnan(ref[i])) { continue; } else { - error1 = fabs(ref[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(ref[i] - output_tmp)/fabs(ref[i] + 1e-9); + error1 = fabs(ref[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(ref[i] - output_tmp) / 
fabs(ref[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } - - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); + output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); max_error = (error[0] + error[1]); - float difference = argc > 2 ? atof(argv[2]) : max_error; + float difference = argc > 2 ? atof(argv[2]) : max_error; - if (csi_im2col_init(input, output, &params) == CSINN_TRUE) { - csi_im2col(input, output, &params); + if (csinn_im2col_init(input, output, params) == CSINN_TRUE) { + csinn_im2col(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/im2col_u8.c b/tests/validation/im2col_u8.c index c85de842..6d5af58a 100644 --- a/tests/validation/im2col_u8.c +++ b/tests/validation/im2col_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of im2col nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct im2col_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_im2col_params *params = + csinn_alloc_params(sizeof(struct csinn_im2col_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,33 +39,37 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel -
input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width input->dim_count = 4; - params.kernel_h = buffer[4]; - params.kernel_w = buffer[5]; - params.stride_h = buffer[6]; - params.stride_w = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; + params->kernel_h = buffer[4]; + params->kernel_w = buffer[5]; + params->stride_h = buffer[6]; + params->stride_w = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { in_size *= input->dim[i]; } - int out_h = (input->dim[2] + params.pad_top + params.pad_down - params.kernel_h) / params.stride_h + 1; - int out_w = (input->dim[3] + params.pad_left + params.pad_right - params.kernel_w) / params.stride_w + 1; + int out_h = + (input->dim[2] + params->pad_top + params->pad_down - params->kernel_h) / params->stride_h + + 1; + int out_w = (input->dim[3] + params->pad_left + params->pad_right - params->kernel_w) / + params->stride_w + + 1; - output->dim[0] = input->dim[1] * params.kernel_h * params.kernel_w; + output->dim[0] = input->dim[1] * params->kernel_h * params->kernel_w; output->dim[1] = input->dim[0] * out_h * out_w; output->dim_count = 2; - out_size = input->dim[0] * input->dim[1] * params.kernel_h * params.kernel_w * out_h * out_w; + out_size = input->dim[0] * input->dim[1] * params->kernel_h * params->kernel_w * out_h * out_w; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -74,76 +79,73 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 12); - float *ref = (float *)(buffer + 12 + in_size); + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 12); + float *ref = (float *)(buffer + 12 + in_size); uint8_t *src_tmp = (uint8_t *)malloc(in_size * sizeof(uint8_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the input's max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - output->data = ref; get_quant_info(output); uint8_t *dst_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); - for(int i = 0; i < out_size; i++) { - dst_tmp[i] = csi_ref_quantize_f32_to_u8(ref[i], output->qinfo); + for (int i = 0; i < out_size; i++) { + dst_tmp[i] = shl_ref_quantize_f32_to_u8(ref[i], output->qinfo); } /* compute the output's max quantize error */ - for(int i = 0; i < out_size; i++) { + for (int i = 0; i < out_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(dst_tmp[i], output->qinfo); - if(isinf(ref[i]) || isnan(ref[i])){ + float 
output_tmp = shl_ref_dequantize_u8_to_f32(dst_tmp[i], output->qinfo); + if (isinf(ref[i]) || isnan(ref[i])) { continue; } else { - error1 = fabs(ref[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(ref[i] - output_tmp)/fabs(ref[i] + 1e-9); + error1 = fabs(ref[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(ref[i] - output_tmp) / fabs(ref[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } - - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); + output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); max_error = (error[0] + error[1]); - float difference = argc > 2 ? atof(argv[2]) : max_error; + float difference = argc > 2 ? atof(argv[2]) : max_error; - if (csi_im2col_init(input, output, &params) == CSINN_TRUE) { - csi_im2col(input, output, &params); + if (csinn_im2col_init(input, output, params) == CSINN_TRUE) { + csinn_im2col(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/is_nan_f32.c b/tests/validation/is_nan_f32.c index a73591a0..df172495 100644 --- a/tests/validation/is_nan_f32.c +++ b/tests/validation/is_nan_f32.c @@ -16,26 +16,26 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of isnan f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -43,16 +43,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (bool *)(buffer + 1 + input->dim_count + in_size); - output->data = (bool *)malloc(out_size * sizeof(bool)); + output->data = (bool *)malloc(out_size * sizeof(bool)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_isnan_bool_init(input, output, &params) == CSINN_TRUE) { - csi_isnan_bool(input, output, &params); + if (csinn_isnan_bool_init(input, output, params) == CSINN_TRUE) { + csinn_isnan_bool(input, output, params); } result_verify_bool(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/l2_norm_f32.c b/tests/validation/l2_norm_f32.c index cb4bcdde..b0228207 100644 --- a/tests/validation/l2_norm_f32.c +++ b/tests/validation/l2_norm_f32.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct l2n_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), NULL); int size = 1; int *buffer = read_input_data_f32(argv[1]); /* get the dim para */ output->dim_count = input->dim_count = buffer[0]; - params.epsilon = *(float *)&buffer[1]; + params->epsilon = *(float *)&buffer[1]; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; - + params->axis = axis; + params->n = 1; + for (int i = 0; i < input->dim_count; ++i) { output->dim[i] = input->dim[i] = buffer[2 + i]; } @@ -50,17 +50,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - //params.epsilon = *(float *)&buffer[1 + input->dim_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + //
params->epsilon = *(float *)&buffer[1 + input->dim_count]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count); reference->data = (float *)(buffer + 2 + input->dim_count + size); - output->data = malloc(size * sizeof(float)); + output->data = malloc(size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_l2_normalization_init(input, output, &params) == CSINN_TRUE) { - csi_l2_normalization(input, output, &params); + if (csinn_l2_normalization_init(input, output, params) == CSINN_TRUE) { + csinn_l2_normalization(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, size, false); diff --git a/tests/validation/l2_norm_i8.c b/tests/validation/l2_norm_i8.c index 2d20bbb3..63698e84 100644 --- a/tests/validation/l2_norm_i8.c +++ b/tests/validation/l2_norm_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct l2n_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), NULL); int size = 1; int zp, quantized_multiplier, shift; float max_value, min_value, scale; @@ -38,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); /* get the dim para */ output->dim_count = input->dim_count = buffer[0]; - params.epsilon =
*(float *)&buffer[1]; + params->epsilon = *(float *)&buffer[1]; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; + params->axis = axis; + params->n = 1; for (int i = 0; i < input->dim_count; ++i) { output->dim[i] = input->dim[i] = buffer[2 + i]; @@ -60,33 +60,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + size); int8_t *input_tmp = malloc(size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,14 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; + input->data = input_tmp; reference->data = ref; - output->data = malloc(size * sizeof(char)); + output->data = 
malloc(size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_l2_normalization_init(input, output, &params) == CSINN_TRUE) { - csi_l2_normalization(input, output, &params); + if (csinn_l2_normalization_init(input, output, params) == CSINN_TRUE) { + csinn_l2_normalization(input, output, params); } result_verify_8(reference->data, output, input->data, difference, size, false); diff --git a/tests/validation/l2_norm_u8.c b/tests/validation/l2_norm_u8.c index 9e4aed4c..eb59c0f8 100644 --- a/tests/validation/l2_norm_u8.c +++ b/tests/validation/l2_norm_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct l2n_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), NULL); int size = 1; int zp, quantized_multiplier, shift; float max_value, min_value, scale; @@ -38,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); /* get the dim para */ output->dim_count = input->dim_count = buffer[0]; - params.epsilon = *(float *)&buffer[1]; + params->epsilon = *(float *)&buffer[1]; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; + params->axis = axis; + params->n = 1; for (int i = 0; i < input->dim_count; ++i) { output->dim[i] = input->dim[i] = buffer[2 + i]; @@ -60,34 +60,33 @@ int main(int argc, char**
argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + size); uint8_t *input_tmp = malloc(size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,14 +95,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; + input->data = input_tmp; reference->data = ref; - output->data = malloc(size * sizeof(char)); + output->data = malloc(size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_l2_normalization_init(input, output, &params) == CSINN_TRUE) { - csi_l2_normalization(input, output, &params); + if (csinn_l2_normalization_init(input, output, params) == CSINN_TRUE) { + csinn_l2_normalization(input, output, params); } result_verify_8(reference->data, output, input->data, difference, size, false); diff --git a/tests/validation/leaky_relu_f32.c b/tests/validation/leaky_relu_f32.c index f85eeff7..2dd411a4 100644 --- a/tests/validation/leaky_relu_f32.c +++ b/tests/validation/leaky_relu_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,19 +47,18 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.n = *((float *)buffer + 4); + params->n = *((float *)buffer + 4); in_size = input->dim[0] * input->dim[1] *
input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_leaky_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_leaky_relu(input, output, ¶ms); + if (csinn_leaky_relu_init(input, output, params) == CSINN_TRUE) { + csinn_leaky_relu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/leaky_relu_i8.c b/tests/validation/leaky_relu_i8.c index 2722e59a..3f51bf25 100644 --- a/tests/validation/leaky_relu_i8.c +++ b/tests/validation/leaky_relu_i8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,38 +58,35 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.n = *((float *)buffer + 4); + params->n = *((float *)buffer + 4); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = 
csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -97,15 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_leaky_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_leaky_relu(input, output, ¶ms); + if (csinn_leaky_relu_init(input, output, params) == CSINN_TRUE) { + csinn_leaky_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/leaky_relu_u8.c b/tests/validation/leaky_relu_u8.c index 3140eba8..8127b64f 100644 --- a/tests/validation/leaky_relu_u8.c +++ b/tests/validation/leaky_relu_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,39 +57,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.n = *((float *)buffer + 4); - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->n = *((float *)buffer + 4); + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + 
in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -97,15 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_leaky_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_leaky_relu(input, output, ¶ms); + if (csinn_leaky_relu_init(input, output, params) == CSINN_TRUE) { + csinn_leaky_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/less_equal_f32.c b/tests/validation/less_equal_f32.c index 499ee073..a1af0d45 100644 --- a/tests/validation/less_equal_f32.c +++ b/tests/validation/less_equal_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less equal f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -51,17 +51,16 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_less_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_less_equal(input0, input1, output, ¶ms); + if (csinn_less_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less_equal(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/less_equal_i8.c b/tests/validation/less_equal_i8.c index 8e589762..2e6bf5f0 100644 --- a/tests/validation/less_equal_i8.c +++ b/tests/validation/less_equal_i8.c @@ -16,36 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less equal i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; 
+ input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -70,62 +70,58 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], 
input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -135,17 +131,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - - - float difference = argc > 2 ? atof(argv[2]) : 0.9; + output->data = malloc(in_size * sizeof(char)); + float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_less_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_less_equal(input0, input1, output, ¶ms); + if (csinn_less_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/less_equal_u8.c b/tests/validation/less_equal_u8.c index 4281eaf5..feb5aedc 100644 --- a/tests/validation/less_equal_u8.c +++ b/tests/validation/less_equal_u8.c @@ -16,36 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less equal u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,62 +71,58 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + 
float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - 
output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - - - float difference = argc > 2 ? atof(argv[2]) : 0.9; + output->data = malloc(in_size * sizeof(char)); + float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_less_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_less_equal(input0, input1, output, ¶ms); + if (csinn_less_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/less_f32.c b/tests/validation/less_f32.c index f655e34b..88c6cb94 100644 --- a/tests/validation/less_f32.c +++ b/tests/validation/less_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -52,18 +52,17 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_less_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_less(input0, input1, output, ¶ms); + if (csinn_less_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/less_i8.c b/tests/validation/less_i8.c index 326bf87b..465a7d36 100644 --- a/tests/validation/less_i8.c +++ b/tests/validation/less_i8.c @@ -16,35 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = 
buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,60 +71,58 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = 
shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_less_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_less(input0, input1, output, ¶ms); + if (csinn_less_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/less_u8.c b/tests/validation/less_u8.c index 3f97a658..e55cf2e8 100644 --- a/tests/validation/less_u8.c +++ b/tests/validation/less_u8.c @@ -16,35 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,60 +71,58 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref 
= (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); 
+ error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_less_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_less(input0, input1, output, &params); + if (csinn_less_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/log1p_f32.c b/tests/validation/log1p_f32.c index 2b39ff25..daeb8d85 100644 --- a/tests/validation/log1p_f32.c +++ b/tests/validation/log1p_f32.c @@ -16,28 +16,28 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log1p f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -49,16 +49,15 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size0 ); + input0->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size0); output->data = (float *)malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log1p_init(input0, output, &params) == CSINN_TRUE) { - csi_log1p(input0, output, &params); + if (csinn_log1p_init(input0, output, params) == CSINN_TRUE) { + csinn_log1p(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/log1p_i8.c b/tests/validation/log1p_i8.c index 300c54a5..3456e0a0 100644 --- a/tests/validation/log1p_i8.c +++ b/tests/validation/log1p_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log1p i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -59,33 +59,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api =
CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0 ); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,14 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log1p_init(input0, output, &params) == CSINN_TRUE) { - csi_log1p(input0, output, &params); + if (csinn_log1p_init(input0, output, params) == CSINN_TRUE) { + csinn_log1p(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/log1p_u8.c b/tests/validation/log1p_u8.c index 98b0a730..e5d6d64c 100644 --- a/tests/validation/log1p_u8.c +++ b/tests/validation/log1p_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log1p u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -59,34 +59,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api =
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0 ); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,14 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log1p_init(input0, output, &params) == CSINN_TRUE) { - csi_log1p(input0, output, &params); + if (csinn_log1p_init(input0, output, params) == CSINN_TRUE) { + csinn_log1p(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/log_f32.c b/tests/validation/log_f32.c index 68b8e5ed..7920b178 100644 --- a/tests/validation/log_f32.c +++ b/tests/validation/log_f32.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -49,16 +49,15 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); + input0->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4
+ in_size0); output->data = (float *)malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_log_init(input0, output, &params) == CSINN_TRUE) { - csi_log(input0, output, &params); + if (csinn_log_init(input0, output, params) == CSINN_TRUE) { + csinn_log(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/log_i8.c b/tests/validation/log_i8.c index 111503df..b0b51c5e 100644 --- a/tests/validation/log_i8.c +++ b/tests/validation/log_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -59,34 +59,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - -
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0 ); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,14 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log_init(input0, output, &params) == CSINN_TRUE) { - csi_log(input0, output, &params); + if (csinn_log_init(input0, output, params) == CSINN_TRUE) { + csinn_log(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/log_softmax_f32.c b/tests/validation/log_softmax_f32.c index 92ff200c..823e675d 100644 --- a/tests/validation/log_softmax_f32.c +++ b/tests/validation/log_softmax_f32.c @@ -16,28 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log_softmax f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - params.axis = buffer[0]; + params->axis = buffer[0]; input->dim_count = buffer[1]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -46,16 +47,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); -
reference->data = (float *)(buffer + 2 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 2 + input->dim_count); + reference->data = (float *)(buffer + 2 + input->dim_count + in_size); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_log_softmax_init(input, output, &params) == CSINN_TRUE) { - csi_log_softmax(input, output, &params); + if (csinn_log_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_log_softmax(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/log_softmax_i8.c b/tests/validation/log_softmax_i8.c index 0e41c331..c33c05fa 100644 --- a/tests/validation/log_softmax_i8.c +++ b/tests/validation/log_softmax_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log_softmax i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - params.axis = buffer[0]; + params->axis = buffer[0]; input->dim_count = buffer[1]; output->dim_count =
input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -56,34 +57,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -91,15 +90,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * 
sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_log_softmax_init(input, output, &params) == CSINN_TRUE) { - csi_log_softmax(input, output, &params); + if (csinn_log_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_log_softmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/log_softmax_u8.c b/tests/validation/log_softmax_u8.c index 184c8cee..15ab67d4 100644 --- a/tests/validation/log_softmax_u8.c +++ b/tests/validation/log_softmax_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log_softmax u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - params.axis = buffer[0]; + params->axis = buffer[0]; input->dim_count = buffer[1]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -56,34 +57,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0;
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -91,15 +90,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_log_softmax_init(input, output, &params) == CSINN_TRUE) { - csi_log_softmax(input, output, &params); + if (csinn_log_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_log_softmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/log_u8.c b/tests/validation/log_u8.c index c382436b..e4757c3e 100644 --- a/tests/validation/log_u8.c +++ b/tests/validation/log_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -59,33 +59,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api =
CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0 ); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,14 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log_init(input0, output, &params) == CSINN_TRUE) { - csi_log(input0, output, &params); + if (csinn_log_init(input0, output, params) == CSINN_TRUE) { + csinn_log(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/logical_and_f32.c b/tests/validation/logical_and_f32.c index a0147d70..10032245 100644 --- a/tests/validation/logical_and_f32.c +++ b/tests/validation/logical_and_f32.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical and f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,17 +50,16 @@ int main(int argc, char** argv) input1->dim_count = 4; output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; -
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_and_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_logical_and(input0, input1, output, &params); + if (csinn_logical_and_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_and(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_and_i8.c b/tests/validation/logical_and_i8.c index 1653a366..73b13ea9 100644 --- a/tests/validation/logical_and_i8.c +++ b/tests/validation/logical_and_i8.c @@ -16,21 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical and i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,16 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -72,36 +72,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * 
in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -109,23 +108,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo ); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo ); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], 
input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -135,15 +134,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_and_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_and(input0, input1, output, ¶ms); + if (csinn_logical_and_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_and(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_and_u8.c b/tests/validation/logical_and_u8.c index 84c44d1c..ac7825a8 100644 --- a/tests/validation/logical_and_u8.c +++ b/tests/validation/logical_and_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical and u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,16 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -73,35 +73,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * 
in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -109,23 +108,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo ); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo ); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) 
|| isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -135,15 +134,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_and_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_and(input0, input1, output, ¶ms); + if (csinn_logical_and_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_and(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_not_f32.c b/tests/validation/logical_not_f32.c index e67d9c4b..68451503 100644 --- a/tests/validation/logical_not_f32.c +++ b/tests/validation/logical_not_f32.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical not f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input0->dim_count = 4; output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); + input0->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4 + in_size0); output->data = (float *)malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_logical_not_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_logical_not(input0, output, ¶ms); + if (csinn_logical_not_init(input0, output, params) == CSINN_TRUE) { + csinn_logical_not(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/logical_not_i8.c b/tests/validation/logical_not_i8.c index adb0975f..c914ff7e 100644 --- a/tests/validation/logical_not_i8.c +++ b/tests/validation/logical_not_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical not i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -60,34 +60,33 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - 
params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,14 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_logical_not_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_logical_not(input0, output, ¶ms); + if (csinn_logical_not_init(input0, output, params) == CSINN_TRUE) { + csinn_logical_not(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/logical_not_u8.c b/tests/validation/logical_not_u8.c index 0a03332a..a6436e60 100644 --- a/tests/validation/logical_not_u8.c +++ b/tests/validation/logical_not_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical not u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -60,34 +60,33 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - 
params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,14 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_logical_not_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_logical_not(input0, output, ¶ms); + if (csinn_logical_not_init(input0, output, params) == CSINN_TRUE) { + csinn_logical_not(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/logical_or_f32.c b/tests/validation/logical_or_f32.c index cf4c3c27..459735cb 100644 --- a/tests/validation/logical_or_f32.c +++ b/tests/validation/logical_or_f32.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical or f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,17 +50,16 @@ int main(int argc, char** argv) input1->dim_count = 4; output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; - params.base.api 
= CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_or_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_or(input0, input1, output, ¶ms); + if (csinn_logical_or_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_or(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_or_i8.c b/tests/validation/logical_or_i8.c index c7ef28af..8f9aca70 100644 --- a/tests/validation/logical_or_i8.c +++ b/tests/validation/logical_or_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical or i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,16 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -74,36 +74,35 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 4); - float *src1_in 
= (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -111,23 +110,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo ); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo ); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,15 +136,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_or_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_or(input0, input1, output, ¶ms); + if (csinn_logical_or_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_or(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_or_u8.c b/tests/validation/logical_or_u8.c index b6754616..d55c942f 100644 --- a/tests/validation/logical_or_u8.c +++ b/tests/validation/logical_or_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical or u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,16 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -74,36 +74,35 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 4); - float *src1_in 
= (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -111,23 +110,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo ); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo ); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,15 +136,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_or_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_or(input0, input1, output, ¶ms); + if (csinn_logical_or_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_or(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_xor_f32.c b/tests/validation/logical_xor_f32.c index 92301851..21ffb365 100644 --- a/tests/validation/logical_xor_f32.c +++ b/tests/validation/logical_xor_f32.c @@ -16,35 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical xor f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; - + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -58,17 +57,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 
+ 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_xor_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_logical_xor(input0, input1, output, &params); + if (csinn_logical_xor_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_xor(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_xor_i8.c b/tests/validation/logical_xor_i8.c index 04603aba..74de1f0f 100644 --- a/tests/validation/logical_xor_i8.c +++ b/tests/validation/logical_xor_i8.c @@ -16,36 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical xor i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] =
buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,62 +71,59 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 
> error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,15 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_logical_xor_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_logical_xor(input0, input1, output, &params); + if (csinn_logical_xor_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_xor(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_xor_u8.c b/tests/validation/logical_xor_u8.c index 1acb1663..0817c1b0 100644 --- a/tests/validation/logical_xor_u8.c +++ b/tests/validation/logical_xor_u8.c @@ -16,36 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical xor u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0];
+ input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,62 +71,59 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - 
src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,15 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_xor_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_logical_xor(input0, input1, output, &params); + if (csinn_logical_xor_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_xor(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/lrn_f32.c b/tests/validation/lrn_f32.c index ff393658..ea6feb24 100644 --- a/tests/validation/lrn_f32.c +++ b/tests/validation/lrn_f32.c @@ -16,40 +16,40 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct lrn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.range = buffer[4] * 2 + 1; - params.bias = *(float *)(buffer + 5); - params.alpha = *(float *)(buffer + 6); - params.beta = *(float *)(buffer + 7); + params->range = buffer[4] * 2 + 1; + params->bias = *(float *)(buffer + 5); + params->alpha = *(float *)(buffer + 6); + params->beta = *(float *)(buffer + 7); - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -58,16 +58,15 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); - 
output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_lrn_init(input, output, &params) == CSINN_TRUE) { - csi_lrn(input, output, &params); + if (csinn_lrn_init(input, output, params) == CSINN_TRUE) { + csinn_lrn(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/lrn_i8.c b/tests/validation/lrn_i8.c index 460296e9..3da77565 100644 --- a/tests/validation/lrn_i8.c +++ b/tests/validation/lrn_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct lrn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -47,8 +47,8 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.range = buffer[4]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->range = buffer[4]; + params->base.layout = CSINN_LAYOUT_NHWC; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; @@ -65,46 +65,42 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode =
CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 8); - float *ref = (float *)(buffer + 8 + in_size); + float *src_in = (float *)(buffer + 8); + float *ref = (float *)(buffer + 8 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } + shl_quantize_multiplier(*(float *)(buffer + 5), &quantized_multiplier, &shift); + params->bias_multiplier = quantized_multiplier; + params->bias_shift = shift; - csi_quantize_multiplier(*(float *)(buffer + 5), &quantized_multiplier, &shift); - params.bias_multiplier = quantized_multiplier; - params.bias_shift = shift; - - csi_quantize_multiplier(*(float *)(buffer + 6), &quantized_multiplier, &shift); - params.alpha_multiplier = quantized_multiplier; - params.alpha_shift = shift; - + shl_quantize_multiplier(*(float *)(buffer + 6), &quantized_multiplier, &shift); + params->alpha_multiplier = quantized_multiplier; + params->alpha_shift = shift; - csi_quantize_multiplier(*(float *)(buffer + 7), &quantized_multiplier, &shift); - params.beta_multiplier = quantized_multiplier; - params.beta_shift = shift; + shl_quantize_multiplier(*(float *)(buffer + 7), &quantized_multiplier, &shift); + params->beta_multiplier = quantized_multiplier; + params->beta_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-2; - if (csi_lrn_init(input, output, &params) == CSINN_TRUE) { - csi_lrn(input, output, &params); + if (csinn_lrn_init(input, output, params) == CSINN_TRUE) { + csinn_lrn(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/lrn_u8.c b/tests/validation/lrn_u8.c index a1f8820d..2079c688 100644 --- a/tests/validation/lrn_u8.c +++ b/tests/validation/lrn_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct lrn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -47,8 +47,8 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.range = buffer[4]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->range = buffer[4]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; @@ -65,46 +65,42 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 8); - float *ref = (float *)(buffer + 8 + in_size); + float *src_in =
(float *)(buffer + 8); + float *ref = (float *)(buffer + 8 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } + shl_quantize_multiplier(*(float *)(buffer + 5), &quantized_multiplier, &shift); + params->bias_multiplier = quantized_multiplier; + params->bias_shift = shift; - csi_quantize_multiplier(*(float *)(buffer + 5), &quantized_multiplier, &shift); - params.bias_multiplier = quantized_multiplier; - params.bias_shift = shift; - - csi_quantize_multiplier(*(float *)(buffer + 6), &quantized_multiplier, &shift); - params.alpha_multiplier = quantized_multiplier; - params.alpha_shift = shift; - + shl_quantize_multiplier(*(float *)(buffer + 6), &quantized_multiplier, &shift); + params->alpha_multiplier = quantized_multiplier; + params->alpha_shift = shift; - csi_quantize_multiplier(*(float *)(buffer + 7), &quantized_multiplier, &shift); - params.beta_multiplier = quantized_multiplier; - params.beta_shift = shift; + shl_quantize_multiplier(*(float *)(buffer + 7), &quantized_multiplier, &shift); + params->beta_multiplier = quantized_multiplier; + params->beta_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-2; - if (csi_lrn_init(input, output, &params) == CSINN_TRUE) { - csi_lrn(input, output, &params); + if (csinn_lrn_init(input, output, params) == CSINN_TRUE) { + csinn_lrn(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/matmul_f32.c b/tests/validation/matmul_f32.c index 74681de8..7681f38d 100644 --- a/tests/validation/matmul_f32.c +++ b/tests/validation/matmul_f32.c @@ -16,28 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of matmul f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct matmul_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_matmul_params *params = + csinn_alloc_params(sizeof(struct csinn_matmul_params), NULL); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[2]; output->dim_count = input0->dim_count; - params.trans_a = buffer[0]; - params.trans_b = buffer[1]; + params->trans_a = buffer[0]; + params->trans_b = buffer[1]; for (int i = 0; i < input0->dim_count; ++i) { input0->dim[i] = buffer[3 + i]; input1->dim[i] = buffer[3 + input0->dim_count + i]; @@ -62,17 +63,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api =
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 3 + 3 * input0->dim_count); - input1->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); + input0->data = (float *)(buffer + 3 + 3 * input0->dim_count); + input1->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); reference->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_matmul_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_matmul(input0, input1, output, &params); + if (csinn_matmul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_matmul(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/matmul_i8.c b/tests/validation/matmul_i8.c index 69ffd9b2..82884298 100644 --- a/tests/validation/matmul_i8.c +++ b/tests/validation/matmul_i8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of matmul i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct matmul_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_matmul_params *params = + csinn_alloc_params(sizeof(struct csinn_matmul_params), NULL); int in_size0, in_size1, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; @@ -37,8 +38,8 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[2]; output->dim_count = input0->dim_count; - params.trans_a = buffer[0]; - params.trans_b = buffer[1]; + params->trans_a = buffer[0]; + params->trans_b = buffer[1]; for (int i = 0; i < input0->dim_count; ++i) { input0->dim[i] = buffer[3 + i]; input1->dim[i] = buffer[3 + input0->dim_count + i]; @@ -74,35 +75,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; int8_t *input_tmp0 = malloc(in_size0 * sizeof(char)); int8_t *input_tmp1 = malloc(in_size1 * sizeof(char)); - float *src_in0 = (float *)(buffer + 3 + 3 * input0->dim_count); - float *src_in1 = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); - float *ref = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); + 
float *src_in0 = (float *)(buffer + 3 + 3 * input0->dim_count); + float *src_in1 = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); + float *ref = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); input0->data = src_in0; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - input_tmp0[i] = csi_ref_quantize_f32_to_i8(src_in0[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + input_tmp0[i] = shl_ref_quantize_f32_to_i8(src_in0[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in0[i], input0->qinfo); - if(src_in0[i] == INFINITY && output_tmp == INFINITY || src_in0[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in0[i], input0->qinfo); + if (src_in0[i] == INFINITY && output_tmp == INFINITY || + src_in0[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in0[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in0[i] - output_tmp)/fabs(src_in0[i] + 1e-9); + error1 = fabs(src_in0[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in0[i] - output_tmp) / fabs(src_in0[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -112,26 +113,26 @@ int main(int argc, char** argv) input1->data = src_in1; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - input_tmp1[i] = csi_ref_quantize_f32_to_i8(src_in1[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + input_tmp1[i] = shl_ref_quantize_f32_to_i8(src_in1[i], input1->qinfo); } output->data = ref; get_quant_info(output); - input0->data = input_tmp0; - input1->data = input_tmp1; - reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + input0->data = input_tmp0; + input1->data = input_tmp1; + reference->data = ref; + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 
? atof(argv[2]) : 0.9; - if (csi_matmul_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_matmul(input0, input1, output, &params); + if (csinn_matmul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_matmul(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); - + free(buffer); free(input_tmp0); free(input_tmp1); diff --git a/tests/validation/matmul_u8.c b/tests/validation/matmul_u8.c index 2a88dd2e..8b2a58c5 100644 --- a/tests/validation/matmul_u8.c +++ b/tests/validation/matmul_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of matmul u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct matmul_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_matmul_params *params = + csinn_alloc_params(sizeof(struct csinn_matmul_params), NULL); int in_size0, in_size1, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; @@ -37,8 +38,8 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[2]; output->dim_count = input0->dim_count; - params.trans_a = buffer[0]; - params.trans_b = buffer[1]; + params->trans_a = buffer[0]; + params->trans_b = buffer[1]; for (int i = 0; i < input0->dim_count; ++i) { input0->dim[i] = buffer[3 + i];
input1->dim[i] = buffer[3 + input0->dim_count + i]; @@ -74,35 +75,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; uint8_t *input_tmp0 = malloc(in_size0 * sizeof(char)); uint8_t *input_tmp1 = malloc(in_size1 * sizeof(char)); - float *src_in0 = (float *)(buffer + 3 + 3 * input0->dim_count); - float *src_in1 = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); - float *ref = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); - + float *src_in0 = (float *)(buffer + 3 + 3 * input0->dim_count); + float *src_in1 = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); + float *ref = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); input0->data = src_in0; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - input_tmp0[i] = csi_ref_quantize_f32_to_u8(src_in0[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + input_tmp0[i] = shl_ref_quantize_f32_to_u8(src_in0[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in0[i], input0->qinfo); - if(src_in0[i] == INFINITY && output_tmp == INFINITY || src_in0[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in0[i], input0->qinfo); + if (src_in0[i] == INFINITY && output_tmp == INFINITY || + src_in0[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in0[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in0[i] - output_tmp)/fabs(src_in0[i] + 1e-9); + error1 = fabs(src_in0[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in0[i] - output_tmp) / fabs(src_in0[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -112,26 +112,26 @@ int 
main(int argc, char** argv) input1->data = src_in1; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - input_tmp1[i] = csi_ref_quantize_f32_to_u8(src_in1[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + input_tmp1[i] = shl_ref_quantize_f32_to_u8(src_in1[i], input1->qinfo); } output->data = ref; get_quant_info(output); - input0->data = input_tmp0; - input1->data = input_tmp1; - reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + input0->data = input_tmp0; + input1->data = input_tmp1; + reference->data = ref; + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_matmul_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_matmul(input0, input1, output, &params); + if (csinn_matmul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_matmul(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); - + free(buffer); free(input_tmp0); free(input_tmp1); diff --git a/tests/validation/max_stride_f32.c b/tests/validation/max_stride_f32.c index 0736dd45..b5d68c2a 100644 --- a/tests/validation/max_stride_f32.c +++ b/tests/validation/max_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of max f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_max_init(input, output, &params) == CSINN_TRUE) { - csi_max(input, output, &params); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_max_init(input, output, params) == CSINN_TRUE) { + csinn_max(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/max_stride_u8.c b/tests/validation/max_stride_u8.c index 3db21059..c00a6e35 100644 --- a/tests/validation/max_stride_u8.c +++
b/tests/validation/max_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of max u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_max_init(input, output, &params) == CSINN_TRUE) { - csi_max(input, output, &params); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_max_init(input, output, params) == CSINN_TRUE) { + csinn_max(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/maximum_f32.c b/tests/validation/maximum_f32.c index 66182bf7..37b57fad 100644 --- a/tests/validation/maximum_f32.c +++
b/tests/validation/maximum_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; in_size *= input0->dim[i]; @@ -47,17 +47,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - output->data = malloc(out_size * sizeof(float)); + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * 
in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maximum_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_maximum(input0, input1, output, &params); + if (csinn_maximum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_maximum(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/maximum_i8.c b/tests/validation/maximum_i8.c index 8f00f7e8..60dbfa8c 100644 --- a/tests/validation/maximum_i8.c +++ b/tests/validation/maximum_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -40,7 +40,7 @@ int main(int argc, char** argv) input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -62,36 +62,34 @@ int
main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in1 = (float *)(buffer + 1 + input0->dim_count); - float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count); + float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); int8_t *src_tmp1 = malloc(in_size * sizeof(char)); int8_t *src_tmp2 = malloc(in_size * sizeof(char)); input0->data = src_in1; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src_tmp1[i] = csi_ref_quantize_f32_to_i8(src_in1[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp1[i] = shl_ref_quantize_f32_to_i8(src_in1[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp1[i], input0->qinfo); - if(isinf(src_in1[i]) || isnan(src_in1[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp1[i], input0->qinfo); + if (isinf(src_in1[i]) || isnan(src_in1[i])) { continue; } else { - error1 = fabs(src_in1[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + error1 = fabs(src_in1[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,40 +97,39 @@ int main(int argc, char** argv) input1->data = src_in2; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src_tmp2[i] = csi_ref_quantize_f32_to_i8(src_in2[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp2[i] 
= shl_ref_quantize_f32_to_i8(src_in2[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp2[i], input1->qinfo); - if(isinf(src_in2[i]) || isnan(src_in2[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp2[i], input1->qinfo); + if (isinf(src_in2[i]) || isnan(src_in2[i])) { continue; } else { - error1 = fabs(src_in2[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in2[i] - output_tmp)/fabs(src_in2[i] + 1e-9); + error1 = fabs(src_in2[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in2[i] - output_tmp) / fabs(src_in2[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input0->data = src_tmp1; - input1->data = src_tmp2; + input0->data = src_tmp1; + input1->data = src_tmp2; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maximum_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_maximum(input0, input1, output, &params); + if (csinn_maximum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_maximum(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/maximum_u8.c b/tests/validation/maximum_u8.c index be045c01..7ac00b16 100644 --- a/tests/validation/maximum_u8.c +++ b/tests/validation/maximum_u8.c @@ -16,21 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -40,7 +40,7 @@ int main(int argc, char** argv) input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -62,36 +62,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in1 = (float *)(buffer + 1 + input0->dim_count); - float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count); + float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); uint8_t *src_tmp1 = malloc(in_size * 
sizeof(char)); uint8_t *src_tmp2 = malloc(in_size * sizeof(char)); input0->data = src_in1; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src_tmp1[i] = csi_ref_quantize_f32_to_u8(src_in1[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp1[i] = shl_ref_quantize_f32_to_u8(src_in1[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp1[i], input0->qinfo); - if(isinf(src_in1[i]) || isnan(src_in1[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp1[i], input0->qinfo); + if (isinf(src_in1[i]) || isnan(src_in1[i])) { continue; } else { - error1 = fabs(src_in1[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + error1 = fabs(src_in1[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,40 +97,39 @@ int main(int argc, char** argv) input1->data = src_in2; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src_tmp2[i] = csi_ref_quantize_f32_to_u8(src_in2[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp2[i] = shl_ref_quantize_f32_to_u8(src_in2[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp2[i], input1->qinfo); - if(isinf(src_in2[i]) || isnan(src_in2[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp2[i], input1->qinfo); + if (isinf(src_in2[i]) || isnan(src_in2[i])) { continue; } else { - error1 = fabs(src_in2[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in2[i] - output_tmp)/fabs(src_in2[i] + 1e-9); + error1 = fabs(src_in2[i] - output_tmp); + if (error1 > 1e-6) { + error1 = 
fabs(src_in2[i] - output_tmp) / fabs(src_in2[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input0->data = src_tmp1; - input1->data = src_tmp2; + input0->data = src_tmp1; + input1->data = src_tmp2; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maximum_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_maximum(input0, input1, output, &params); + if (csinn_maximum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_maximum(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/maxpool3d_f32.c b/tests/validation/maxpool3d_f32.c index 522910a7..06e36fcf 100644 --- a/tests/validation/maxpool3d_f32.c +++ b/tests/validation/maxpool3d_f32.c @@ -16,29 +16,29 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool3d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -46,20 +46,20 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; + params->stride_depth = buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - 
params.base.layout = CSINN_LAYOUT_NCDHW; + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -68,17 +68,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 20); reference->data = (float *)(buffer + 20 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maxpool3d_init(input, output, &params) == CSINN_TRUE) { - csi_maxpool3d(input, output, &params); + if (csinn_maxpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool3d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool3d_i8.c b/tests/validation/maxpool3d_i8.c index 93d7e7dc..28d45f97 100644 --- a/tests/validation/maxpool3d_i8.c +++ b/tests/validation/maxpool3d_i8.c @@ -16,20 +16,20 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool3d i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,11 +37,11 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -49,63 +49,62 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; - - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->stride_depth = buffer[5]; + params->stride_height = 
buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; + + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCDHW; input->is_const = 0; input->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCDHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 5; output->dim_count = 5; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 20); - float *ref = (float *)(buffer + 20 + in_size); + float *src_in = (float *)(buffer + 20); + float *ref = (float *)(buffer + 20 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -113,15 +112,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_maxpool3d_init(input, output, &params) == CSINN_TRUE) { - csi_maxpool3d(input, output, &params); + if (csinn_maxpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool3d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool3d_u8.c b/tests/validation/maxpool3d_u8.c index c4f1cb9a..f64dd5a9 100644 --- a/tests/validation/maxpool3d_u8.c +++ b/tests/validation/maxpool3d_u8.c @@ -16,20 +16,20 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool3d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,11 +37,11 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -49,20 +49,20 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; - - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->stride_depth = buffer[5]; + params->stride_height = 
buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; + + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCDHW; @@ -73,39 +73,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCDHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 5; output->dim_count = 5; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 20); - float *ref = (float *)(buffer + 20 + in_size); + float *src_in = (float *)(buffer + 20); + float *ref = (float *)(buffer + 20 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if 
(error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -113,15 +112,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_maxpool3d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool3d(input, output, ¶ms); + if (csinn_maxpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool3d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool_f32.c b/tests/validation/maxpool_f32.c index 76b3bfd4..07bbc6c0 100644 --- a/tests/validation/maxpool_f32.c +++ b/tests/validation/maxpool_f32.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +62,16 @@ int main(int argc, char** 
argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 14); reference->data = (float *)(buffer + 14 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool_nchw_f32.c b/tests/validation/maxpool_nchw_f32.c index 3a147919..11ff07ad 100644 --- a/tests/validation/maxpool_nchw_f32.c +++ b/tests/validation/maxpool_nchw_f32.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +62,16 @@ int main(int argc, 
char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 14); reference->data = (float *)(buffer + 14 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool_u8.c b/tests/validation/maxpool_u8.c index 81528a3d..9d8566cb 100644 --- a/tests/validation/maxpool_u8.c +++ b/tests/validation/maxpool_u8.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; @@ -64,37 +64,36 @@ int main(int argc, char** argv) 
output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/mean_stride_f32.c b/tests/validation/mean_stride_f32.c index 1d73e12b..493af418 100644 --- a/tests/validation/mean_stride_f32.c +++ b/tests/validation/mean_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mean f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_mean_init(input, output, ¶ms) == CSINN_TRUE) { - csi_mean(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_mean_init(input, output, params) == CSINN_TRUE) { + csinn_mean(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/mean_stride_u8.c b/tests/validation/mean_stride_u8.c index d73bc410..f0d647cc 100644 --- a/tests/validation/mean_stride_u8.c +++ 
b/tests/validation/mean_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mean u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_mean_init(input, output, ¶ms) == CSINN_TRUE) { - csi_mean(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_mean_init(input, output, params) == CSINN_TRUE) { + csinn_mean(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/min_stride_f32.c b/tests/validation/min_stride_f32.c index 00466290..05e85148 100644 --- a/tests/validation/min_stride_f32.c +++ 
b/tests/validation/min_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of min f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_min_init(input, output, ¶ms) == CSINN_TRUE) { - csi_min(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_min_init(input, output, params) == CSINN_TRUE) { + csinn_min(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/min_stride_u8.c b/tests/validation/min_stride_u8.c index 
9c72841f..af98b30a 100644 --- a/tests/validation/min_stride_u8.c +++ b/tests/validation/min_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of min u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_min_init(input, output, ¶ms) == CSINN_TRUE) { - csi_min(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_min_init(input, output, params) == CSINN_TRUE) { + csinn_min(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/minimum_f32.c b/tests/validation/minimum_f32.c index f724c2b0..9196e056 
100644 --- a/tests/validation/minimum_f32.c +++ b/tests/validation/minimum_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; in_size *= input0->dim[i]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - output->data = malloc(out_size * sizeof(float)); + input0->data = (float *)(buffer + 1 + input0->dim_count); + 
input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_minimum_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_minimum(input0, input1, output, ¶ms); + if (csinn_minimum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_minimum(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/minimum_i8.c b/tests/validation/minimum_i8.c index 99a888b7..849e583a 100644 --- a/tests/validation/minimum_i8.c +++ b/tests/validation/minimum_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -40,7 +40,7 @@ int main(int argc, char** argv) input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < 
input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -62,36 +62,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in1 = (float *)(buffer + 1 + input0->dim_count); - float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count); + float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); int8_t *src_tmp1 = malloc(in_size * sizeof(char)); int8_t *src_tmp2 = malloc(in_size * sizeof(char)); input0->data = src_in1; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src_tmp1[i] = csi_ref_quantize_f32_to_i8(src_in1[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp1[i] = shl_ref_quantize_f32_to_i8(src_in1[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp1[i], input0->qinfo); - if(isinf(src_in1[i]) || isnan(src_in1[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp1[i], input0->qinfo); + if (isinf(src_in1[i]) || isnan(src_in1[i])) { continue; } else { - error1 = fabs(src_in1[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + error1 = fabs(src_in1[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,23 +97,23 @@ int main(int argc, char** argv) input1->data = src_in2; get_quant_info(input1); - for(int i 
= 0; i < in_size; i++) { - src_tmp2[i] = csi_ref_quantize_f32_to_i8(src_in2[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp2[i] = shl_ref_quantize_f32_to_i8(src_in2[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp2[i], input1->qinfo); - if(isinf(src_in2[i]) || isnan(src_in2[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp2[i], input1->qinfo); + if (isinf(src_in2[i]) || isnan(src_in2[i])) { continue; } else { - error1 = fabs(src_in2[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in2[i] - output_tmp)/fabs(src_in2[i] + 1e-9); + error1 = fabs(src_in2[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in2[i] - output_tmp) / fabs(src_in2[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -123,15 +121,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp1; - input1->data = src_tmp2; + input0->data = src_tmp1; + input1->data = src_tmp2; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_minimum_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_minimum(input0, input1, output, ¶ms); + if (csinn_minimum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_minimum(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/minimum_u8.c b/tests/validation/minimum_u8.c index 879c6cf4..df754a6c 100644 --- a/tests/validation/minimum_u8.c +++ b/tests/validation/minimum_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -40,7 +40,7 @@ int main(int argc, char** argv) input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -62,36 +62,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in1 = (float *)(buffer + 1 + input0->dim_count); - float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count); + float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); uint8_t *src_tmp1 = malloc(in_size * 
sizeof(char)); uint8_t *src_tmp2 = malloc(in_size * sizeof(char)); input0->data = src_in1; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src_tmp1[i] = csi_ref_quantize_f32_to_u8(src_in1[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp1[i] = shl_ref_quantize_f32_to_u8(src_in1[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp1[i], input0->qinfo); - if(isinf(src_in1[i]) || isnan(src_in1[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp1[i], input0->qinfo); + if (isinf(src_in1[i]) || isnan(src_in1[i])) { continue; } else { - error1 = fabs(src_in1[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + error1 = fabs(src_in1[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,23 +97,23 @@ int main(int argc, char** argv) input1->data = src_in2; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src_tmp2[i] = csi_ref_quantize_f32_to_u8(src_in2[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp2[i] = shl_ref_quantize_f32_to_u8(src_in2[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp2[i], input1->qinfo); - if(isinf(src_in2[i]) || isnan(src_in2[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp2[i], input1->qinfo); + if (isinf(src_in2[i]) || isnan(src_in2[i])) { continue; } else { - error1 = fabs(src_in2[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in2[i] - output_tmp)/fabs(src_in2[i] + 1e-9); + error1 = fabs(src_in2[i] - output_tmp); + if (error1 > 1e-6) { + error1 = 
fabs(src_in2[i] - output_tmp) / fabs(src_in2[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -123,15 +121,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp1; - input1->data = src_tmp2; + input0->data = src_tmp1; + input1->data = src_tmp2; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_minimum_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_minimum(input0, input1, output, ¶ms); + if (csinn_minimum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_minimum(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/mod_f32.c b/tests/validation/mod_f32.c index b2057c14..560b9a8b 100644 --- a/tests/validation/mod_f32.c +++ b/tests/validation/mod_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mod f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,7 +50,7 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -62,17 +62,16 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - input1->data = (float *)(buffer + 5 + in_size0); + input0->data = (float *)(buffer + 5); + input1->data = (float *)(buffer + 5 + in_size0); 
reference->data = (float *)(buffer + 5 + in_size0 + in_size1); - output->data = malloc(in_size0 * sizeof(float)); + output->data = malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_mod_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mod(input0, input1, output, ¶ms); + if (csinn_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mod(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/mod_i8.c b/tests/validation/mod_i8.c index 60cddfdc..7ec577cd 100644 --- a/tests/validation/mod_i8.c +++ b/tests/validation/mod_i8.c @@ -16,34 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mod i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = 
buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,7 +67,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -80,36 +79,35 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); int8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - int8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + int8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - 
output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -117,23 +115,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -143,17 +141,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_mod_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mod(input0, input1, output, ¶ms); + if (csinn_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mod(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/mod_u8.c b/tests/validation/mod_u8.c index 4ad679f7..7b9e6c88 100644 --- a/tests/validation/mod_u8.c +++ b/tests/validation/mod_u8.c @@ -16,34 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mod u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,7 +67,7 @@ int main(int argc, char** argv) output->is_const 
= 0; output->quant_channel = 1; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -80,36 +79,35 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); uint8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -117,23 +115,23 @@ int main(int argc, char** argv) input1->data = src1_in; 
get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -143,17 +141,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_mod_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mod(input0, input1, output, ¶ms); + if (csinn_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mod(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/mul_f32.c b/tests/validation/mul_f32.c index e30287c1..1d902401 100644 --- a/tests/validation/mul_f32.c +++ b/tests/validation/mul_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mul f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,7 +50,7 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -62,18 +62,17 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - input0->data = (float *)(buffer + 5); - input1->data = (float *)(buffer + 5 + in_size0); + input0->data = (float *)(buffer + 5); + input1->data = (float *)(buffer + 5 + in_size0); 
reference->data = (float *)(buffer + 5 + in_size0 + in_size1); - output->data = malloc(in_size0 * sizeof(float)); + output->data = malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_mul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mul(input0, input1, output, ¶ms); + if (csinn_mul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mul(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/mul_i8.c b/tests/validation/mul_i8.c index a64fbdda..015e6834 100644 --- a/tests/validation/mul_i8.c +++ b/tests/validation/mul_i8.c @@ -16,34 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mul i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = 
buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -67,8 +66,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - if(flag) { + + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -80,60 +79,58 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); int8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - int8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + int8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - 
if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -143,17 +140,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_mul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mul(input0, input1, output, ¶ms); + if (csinn_mul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mul(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/mul_u8.c b/tests/validation/mul_u8.c index 7e7a8042..c9a74df9 100644 --- a/tests/validation/mul_u8.c +++ b/tests/validation/mul_u8.c @@ -16,34 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mul u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,7 +67,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -80,60 +79,58 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + 
float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); uint8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - 
if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -143,17 +140,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_mul_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_mul(input0, input1, output, &params); + if (csinn_mul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mul(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/ndarray_size_f32.c b/tests/validation/ndarray_size_f32.c index 6beda6d3..0275aba1 100644 --- a/tests/validation/ndarray_size_f32.c +++ b/tests/validation/ndarray_size_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ndarray size f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct ndarray_size_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_ndarray_size_params *params; int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 1; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -43,16 +43,15 @@ int main(int argc, char** argv) out_size = 1; input->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_ndarray_size_init(input, output, &params) == CSINN_TRUE) { - csi_ndarray_size(input, output, &params); + if (csinn_ndarray_size_init(input, output, params) == CSINN_TRUE) { + csinn_ndarray_size(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/ndarray_size_i8.c b/tests/validation/ndarray_size_i8.c index cbc2209b..949ccc21 100644 --- a/tests/validation/ndarray_size_i8.c +++ b/tests/validation/ndarray_size_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ndarray size i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct ndarray_size_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_ndarray_size_params *params; int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 1; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -55,33 +55,29 @@ int main(int argc, char** argv) input->is_const = 0; input->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = 
(float *)(buffer + 1 + input->dim_count); + float *src_in = (float *)(buffer + 1 + input->dim_count); float *ref = (float *)(buffer + 1 + input->dim_count + in_size); float difference = argc > 2 ? atof(argv[2]) : 0.9; int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); - if (csi_ndarray_size_init(input, output, &params) == CSINN_TRUE) { - csi_ndarray_size(input, output, &params); + if (csinn_ndarray_size_init(input, output, params) == CSINN_TRUE) { + csinn_ndarray_size(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/ndarray_size_u8.c b/tests/validation/ndarray_size_u8.c index 2a857f6d..de93a691 100644 --- a/tests/validation/ndarray_size_u8.c +++ b/tests/validation/ndarray_size_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ndarray size u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct ndarray_size_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_ndarray_size_params *params; int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 1; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -55,33 +55,29 @@ int main(int argc, char** argv) input->is_const = 0; input->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 1 + input->dim_count); + float *src_in = (float *)(buffer + 1 + input->dim_count); float *ref = (float *)(buffer + 1 + input->dim_count + in_size); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); - if (csi_ndarray_size_init(input, output, &params) == CSINN_TRUE) { - csi_ndarray_size(input, output, &params); + if (csinn_ndarray_size_init(input, output, params) == CSINN_TRUE) { + csinn_ndarray_size(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/negative_f32.c b/tests/validation/negative_f32.c index c676f2e1..08468e1c 100644 --- a/tests/validation/negative_f32.c +++ b/tests/validation/negative_f32.c @@ -16,25 +16,25 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -43,16 +43,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_negative_init(input, output, &params) == CSINN_TRUE) { - csi_negative(input, output, &params); + if (csinn_negative_init(input, output, params) == CSINN_TRUE) { + csinn_negative(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/negative_i8.c b/tests/validation/negative_i8.c index 817599a6..fba5d198 100644 --- a/tests/validation/negative_i8.c +++ b/tests/validation/negative_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,35 +54,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float 
*src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,18 +87,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_negative_init(input, output, &params) == CSINN_TRUE) { - csi_negative(input, output, &params); + if (csinn_negative_init(input, output, params) == CSINN_TRUE) { + csinn_negative(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/negative_u8.c b/tests/validation/negative_u8.c index 731e001b..7474c2e3 100644 --- a/tests/validation/negative_u8.c +++ b/tests/validation/negative_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,35 +54,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - 
float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,18 +87,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_negative_init(input, output, &params) == CSINN_TRUE) { - csi_negative(input, output, &params); + if (csinn_negative_init(input, output, params) == CSINN_TRUE) { + csinn_negative(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/non_max_suppression_f32.c b/tests/validation/non_max_suppression_f32.c index df3644fd..b7700d33 100644 --- a/tests/validation/non_max_suppression_f32.c +++ b/tests/validation/non_max_suppression_f32.c @@ -16,21 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of non_max_suppression f32.\n"); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct non_max_suppression_params params; + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_non_max_suppression_params *params = + csinn_alloc_params(sizeof(struct csinn_non_max_suppression_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); @@ -40,28 +41,27 @@ int main(int argc, char** argv) input0->dim[1] = 4; input1->dim[0] = buffer[0]; - params.max_output_size = buffer[1]; - params.iou_threshold = *((float *)buffer + 3); + params->max_output_size = buffer[1]; + params->iou_threshold = *((float *)buffer + 3); output->dim_count = 2; - output->dim[0] = params.max_output_size; + output->dim[0] = params->max_output_size; 
output->dim[1] = 4; - in_size = input0->dim[0] * 4; + in_size = input0->dim[0] * 4; out_size = buffer[2]; input0->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (int *)(buffer + 4 + in_size + in_size / 4); - output->data = (int *)malloc(out_size * sizeof(int)); + output->data = (int *)malloc(out_size * sizeof(int)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_non_max_suppression_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_non_max_suppression(input0, input1, output, &params); + if (csinn_non_max_suppression_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_non_max_suppression(input0, input1, output, params); } result_verify_int32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/not_equal_f32.c b/tests/validation/not_equal_f32.c index 0840a9d0..6e19426b 100644 --- a/tests/validation/not_equal_f32.c +++ b/tests/validation/not_equal_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not equal f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,17 +50,16 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_not_equal_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_not_equal(input0, input1, output, &params); + if (csinn_not_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_not_equal(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/not_equal_i8.c b/tests/validation/not_equal_i8.c index 7dfe9457..f9d66679 100644 --- a/tests/validation/not_equal_i8.c +++ b/tests/validation/not_equal_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not equal i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - - input1->dim[0] = buffer[0]; - input1->dim[1] = 
buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -73,37 +72,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -111,23 +107,23 @@ int main(int argc, 
char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,17 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - - - float difference = argc > 2 ? atof(argv[2]) : 0.9; + output->data = malloc(in_size * sizeof(char)); + float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_not_equal_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_not_equal(input0, input1, output, &params); + if (csinn_not_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_not_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/not_equal_u8.c b/tests/validation/not_equal_u8.c index 080ff77c..06f5c547 100644 --- a/tests/validation/not_equal_u8.c +++ b/tests/validation/not_equal_u8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not equal u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - 
input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,39 +67,36 @@ int main(int argc, char** argv) input1->layout = CSINN_LAYOUT_NCHW; input1->is_const = 0; input1->quant_channel = 1; - - output->dtype = CSINN_DTYPE_UINT8; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + output->dtype = CSINN_DTYPE_UINT8; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { 
error[0] = error1; } } @@ -108,23 +104,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,17 +130,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - - - float difference = argc > 2 ? atof(argv[2]) : 0.9; + output->data = malloc(in_size * sizeof(char)); + float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_not_equal_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_not_equal(input0, input1, output, &params); + if (csinn_not_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_not_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/not_f32.c b/tests/validation/not_f32.c index 68e6a84d..c7e85898 100644 --- a/tests/validation/not_f32.c +++ b/tests/validation/not_f32.c @@ -16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); 
reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_not_init(input, output, &params) == CSINN_TRUE) { - csi_not(input, output, &params); + if (csinn_not_init(input, output, params) == CSINN_TRUE) { + csinn_not(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/not_u32.c b/tests/validation/not_u32.c index 597a4af9..1319c19e 100644 --- a/tests/validation/not_u32.c +++ b/tests/validation/not_u32.c @@ -16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not u32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,20 +44,19 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_UINT32; output->dtype = CSINN_DTYPE_UINT32; - params.base.api = CSINN_API; - params.base.run_mode 
= CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (uint32_t *)(buffer + 1 + input->dim_count); + input->data = (uint32_t *)(buffer + 1 + input->dim_count); reference->data = (uint32_t *)(buffer + 1 + input->dim_count + in_size); - output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); + output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_not_init(input, output, &params) == CSINN_TRUE) { - csi_not(input, output, &params); + if (csinn_not_init(input, output, params) == CSINN_TRUE) { + csinn_not(input, output, params); } result_verify_int32(reference->data, output->data, input->data, difference, out_size, false); - + free(buffer); free(output->data); return done_testing(); diff --git a/tests/validation/or_u32.c b/tests/validation/or_u32.c index 12e05f4e..4d7e73e8 100644 --- a/tests/validation/or_u32.c +++ b/tests/validation/or_u32.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of or u32.\n"); - struct csi_tensor *input_0 = csi_alloc_tensor(NULL); - struct csi_tensor *input_1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input_0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input_1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input_0->dim_count = buffer[0]; input_1->dim_count = buffer[0]; output->dim_count = 
input_0->dim_count; - for(int i = 0; i < input_0->dim_count; i++) { + for (int i = 0; i < input_0->dim_count; i++) { input_0->dim[i] = buffer[i + 1]; input_1->dim[i] = buffer[i + 1]; output->dim[i] = input_0->dim[i]; @@ -48,20 +48,18 @@ int main(int argc, char** argv) input_0->dtype = CSINN_DTYPE_UINT32; input_1->dtype = CSINN_DTYPE_UINT32; output->dtype = CSINN_DTYPE_UINT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); - input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); + input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); + input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input_0->dim_count + 2 * in_size); - output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); + output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_or_init(input_0, input_1, output, &params) == CSINN_TRUE) { - csi_or(input_0, input_1, output, &params); + if (csinn_or_init(input_0, input_1, output, params) == CSINN_TRUE) { + csinn_or(input_0, input_1, output, params); } - result_verify_int32(reference->data, output->data, input_0->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/pad_f32.c b/tests/validation/pad_f32.c index c2fae295..7911c11c 100644 --- a/tests/validation/pad_f32.c +++ b/tests/validation/pad_f32.c @@ -16,29 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; @@ -52,32 +51,30 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; - params.pad_num = input->dim_count; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NHWC; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; + params->pad_num = input->dim_count; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC - int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC + int32_t 
pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC + int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC - params.pad_before = pad_before; - params.pad_after = pad_after; + params->pad_before = pad_before; + params->pad_after = pad_after; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_pad_init(input, output, &params) == CSINN_TRUE) { - csi_pad(input, output, &params); + if (csinn_pad_init(input, output, params) == CSINN_TRUE) { + csinn_pad(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/pad_nchw_f32.c b/tests/validation/pad_nchw_f32.c index 8a386eba..283ff2d9 100644 --- a/tests/validation/pad_nchw_f32.c +++ b/tests/validation/pad_nchw_f32.c @@ -16,29 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; @@ -51,32 +50,30 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; - params.pad_num = input->dim_count; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; + params->pad_num = input->dim_count; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC - int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC + int32_t 
pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC + int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC - params.pad_before = pad_before; - params.pad_after = pad_after; + params->pad_before = pad_before; + params->pad_after = pad_after; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_pad_init(input, output, &params) == CSINN_TRUE) { - csi_pad(input, output, &params); + if (csinn_pad_init(input, output, params) == CSINN_TRUE) { + csinn_pad(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/pad_nchw_u8.c b/tests/validation/pad_nchw_u8.c index e2ad2861..5aa2021b 100644 --- a/tests/validation/pad_nchw_u8.c +++ b/tests/validation/pad_nchw_u8.c @@ -16,29 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; @@ -58,47 +57,44 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC - int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC + int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC + int32_t 
pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC - params.pad_before = pad_before; - params.pad_after = pad_after; - params.pad_num = input->dim_count; + params->pad_before = pad_before; + params->pad_after = pad_after; + params->pad_num = input->dim_count; - - float *src_in = (float *)(buffer + 8); - float *ref = (float *)(buffer + 8 + in_size); + float *src_in = (float *)(buffer + 8); + float *ref = (float *)(buffer + 8 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_pad_init(input, output, &params) == CSINN_TRUE) { - csi_pad(input, output, &params); + if (csinn_pad_init(input, output, params) == CSINN_TRUE) { + csinn_pad(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/pad_u8.c b/tests/validation/pad_u8.c index 06e55e77..97ac948f 100644 --- a/tests/validation/pad_u8.c +++ b/tests/validation/pad_u8.c @@ -16,29 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; @@ -59,46 +58,44 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NHWC; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC - int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC + int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC + int32_t 
pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC - params.pad_before = pad_before; - params.pad_after = pad_after; - params.pad_num = input->dim_count; + params->pad_before = pad_before; + params->pad_after = pad_after; + params->pad_num = input->dim_count; - float *src_in = (float *)(buffer + 8); - float *ref = (float *)(buffer + 8 + in_size); + float *src_in = (float *)(buffer + 8); + float *ref = (float *)(buffer + 8 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_pad_init(input, output, &params) == CSINN_TRUE) { - csi_pad(input, output, &params); + if (csinn_pad_init(input, output, params) == CSINN_TRUE) { + csinn_pad(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/pow_f32.c b/tests/validation/pow_f32.c index 135210f5..8ffb70ea 100644 --- a/tests/validation/pow_f32.c +++ b/tests/validation/pow_f32.c @@ -16,34 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pow f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel - input1->dim[0] = buffer[0]; // batch - input1->dim[1] = buffer[1]; // height - input1->dim[2] = buffer[2]; // width - input1->dim[3] = buffer[3]; // channel + input1->dim[0] = buffer[0]; // batch + input1->dim[1] = buffer[1]; // height + input1->dim[2] = buffer[2]; // width + input1->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -57,17 +57,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer 
+ 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_power_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_power(input0, input1, output, &params); + if (csinn_power_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_power(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/pow_i8.c b/tests/validation/pow_i8.c index f589632c..ad1168e5 100644 --- a/tests/validation/pow_i8.c +++ b/tests/validation/pow_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pow i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] 
= buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -73,61 +72,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = 
fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - - if (csi_power_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_power(input0, input1, output, &params); + if (csinn_power_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_power(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/pow_u8.c b/tests/validation/pow_u8.c index e1075b81..42b86e09 100644 --- a/tests/validation/pow_u8.c +++ b/tests/validation/pow_u8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pow u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - 
input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -73,61 +72,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; 
i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_power_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_power(input0, input1, output, ¶ms); + if (csinn_power_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_power(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/prelu_f32.c b/tests/validation/prelu_f32.c index ee33ce26..121088f3 100644 --- a/tests/validation/prelu_f32.c +++ b/tests/validation/prelu_f32.c @@ -16,49 +16,48 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // channel - output->dim[2] = input->dim[2] = buffer[2]; // height - output->dim[3] = input->dim[3] = buffer[3]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // channel + output->dim[2] = input->dim[2] = buffer[2]; // height + output->dim[3] = input->dim[3] = buffer[3]; // width alpha_data->dim[0] = buffer[1]; input->dim_count = 4; output->dim_count = 4; - input->dtype = CSINN_DTYPE_FLOAT32; - output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + input->dtype = CSINN_DTYPE_FLOAT32; + output->dtype = CSINN_DTYPE_FLOAT32; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); + input->data = (float *)(buffer + 4); alpha_data->data = 
(float *)(buffer + 4 + in_size); - reference->data = (float *)(buffer + 4 + in_size + input->dim[1]); - output->data = malloc(in_size * sizeof(float)); + reference->data = (float *)(buffer + 4 + in_size + input->dim[1]); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_prelu_init(input, alpha_data, output, ¶ms) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, ¶ms); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_i8.c b/tests/validation/prelu_i8.c index 7168a2cf..2b0cae50 100644 --- a/tests/validation/prelu_i8.c +++ b/tests/validation/prelu_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - 
output->dim[1] = input->dim[1] = buffer[1]; // channel - output->dim[2] = input->dim[2] = buffer[2]; // height - output->dim[3] = input->dim[3] = buffer[3]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // channel + output->dim[2] = input->dim[2] = buffer[2]; // height + output->dim[3] = input->dim[3] = buffer[3]; // width alpha_data->dim[0] = buffer[1]; input->dim_count = 4; alpha_data->dim_count = 1; @@ -60,64 +60,62 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); + float *src_in = (float *)(buffer + 4); float *alpha_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + in_size + input->dim[1]); + float *ref = (float *)(buffer + 4 + in_size + input->dim[1]); int8_t *src_tmp = malloc(in_size * sizeof(char)); int8_t *alpha_tmp = malloc(input->dim[1] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - alpha_data->data = alpha_in; get_quant_info(alpha_data); - for(int i = 0; i < input->dim[1]; i++) { - alpha_tmp[i] = csi_ref_quantize_f32_to_i8(alpha_in[i], alpha_data->qinfo); + for (int i = 0; i < input->dim[1]; i++) { + alpha_tmp[i] = shl_ref_quantize_f32_to_i8(alpha_in[i], alpha_data->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[1]; i++) { + for (int i = 0; i < input->dim[1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(alpha_tmp[i], alpha_data->qinfo); - if(isinf(alpha_in[i]) || isnan(alpha_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(alpha_tmp[i], alpha_data->qinfo); + if (isinf(alpha_in[i]) || isnan(alpha_in[i])) { continue; } else { - error1 = fabs(alpha_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(alpha_in[i] - output_tmp)/fabs(alpha_in[i] + 1e-9); + error1 = fabs(alpha_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(alpha_in[i] - output_tmp) / fabs(alpha_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -125,16 +123,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; alpha_data->data = alpha_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, ¶ms) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, ¶ms); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_nhwc_f32.c b/tests/validation/prelu_nhwc_f32.c index ef1410a3..814039c0 100644 --- a/tests/validation/prelu_nhwc_f32.c +++ b/tests/validation/prelu_nhwc_f32.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu nhwc f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width - output->dim[3] = input->dim[3] = buffer[3]; // channel + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[3] = input->dim[3] = buffer[3]; // channel 
input->dim_count = 4; output->dim_count = 4; - input->dtype = CSINN_DTYPE_FLOAT32; - output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; + input->dtype = CSINN_DTYPE_FLOAT32; + output->dtype = CSINN_DTYPE_FLOAT32; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); + input->data = (float *)(buffer + 4); alpha_data->data = (float *)(buffer + 4 + in_size); - reference->data = (float *)(buffer + 4 + in_size + input->dim[3]); - output->data = malloc(in_size * sizeof(float)); + reference->data = (float *)(buffer + 4 + in_size + input->dim[3]); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, ¶ms) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, ¶ms); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_nhwc_i8.c b/tests/validation/prelu_nhwc_i8.c index f4943ba9..c864f527 100644 --- a/tests/validation/prelu_nhwc_i8.c +++ b/tests/validation/prelu_nhwc_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu nhwc i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width - output->dim[3] = input->dim[3] = buffer[3]; // channel + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[3] = input->dim[3] = buffer[3]; // channel alpha_data->dim[0] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -60,63 +60,61 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - - float *src_in = (float *)(buffer + 4); + float *src_in = (float *)(buffer + 4); float *alpha_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + in_size + input->dim[3]); + float *ref = (float *)(buffer + 4 + in_size + input->dim[3]); int8_t *src_tmp = malloc(in_size * sizeof(char)); int8_t *alpha_tmp = malloc(input->dim[3] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } alpha_data->data = alpha_in; get_quant_info(alpha_data); - for(int i = 0; i < input->dim[3]; i++) { - alpha_tmp[i] = csi_ref_quantize_f32_to_i8(alpha_in[i], alpha_data->qinfo); + for (int i = 0; i < input->dim[3]; i++) { + alpha_tmp[i] = shl_ref_quantize_f32_to_i8(alpha_in[i], alpha_data->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[3]; i++) { + for (int i = 0; i < input->dim[3]; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(alpha_tmp[i], alpha_data->qinfo); - if(isinf(alpha_in[i]) || isnan(alpha_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(alpha_tmp[i], alpha_data->qinfo); + if (isinf(alpha_in[i]) || isnan(alpha_in[i])) { continue; } else { - error1 = fabs(alpha_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(alpha_in[i] - output_tmp)/fabs(alpha_in[i] + 1e-9); + error1 = fabs(alpha_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(alpha_in[i] - output_tmp) / fabs(alpha_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -124,16 +122,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; alpha_data->data = alpha_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, ¶ms) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, ¶ms); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_nhwc_u8.c b/tests/validation/prelu_nhwc_u8.c index 7e6181d5..3990d11e 100644 --- a/tests/validation/prelu_nhwc_u8.c +++ b/tests/validation/prelu_nhwc_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu nhwc u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width - output->dim[3] = input->dim[3] = buffer[3]; // channel + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[3] = input->dim[3] = buffer[3]; // channel alpha_data->dim[0] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -60,63 +60,61 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - - float *src_in = (float *)(buffer + 4); + float *src_in = (float *)(buffer + 4); float *alpha_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + in_size + input->dim[3]); + float *ref = (float *)(buffer + 4 + in_size + input->dim[3]); uint8_t *src_tmp = malloc(in_size * sizeof(char)); uint8_t *alpha_tmp = malloc(input->dim[3] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } alpha_data->data = alpha_in; get_quant_info(alpha_data); - for(int i = 0; i < input->dim[3]; i++) { - alpha_tmp[i] = csi_ref_quantize_f32_to_u8(alpha_in[i], alpha_data->qinfo); + for (int i = 0; i < input->dim[3]; i++) { + alpha_tmp[i] = shl_ref_quantize_f32_to_u8(alpha_in[i], alpha_data->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[3]; i++) { + for (int i = 0; i < input->dim[3]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(alpha_tmp[i], alpha_data->qinfo); - if(isinf(alpha_in[i]) || isnan(alpha_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(alpha_tmp[i], alpha_data->qinfo); + if (isinf(alpha_in[i]) || isnan(alpha_in[i])) { continue; } else { - error1 = fabs(alpha_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(alpha_in[i] - output_tmp)/fabs(alpha_in[i] + 1e-9); + error1 = fabs(alpha_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(alpha_in[i] - output_tmp) / fabs(alpha_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -124,16 +122,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; alpha_data->data = alpha_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, ¶ms) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, ¶ms); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_u8.c b/tests/validation/prelu_u8.c index 23536091..152298b2 100644 --- a/tests/validation/prelu_u8.c +++ b/tests/validation/prelu_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // channel - output->dim[2] = input->dim[2] = buffer[2]; // height - output->dim[3] = input->dim[3] = buffer[3]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // channel + output->dim[2] = input->dim[2] = buffer[2]; // height + output->dim[3] = input->dim[3] = buffer[3]; // width alpha_data->dim[0] = buffer[1]; input->dim_count = 4; alpha_data->dim_count = 1; @@ -60,64 +60,62 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; 
- float *src_in = (float *)(buffer + 4); + float *src_in = (float *)(buffer + 4); float *alpha_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + in_size + input->dim[1]); + float *ref = (float *)(buffer + 4 + in_size + input->dim[1]); uint8_t *src_tmp = malloc(in_size * sizeof(char)); uint8_t *alpha_tmp = malloc(input->dim[1] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - alpha_data->data = alpha_in; get_quant_info(alpha_data); - for(int i = 0; i < input->dim[1]; i++) { - alpha_tmp[i] = csi_ref_quantize_f32_to_u8(alpha_in[i], alpha_data->qinfo); + for (int i = 0; i < input->dim[1]; i++) { + alpha_tmp[i] = shl_ref_quantize_f32_to_u8(alpha_in[i], alpha_data->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[1]; i++) { + for (int i = 0; i < input->dim[1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(alpha_tmp[i], alpha_data->qinfo); - if(isinf(alpha_in[i]) || isnan(alpha_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(alpha_tmp[i], 
alpha_data->qinfo); + if (isinf(alpha_in[i]) || isnan(alpha_in[i])) { continue; } else { - error1 = fabs(alpha_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(alpha_in[i] - output_tmp)/fabs(alpha_in[i] + 1e-9); + error1 = fabs(alpha_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(alpha_in[i] - output_tmp) / fabs(alpha_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -125,16 +123,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; alpha_data->data = alpha_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, ¶ms) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, ¶ms); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/prod_stride_f32.c b/tests/validation/prod_stride_f32.c index d0614aae..4b765253 100644 --- a/tests/validation/prod_stride_f32.c +++ b/tests/validation/prod_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prod f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_prod_init(input, output, &params) == CSINN_TRUE) { - csi_prod(input, output, &params); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_prod_init(input, output, params) == CSINN_TRUE) { + csinn_prod(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/prod_stride_u8.c b/tests/validation/prod_stride_u8.c index 3bb864b3..ffd9603e 100644 --- a/tests/validation/prod_stride_u8.c +++
b/tests/validation/prod_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prod u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_prod_init(input, output, &params) == CSINN_TRUE) { - csi_prod(input, output, &params); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_prod_init(input, output, params) == CSINN_TRUE) { + csinn_prod(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/psroipooling_f32.c b/tests/validation/psroipooling_f32.c index 8cb73906..010f86bb 100644 --- a/tests/validation/psroipooling_f32.c +++
b/tests/validation/psroipooling_f32.c @@ -16,37 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of psropooling f32.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *spatial_scale = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct psroipooling_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *spatial_scale = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_psroipooling_params *params = + csinn_alloc_params(sizeof(struct csinn_psroipooling_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_FLOAT32; input0->name = "input0"; - input0->data = (float *)(buffer + 10); - - + input0->data = (float *)(buffer + 10); input1->dim[0] = buffer[6]; input1->dim[1] = 5; @@ -54,11 +53,10 @@ int main(int argc, char** argv) in1_size = input1->dim[0] * input1->dim[1]; input1->dtype = CSINN_DTYPE_FLOAT32; input1->name = "input1"; - input1->data = (float *)(buffer 
+ 10 + in0_size); - + input1->data = (float *)(buffer + 10 + in0_size); - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = buffer[7]; // output_dim + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = buffer[7]; // output_dim output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; @@ -69,16 +67,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.spatial_scale = *((float *)buffer + 9); - params.output_dim = buffer[7]; - params.group_size = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->spatial_scale = *((float *)buffer + 9); + params->output_dim = buffer[7]; + params->group_size = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_psroipooling_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_psroipooling(input0, input1, output, &params); + if (csinn_psroipooling_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_psroipooling(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/psroipooling_u8.c b/tests/validation/psroipooling_u8.c index 86a24d4d..f242fcbe 100644 --- a/tests/validation/psroipooling_u8.c +++ b/tests/validation/psroipooling_u8.c @@ -16,69 +16,67 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of psropooling u8.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *spatial_scale = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct psroipooling_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *spatial_scale = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_psroipooling_params *params = + csinn_alloc_params(sizeof(struct csinn_psroipooling_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); float *spatial = (float *)(buffer + 9); - params.spatial_scale = *(float *)(buffer + 9); - + params->spatial_scale = *(float *)(buffer + 9); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_UINT8; input0->name = "input0"; - float *src0_in = (float *)(buffer + 10); + float *src0_in = (float *)(buffer + 10); uint8_t *src0_tmp = malloc(in0_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in0_size; i++) { - src0_tmp[i] = 
csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in0_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; in1_size = input1->dim[0] * input1->dim[1]; input1->dtype = CSINN_DTYPE_UINT8; input1->name = "input1"; - float *src1_in = (float *)(buffer + 10 + in0_size); - uint8_t *src1_tmp = malloc(in1_size * sizeof(char)); + float *src1_in = (float *)(buffer + 10 + in0_size); + uint8_t *src1_tmp = malloc(in1_size * sizeof(char)); input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in1_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in1_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = buffer[7]; // output_dim + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = buffer[7]; // output_dim output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->dtype = CSINN_DTYPE_UINT8; - float *ref = (float *)(buffer + 10 + in0_size + in1_size); + float *ref = (float *)(buffer + 10 + in0_size + in1_size); output->name = "output"; output->data = ref; @@ -87,19 +85,18 @@ int main(int argc, char** argv) input0->data = src0_tmp; input1->data = src1_tmp; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-2; - params.output_dim = buffer[7]; - params.group_size = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->output_dim = buffer[7]; + params->group_size = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_psroipooling_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_psroipooling(input0, input1, output, &params); + if (csinn_psroipooling_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_psroipooling(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/reduce_logsumexp_f32.c b/tests/validation/reduce_logsumexp_f32.c index 91d10763..06925ee7 100644 --- a/tests/validation/reduce_logsumexp_f32.c +++ b/tests/validation/reduce_logsumexp_f32.c @@ -16,69 +16,69 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_logsumexp f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size 
= in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_logsumexp_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_logsumexp(input0, output, &params); + if (csinn_reduce_logsumexp_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_logsumexp(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_logsumexp_i8.c b/tests/validation/reduce_logsumexp_i8.c index 08bb0e42..f3be5406 100644 --- a/tests/validation/reduce_logsumexp_i8.c +++ b/tests/validation/reduce_logsumexp_i8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_logsumexp f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,19 +59,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; 
output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -78,30 +77,30 @@ int main(int argc, char** argv) } } - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,16 +108,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - - if (csi_reduce_logsumexp_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_logsumexp(input0, output, &params); + if (csinn_reduce_logsumexp_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_logsumexp(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/reduce_logsumexp_u8.c b/tests/validation/reduce_logsumexp_u8.c index f614c78e..e49aabb0 100644 --- a/tests/validation/reduce_logsumexp_u8.c +++ b/tests/validation/reduce_logsumexp_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_logsumexp f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int
*)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,19 +59,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -78,30 +77,30 @@ int main(int argc, char** argv) } } - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] 
-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,16 +108,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_reduce_logsumexp_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_logsumexp(input0, output, &params); + if (csinn_reduce_logsumexp_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_logsumexp(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/reduce_max_f32.c b/tests/validation/reduce_max_f32.c index 5e62f1b9..e4a6741c 100644 --- a/tests/validation/reduce_max_f32.c +++ b/tests/validation/reduce_max_f32.c @@ -16,70 +16,69 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_max f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + 
reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reduce_max_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_max(input0, output, &params); + if (csinn_reduce_max_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_max(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_max_i8.c b/tests/validation/reduce_max_i8.c index e106d306..42cf870b 100644 --- a/tests/validation/reduce_max_i8.c +++ b/tests/validation/reduce_max_i8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_max i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,24 +59,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + params->base.api = CSINN_API; + + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -86,36 +85,36 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_reduce_max_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_max(input0, output, &params); + if (csinn_reduce_max_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_max(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -123,6 +122,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_max_u8.c b/tests/validation/reduce_max_u8.c index bc97d2c5..b3944cdb 100644 --- a/tests/validation/reduce_max_u8.c +++ b/tests/validation/reduce_max_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_max u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = 
buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,23 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -85,36 +84,36 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], 
input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_max_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_max(input0, output, &params); + if (csinn_reduce_max_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_max(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -122,6 +121,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_mean_f32.c b/tests/validation/reduce_mean_f32.c index 52414efd..168728f5 100644 --- a/tests/validation/reduce_mean_f32.c +++ b/tests/validation/reduce_mean_f32.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_mean f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + 
reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_mean_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_mean(input0, output, &params); + if (csinn_reduce_mean_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_mean(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_mean_i8.c b/tests/validation/reduce_mean_i8.c index 01ad72f6..76db40c1 100644 --- a/tests/validation/reduce_mean_i8.c +++ b/tests/validation/reduce_mean_i8.c @@ -16,19 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_mean i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -37,14 +38,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -57,23 +58,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -81,27 +80,26 @@ int main(int argc, char** argv) } } - input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo ); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo ); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,22 +107,20 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_reduce_mean_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_mean(input0, output, &params); + if (csinn_reduce_mean_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_mean(input0, output, params); } - result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_mean_u8.c b/tests/validation/reduce_mean_u8.c index fd200918..6abca264 100644 --- a/tests/validation/reduce_mean_u8.c +++ b/tests/validation/reduce_mean_u8.c @@ -16,19 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_mean u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -37,14 +38,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = 
input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -57,24 +58,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -82,27 +81,26 @@ int main(int argc, char** argv) } } - input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo ); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo ); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if 
(isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -110,22 +108,20 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reduce_mean_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_mean(input0, output, &params); + if (csinn_reduce_mean_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_mean(input0, output, params); } - result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_min_f32.c b/tests/validation/reduce_min_f32.c index 3852fe1d..1c8a9f71 100644 --- a/tests/validation/reduce_min_f32.c +++ b/tests/validation/reduce_min_f32.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_min f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + 
reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_min_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_min(input0, output, &params); + if (csinn_reduce_min_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_min(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_min_i8.c b/tests/validation/reduce_min_i8.c index f1368ae7..57dd0fcb 100644 --- a/tests/validation/reduce_min_i8.c +++ b/tests/validation/reduce_min_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_min i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,22 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + params->base.api = CSINN_API; + + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -84,23 +84,23 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -108,14 +108,13 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_reduce_min_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_min(input0, output, &params); + if (csinn_reduce_min_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_min(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -123,6 +122,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_min_u8.c b/tests/validation/reduce_min_u8.c index 0c71895b..5213a3bf 100644 --- a/tests/validation/reduce_min_u8.c +++ b/tests/validation/reduce_min_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_min u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = 
buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -59,21 +60,20 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -84,23 +84,23 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || 
isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -108,14 +108,13 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reduce_min_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_min(input0, output, &params); + if (csinn_reduce_min_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_min(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -123,6 +122,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_prod_f32.c b/tests/validation/reduce_prod_f32.c index 2caa4032..c104cdeb 100644 --- a/tests/validation/reduce_prod_f32.c +++ b/tests/validation/reduce_prod_f32.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_prod f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); + input0->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size0); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; 
} else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_prod_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_prod(input0, output, &params); + if (csinn_reduce_prod_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_prod(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_prod_i8.c b/tests/validation/reduce_prod_i8.c index 388b518f..755252b3 100644 --- a/tests/validation/reduce_prod_i8.c +++ b/tests/validation/reduce_prod_i8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_prod i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,23 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -85,23 +84,23 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,17 +108,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - // output->data = (float *)malloc(out_size * sizeof(float)); // float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_prod_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_prod(input0, output, &params); + if (csinn_reduce_prod_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_prod(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -127,6 +125,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_prod_u8.c b/tests/validation/reduce_prod_u8.c index 0d25cef3..6a4ba4b9 100644 --- a/tests/validation/reduce_prod_u8.c +++ b/tests/validation/reduce_prod_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_prod u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; +
reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,23 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -85,23 +84,23 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = 
csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,17 +108,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - // output->data = (float *)malloc(out_size * sizeof(float)); // float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_prod_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_prod(input0, output, &params); + if (csinn_reduce_prod_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_prod(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -127,6 +125,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_sum_f32.c b/tests/validation/reduce_sum_f32.c index 158ac2f7..3dfe08ea 100644 --- a/tests/validation/reduce_sum_f32.c +++ b/tests/validation/reduce_sum_f32.c @@ -16,69 +16,69 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_sum f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + 
reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_sum_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_sum(input0, output, &params); + if (csinn_reduce_sum_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_sum(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_sum_i8.c b/tests/validation/reduce_sum_i8.c index b904bbb6..950ee884 100644 --- a/tests/validation/reduce_sum_i8.c +++ b/tests/validation/reduce_sum_i8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_sum i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,22 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = 
(float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -81,27 +81,26 @@ int main(int argc, char** argv) } } - input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,23 +108,20 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = 
malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_reduce_sum_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_sum(input0, output, &params); + if (csinn_reduce_sum_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_sum(input0, output, params); } - result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_sum_u8.c b/tests/validation/reduce_sum_u8.c index 278159d8..5b316b5d 100644 --- a/tests/validation/reduce_sum_u8.c +++ b/tests/validation/reduce_sum_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_sum u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] =
input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,22 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -84,46 +84,43 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_reduce_sum_init(input0, output, &params) == CSINN_TRUE) { - csi_reduce_sum(input0, output, &params); + if (csinn_reduce_sum_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_sum(input0, output, params); } - result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/relu1_f32.c b/tests/validation/relu1_f32.c index 13355cb0..618db891 100644 --- a/tests/validation/relu1_f32.c +++ b/tests/validation/relu1_f32.c @@ -16,27 +16,27 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1 f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_relu1_init(input, output, &params) == CSINN_TRUE) { - csi_relu1(input, output, &params); + if (csinn_relu1_init(input, output, params) == CSINN_TRUE) { + csinn_relu1(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/relu1_i8.c b/tests/validation/relu1_i8.c index f53f53dd..a7c57723 100644 --- a/tests/validation/relu1_i8.c +++ b/tests/validation/relu1_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,51 +58,47 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; -
float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { error1 = fabs(src_in[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_relu1_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu1(input, output, ¶ms); + if (csinn_relu1_init(input, output, params) == CSINN_TRUE) { + csinn_relu1(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu1_u8.c b/tests/validation/relu1_u8.c index b2dd5476..b827e556 100644 --- a/tests/validation/relu1_u8.c +++ b/tests/validation/relu1_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,51 +58,47 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - 
for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { error1 = fabs(src_in[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_relu1_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu1(input, output, ¶ms); + if (csinn_relu1_init(input, output, params) == CSINN_TRUE) { + csinn_relu1(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu6_f32.c b/tests/validation/relu6_f32.c index 57fc87f5..0549fbfa 100644 --- a/tests/validation/relu6_f32.c +++ b/tests/validation/relu6_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6 f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_relu6_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu6(input, output, ¶ms); + if (csinn_relu6_init(input, output, params) == CSINN_TRUE) { + csinn_relu6(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/relu6_i8.c b/tests/validation/relu6_i8.c index e0179f5b..1d638860 100644 --- a/tests/validation/relu6_i8.c +++ b/tests/validation/relu6_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - 
float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relu6_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu6(input, output, ¶ms); + if (csinn_relu6_init(input, output, params) == CSINN_TRUE) { + csinn_relu6(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu6_u8.c b/tests/validation/relu6_u8.c index 1d6f801a..1438da70 100644 --- a/tests/validation/relu6_u8.c +++ b/tests/validation/relu6_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float 
*src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relu6_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu6(input, output, ¶ms); + if (csinn_relu6_init(input, output, params) == CSINN_TRUE) { + csinn_relu6(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu_f32.c b/tests/validation/relu_f32.c index baa13cdf..ed7bd3d7 100644 --- a/tests/validation/relu_f32.c +++ b/tests/validation/relu_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + 
in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/relu_i8.c b/tests/validation/relu_i8.c index 8a48d4ac..bd8e0a09 100644 --- a/tests/validation/relu_i8.c +++ b/tests/validation/relu_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -51,35 +51,32 @@ int main(int argc, char** argv) 
input->dtype = CSINN_DTYPE_INT8; output->dtype = CSINN_DTYPE_INT8; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +84,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu_u8.c b/tests/validation/relu_u8.c index 129ae7d9..eb95aa51 100644 --- a/tests/validation/relu_u8.c +++ b/tests/validation/relu_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,35 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in 
= (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relun_f32.c b/tests/validation/relun_f32.c index f0eda3a9..e254dd7e 100644 --- a/tests/validation/relun_f32.c +++ b/tests/validation/relun_f32.c @@ -16,49 +16,48 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relun f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + 
params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_relun_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relun(input, output, ¶ms); + if (csinn_relun_init(input, output, params) == CSINN_TRUE) { + csinn_relun(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/relun_i8.c b/tests/validation/relun_i8.c index 839ad6e5..c82c6b75 100644 --- a/tests/validation/relun_i8.c +++ b/tests/validation/relun_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relun i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -45,7 +45,7 @@ int main(int argc, char** argv) output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -58,57 +58,51 @@ int 
main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - - csi_quantize_multiplier(params.n, &quantized_multiplier, &shift); - params.n_multiplier = quantized_multiplier; - params.n_shift = shift; - + shl_quantize_multiplier(params->n, &quantized_multiplier, &shift); + params->n_multiplier = quantized_multiplier; + params->n_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 
2 ? atof(argv[2]) : 0.9; - - if (csi_relun_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relun(input, output, ¶ms); + if (csinn_relun_init(input, output, params) == CSINN_TRUE) { + csinn_relun(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relun_u8.c b/tests/validation/relun_u8.c index 2f18e669..a7047974 100644 --- a/tests/validation/relun_u8.c +++ b/tests/validation/relun_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relun u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -45,7 +45,7 @@ int main(int argc, char** argv) output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -58,57 +58,51 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - - csi_quantize_multiplier(params.n, &quantized_multiplier, &shift); - params.n_multiplier = quantized_multiplier; - params.n_shift = shift; - + shl_quantize_multiplier(params->n, &quantized_multiplier, &shift); + params->n_multiplier = quantized_multiplier; + params->n_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relun_init(input, output, &params) == CSINN_TRUE) { - csi_relun(input, output, &params); + if (csinn_relun_init(input, output, params) == CSINN_TRUE) { + csinn_relun(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/reshape_f32.c b/tests/validation/reshape_f32.c index 87293d3b..c53b9b48 100644 --- a/tests/validation/reshape_f32.c +++ b/tests/validation/reshape_f32.c @@ -16,43 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reshape f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reshape_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reshape_params *params = + csinn_alloc_params(sizeof(struct csinn_reshape_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); int reshape_count = buffer[4]; int *reshape = (int *)malloc(reshape_count * sizeof(int)); - for(int i = 0; i < reshape_count; i++) { + for (int i = 0; i < reshape_count; i++) { reshape[i] = buffer[5 + i]; } - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float
*input_data = (float *)(buffer + 5 + reshape_count); - input->data = input_data; + input->data = input_data; input->dtype = CSINN_DTYPE_FLOAT32; output->dim_count = reshape_count; out_size = in_size; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = reshape[i]; // out_size *= output->dim[i]; } @@ -62,16 +63,15 @@ int main(int argc, char** argv) output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - params.shape = reshape; - params.shape_num = output->dim_count; - + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->shape = reshape; + params->shape_num = output->dim_count; + float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reshape_init(input, output, &params) == CSINN_TRUE) { - csi_reshape(input, output, &params); + if (csinn_reshape_init(input, output, params) == CSINN_TRUE) { + csinn_reshape(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/resize_bilinear_f32.c b/tests/validation/resize_bilinear_f32.c index d1d03c99..55280e9c 100644 --- a/tests/validation/resize_bilinear_f32.c +++ b/tests/validation/resize_bilinear_f32.c @@ -16,50 +16,50 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize bilinear f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // height - output->dim[2] = buffer[5]; // width - output->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // height + output->dim[2] = buffer[5]; // width + output->dim[3] = buffer[3]; // channel input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NHWC; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, &params) == CSINN_TRUE) { - csi_resize(input, output, &params); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/resize_bilinear_i8.c b/tests/validation/resize_bilinear_i8.c index cb824924..f9c7ba64 100644 --- a/tests/validation/resize_bilinear_i8.c +++ b/tests/validation/resize_bilinear_i8.c @@ -16,38 +16,39 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize bilinear i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[0]; - output->dim[1] = buffer[4]; - output->dim[2] = buffer[5]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[0]; + output->dim[1] = buffer[4]; + output->dim[2] = buffer[5]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -57,37 +58,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = 
output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,17 +93,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - - if (csi_resize_init(input, output, &params) == CSINN_TRUE) { - csi_resize(input, output, &params); - } + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_bilinear_u8.c b/tests/validation/resize_bilinear_u8.c index d582f53f..ce958fed 100644 --- a/tests/validation/resize_bilinear_u8.c +++ b/tests/validation/resize_bilinear_u8.c @@ -16,38 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize bilinear u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[0]; - output->dim[1] = buffer[4]; - output->dim[2] = buffer[5]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[0]; + output->dim[1] = buffer[4]; + output->dim[2] = buffer[5]; + output->dim[3] = buffer[3]; input->dim_count = 4;
output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -57,37 +58,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,17 +93,15 @@ int main(int 
argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_resize_init(input, output, &params) == CSINN_TRUE) { - csi_resize(input, output, &params); - } + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_f32.c b/tests/validation/resize_nearestneighbor_f32.c index 2a0afbe4..ee4b930e 100644 --- a/tests/validation/resize_nearestneighbor_f32.c +++ b/tests/validation/resize_nearestneighbor_f32.c @@ -16,50 +16,50 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // height - output->dim[2] = buffer[5]; // width - output->dim[3] =
buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // height + output->dim[2] = buffer[5]; // width + output->dim[3] = buffer[3]; // channel input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, &params) == CSINN_TRUE) { - csi_resize(input, output, &params); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_i8.c b/tests/validation/resize_nearestneighbor_i8.c index d13aeb32..1ebddc8d 100644 --- a/tests/validation/resize_nearestneighbor_i8.c +++ b/tests/validation/resize_nearestneighbor_i8.c @@ -16,38 +16,39 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[0]; - output->dim[1] = buffer[4]; - output->dim[2] = buffer[5]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[0]; + output->dim[1] = buffer[4]; + output->dim[2] = buffer[5]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; @@ -57,37 +58,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = 
CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_resize_init(input, output, &params) == CSINN_TRUE) { - csi_resize(input, output, &params); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_nchw_f32.c b/tests/validation/resize_nearestneighbor_nchw_f32.c index cc8cd775..4553fc19 100644 --- a/tests/validation/resize_nearestneighbor_nchw_f32.c +++ b/tests/validation/resize_nearestneighbor_nchw_f32.c @@ -16,55 +16,55 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // channel - output->dim[2] = buffer[4]; // height - output->dim[3]
= buffer[5]; // width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // channel + output->dim[2] = buffer[4]; // height + output->dim[3] = buffer[5]; // width input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, &params) == CSINN_TRUE) { - csi_resize(input, output, &params); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_nchw_i8.c b/tests/validation/resize_nearestneighbor_nchw_i8.c index 2b5e2e25..ef77e6b6 100644 --- a/tests/validation/resize_nearestneighbor_nchw_i8.c +++ b/tests/validation/resize_nearestneighbor_nchw_i8.c @@ -16,39 +16,40 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // channel - output->dim[2] = buffer[4]; // height - output->dim[3] = buffer[5]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // channel + output->dim[2] = buffer[4]; // height + output->dim[3] = buffer[5]; // width input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,36 +59,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,20 +95,19 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - // input->data = (float *)(buffer + 7); // reference->data = (float *)(buffer + 7 + in_size); // output->data = malloc(out_size * sizeof(float)); // float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, &params) == CSINN_TRUE) { - csi_resize(input, output, &params); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_nchw_u8.c b/tests/validation/resize_nearestneighbor_nchw_u8.c index ee461bfa..a572135a 100644 --- a/tests/validation/resize_nearestneighbor_nchw_u8.c +++ b/tests/validation/resize_nearestneighbor_nchw_u8.c @@ -16,39 +16,40 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // channel -
output->dim[2] = buffer[4]; // height - output->dim[3] = buffer[5]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // channel + output->dim[2] = buffer[4]; // height + output->dim[3] = buffer[5]; // width input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,36 +59,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + 
float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,20 +95,19 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - // input->data = (float *)(buffer + 7); // reference->data = (float *)(buffer + 7 + in_size); // output->data = malloc(out_size * sizeof(float)); // float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, &params) == CSINN_TRUE) { - csi_resize(input, output, &params); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_u8.c b/tests/validation/resize_nearestneighbor_u8.c index c198b595..34d5700a 100644 --- a/tests/validation/resize_nearestneighbor_u8.c +++ b/tests/validation/resize_nearestneighbor_u8.c @@ -16,38 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // height - output->dim[2] = buffer[5]; // width - output->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // height + output->dim[2] = buffer[5]; // width + output->dim[3] = buffer[3]; // channel input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; @@ -57,37 +58,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; 
output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_resize_init(input, output, &params) == CSINN_TRUE) { - csi_resize(input, output, &params); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/reverse_f32.c b/tests/validation/reverse_f32.c index 28a66c56..e21d99ea 100644 --- a/tests/validation/reverse_f32.c +++ b/tests/validation/reverse_f32.c @@ -16,36 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reverse f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reverse_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reverse_params *params = + csinn_alloc_params(sizeof(struct csinn_reverse_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -53,16 +54,15 @@ int main(int argc, 
char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); + input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reverse_init(input, output, &params) == CSINN_TRUE) { - csi_reverse(input, output, &params); + if (csinn_reverse_init(input, output, params) == CSINN_TRUE) { + csinn_reverse(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/reverse_i8.c b/tests/validation/reverse_i8.c index 9f3d53e4..b4f9070d 100644 --- a/tests/validation/reverse_i8.c +++ b/tests/validation/reverse_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reverse i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reverse_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reverse_params *params = + csinn_alloc_params(sizeof(struct csinn_reverse_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -63,35 +64,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = 
src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +97,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reverse_init(input, output, &params) == CSINN_TRUE) { - csi_reverse(input, output, &params); + if (csinn_reverse_init(input, output, params) == CSINN_TRUE) { + csinn_reverse(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/reverse_u8.c b/tests/validation/reverse_u8.c index 01b79b4b..e63d7469 100644 --- a/tests/validation/reverse_u8.c +++ b/tests/validation/reverse_u8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reverse u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reverse_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reverse_params *params = + csinn_alloc_params(sizeof(struct csinn_reverse_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -63,35 +64,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = 
src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +97,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reverse_init(input, output, &params) == CSINN_TRUE) { - csi_reverse(input, output, &params); + if (csinn_reverse_init(input, output, params) == CSINN_TRUE) { + csinn_reverse(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/riscv_xt9xx/relu_fp16.c b/tests/validation/riscv_xt9xx/relu_fp16.c index 72f199d5..96482066 100644 --- a/tests/validation/riscv_xt9xx/relu_fp16.c +++ b/tests/validation/riscv_xt9xx/relu_fp16.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" -#include "csi_c906.h" +#include "shl_c906.h" +#include "test_utils.h" int main(int argc, char** argv) { init_testsuite("Testing function of relu fp16.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; char *buffer = read_input_data_fp16(argv[1], 4); @@ -51,14 +51,14 @@ int main(int argc, char** argv) input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; + params->base.api = CSINN_API; input->data = (__fp16 *)(fp16_buffer); reference->data = (__fp16 *)(fp16_buffer + in_size); output->data = malloc(in_size * sizeof(__fp16)); float difference = argc > 2 ? atof(argv[2]) : 0.1; - csi_c906_relu_fp16(input, output, &params); // TODO: use nn2_api + shl_c906_relu_fp16(input, output, params); // TODO: use nn2_api result_verify_fp16(output->data, reference->data, input->data, difference, in_size, false); diff --git a/tests/validation/roialign_f32.c b/tests/validation/roialign_f32.c index 13396400..33530454 100644 --- a/tests/validation/roialign_f32.c +++ b/tests/validation/roialign_f32.c @@ -16,36 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of roialign f32.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_align_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_roi_align_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_align_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_FLOAT32; input0->name = "input0"; input0->data = (float *)(buffer + 11); - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -54,9 +54,8 @@ int main(int argc, char** argv) input1->name = "input1"; input1->data = (float *)(buffer + 11 + in0_size); - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; @@ -67,18 +66,16 @@ int main(int argc, char** 
argv) output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.spatial_scale = *((float *)buffer + 9); - params.sample_ratio = *((int32_t *)buffer + 10); - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - + params->spatial_scale = *((float *)buffer + 9); + params->sample_ratio = *((int32_t *)buffer + 10); + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_roi_align_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_roi_align(input0, input1, output, &params); + if (csinn_roi_align_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_roi_align(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/roipooling_f32.c b/tests/validation/roipooling_f32.c index 594ed45e..76bf04b2 100644 --- a/tests/validation/roipooling_f32.c +++ b/tests/validation/roipooling_f32.c @@ -16,36 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of roipooling f32.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_pool_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_roi_pool_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_pool_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_FLOAT32; input0->name = "input0"; input0->data = (float *)(buffer + 10); - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -54,9 +54,8 @@ int main(int argc, char** argv) input1->name = "input1"; input1->data = (float *)(buffer + 10 + in0_size); - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; @@ -67,17 +66,15 @@ int main(int argc, char** argv) 
output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.spatial_scale = *((float *)buffer + 9); - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - + params->spatial_scale = *((float *)buffer + 9); + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_roipool_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_roipool(input0, input1, output, &params); + if (csinn_roipool_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_roipool(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/roipooling_u8.c b/tests/validation/roipooling_u8.c index c9d2e9fe..e424409c 100644 --- a/tests/validation/roipooling_u8.c +++ b/tests/validation/roipooling_u8.c @@ -16,32 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ropooling u8.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *spatial_scale = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_pool_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *spatial_scale = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_roi_pool_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_pool_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); float *spatial = (float *)(buffer + 9); - params.spatial_scale = *(float *)(buffer + 9); + params->spatial_scale = *(float *)(buffer + 9); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_UINT8; @@ -50,15 +51,14 @@ int main(int argc, char** argv) input0->quant_channel = 1; input0->name = "input0"; - float *src0_in = (float *)(buffer + 10); + float *src0_in = (float *)(buffer + 10); uint8_t *src0_tmp = malloc(in0_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < 
in0_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in0_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -70,22 +70,21 @@ int main(int argc, char** argv) input1->is_const = 0; input1->quant_channel = 1; - float *src1_in = (float *)(buffer + 10 + in0_size); - uint8_t *src1_tmp = malloc(in1_size * sizeof(char)); + float *src1_in = (float *)(buffer + 10 + in0_size); + uint8_t *src1_tmp = malloc(in1_size * sizeof(char)); input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in1_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in1_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - float *ref = (float *)(buffer + 10 + in0_size + in1_size); + float *ref = (float *)(buffer + 10 + in0_size + in1_size); output->name = "output"; output->dtype = CSINN_DTYPE_UINT8; @@ -98,20 +97,18 @@ int main(int argc, char** argv) input0->data = src0_tmp; input1->data = src1_tmp; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-2; - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_roipool_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_roipool(input0, input1, output, &params); + if (csinn_roipool_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_roipool(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/round_f32.c b/tests/validation/round_f32.c index f2a49637..4b27cc12 100644 --- a/tests/validation/round_f32.c +++ b/tests/validation/round_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of round f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = 
buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,17 +48,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_round_init(input, output, &params) == CSINN_TRUE) { - csi_round(input, output, &params); - } + if (csinn_round_init(input, output, params) == CSINN_TRUE) { + csinn_round(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/round_i8.c b/tests/validation/round_i8.c index 62908cf0..4e7fcb22 100644 --- a/tests/validation/round_i8.c +++ b/tests/validation/round_i8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of round i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - 
for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,16 +91,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_round_init(input, output, ¶ms) == CSINN_TRUE) { - csi_round(input, output, ¶ms); - } + if (csinn_round_init(input, output, params) == CSINN_TRUE) { + csinn_round(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/round_u8.c b/tests/validation/round_u8.c index 73468b27..8b64d3d5 100644 --- a/tests/validation/round_u8.c +++ b/tests/validation/round_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of round u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - 
for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,16 +91,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_round_init(input, output, ¶ms) == CSINN_TRUE) { - csi_round(input, output, ¶ms); - } + if (csinn_round_init(input, output, params) == CSINN_TRUE) { + csinn_round(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/rsqrt_f32.c b/tests/validation/rsqrt_f32.c index d3c66b74..81f42c20 100644 --- a/tests/validation/rsqrt_f32.c +++ b/tests/validation/rsqrt_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of rsqrt f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_rsqrt_init(input, output, ¶ms) == CSINN_TRUE) { - csi_rsqrt(input, output, ¶ms); + if (csinn_rsqrt_init(input, output, params) == CSINN_TRUE) { + csinn_rsqrt(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/rsqrt_i8.c b/tests/validation/rsqrt_i8.c index 54187dc1..9a7b8896 100644 --- a/tests/validation/rsqrt_i8.c +++ b/tests/validation/rsqrt_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of rsqrt i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -59,35 +59,32 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = 
(float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_rsqrt_init(input, output, ¶ms) == CSINN_TRUE) { - csi_rsqrt(input, output, ¶ms); + if (csinn_rsqrt_init(input, output, params) == CSINN_TRUE) { + csinn_rsqrt(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/rsqrt_u8.c b/tests/validation/rsqrt_u8.c index 6e5637c6..9b3964d8 100644 --- a/tests/validation/rsqrt_u8.c +++ b/tests/validation/rsqrt_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of rsqrt u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,35 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float 
*src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_rsqrt_init(input, output, ¶ms) == CSINN_TRUE) { - csi_rsqrt(input, output, ¶ms); + if (csinn_rsqrt_init(input, output, params) == CSINN_TRUE) { + csinn_rsqrt(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/segment_max_f32.c b/tests/validation/segment_max_f32.c index 4aa028c4..f5d9474a 100644 --- a/tests/validation/segment_max_f32.c +++ b/tests/validation/segment_max_f32.c @@ -16,53 +16,53 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment max f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; 
output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_max_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_max(input, segment, output, ¶ms); - } + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_max_i8.c b/tests/validation/segment_max_i8.c index 79a97ab3..ed5ef2eb 100644 --- a/tests/validation/segment_max_i8.c +++ b/tests/validation/segment_max_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment max i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,37 +57,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -94,23 +96,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == -FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == -FLT_MAX) { ref[i] = min_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + 
output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_max_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_max(input, segment, output, ¶ms); + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_max_u8.c b/tests/validation/segment_max_u8.c index 84162633..c2bbfe95 100644 --- a/tests/validation/segment_max_u8.c +++ b/tests/validation/segment_max_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment max u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + 
input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,37 +57,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -94,23 +96,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == -FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == -FLT_MAX) { ref[i] = min_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_max_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_max(input, segment, output, ¶ms); + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_mean_f32.c b/tests/validation/segment_mean_f32.c index f6396a1a..594976c4 100644 --- a/tests/validation/segment_mean_f32.c +++ b/tests/validation/segment_mean_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment mean f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; 
- input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_mean_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, ¶ms); - } + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_mean_i8.c b/tests/validation/segment_mean_i8.c index 238a6cef..c0a0e12d 100644 --- a/tests/validation/segment_mean_i8.c +++ b/tests/validation/segment_mean_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment mean i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -57,37 +58,38 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,17 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_mean_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, ¶ms); + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_mean_u8.c b/tests/validation/segment_mean_u8.c index cdf06cee..35f6973c 100644 --- a/tests/validation/segment_mean_u8.c +++ b/tests/validation/segment_mean_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment mean u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = 
buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -57,38 +58,38 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 
1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,17 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_mean_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, ¶ms); + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_min_f32.c b/tests/validation/segment_min_f32.c index c531263e..2dd86833 100644 --- a/tests/validation/segment_min_f32.c +++ b/tests/validation/segment_min_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment min f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - 
input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); - } + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_min_i8.c b/tests/validation/segment_min_i8.c index 1b59745f..ace8b497 100644 --- a/tests/validation/segment_min_i8.c +++ b/tests/validation/segment_min_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment min i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,37 +57,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -94,16 +96,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_min_u8.c b/tests/validation/segment_min_u8.c index 06271951..96ae657d 100644 --- a/tests/validation/segment_min_u8.c +++ b/tests/validation/segment_min_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment min u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = 
buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,38 +57,39 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - 
output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,23 +97,21 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == FLT_MAX) { ref[i] = max_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_prod_f32.c b/tests/validation/segment_prod_f32.c index 04109a83..398f1fa9 100644 --- a/tests/validation/segment_prod_f32.c +++ b/tests/validation/segment_prod_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment prod f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; 
- input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); - } + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_prod_i8.c b/tests/validation/segment_prod_i8.c index 1567dfa4..d85307c7 100644 --- a/tests/validation/segment_prod_i8.c +++ b/tests/validation/segment_prod_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment prod i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,57 +57,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - error = error * pow(abs(max_value), input->dim[0] - params.num_segments + 1); + error = error * pow(abs(max_value), input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * 
sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_prod_u8.c b/tests/validation/segment_prod_u8.c index 850beabb..c1988dee 100644 --- a/tests/validation/segment_prod_u8.c +++ b/tests/validation/segment_prod_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment prod u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + 
input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,57 +57,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + 
error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - error = error * pow(abs(max_value), input->dim[0] - params.num_segments + 1); + error = error * pow(abs(max_value), input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_sum_f32.c b/tests/validation/segment_sum_f32.c index dcab8e5f..f1a68850 100644 --- a/tests/validation/segment_sum_f32.c +++ b/tests/validation/segment_sum_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment sum f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - 
input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_sum_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, ¶ms); - } + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_sum_i8.c b/tests/validation/segment_sum_i8.c index 134e156b..d115e943 100644 --- a/tests/validation/segment_sum_i8.c +++ b/tests/validation/segment_sum_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment sum i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,57 +57,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } /* sum */ - error = error * (input->dim[0] - params.num_segments + 1); + error = error * (input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 
2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_sum_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, ¶ms); + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_sum_u8.c b/tests/validation/segment_sum_u8.c index 4781f597..605ccf64 100644 --- a/tests/validation/segment_sum_u8.c +++ b/tests/validation/segment_sum_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment sum u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; 
+ input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,56 +57,56 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 
1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } /* sum */ - error = error * (input->dim[0] - params.num_segments + 1); + error = error * (input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_sum_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, ¶ms); + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/select_f32.c b/tests/validation/select_f32.c index 7d202d04..e7269522 100644 --- a/tests/validation/select_f32.c +++ b/tests/validation/select_f32.c @@ -16,30 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of select f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *condition = csi_alloc_tensor(NULL); - struct select_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *condition = csinn_alloc_tensor(NULL); + struct csinn_select_params *params = + csinn_alloc_params(sizeof(struct csinn_select_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -52,18 +53,17 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); condition->data = (float *)(buffer + 4 + 2 * in_size); reference->data = (float *)(buffer + 4 + 3 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data 
= malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_select_init(condition, input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_select(condition, input0, input1, output, ¶ms); + if (csinn_select_init(condition, input0, input1, output, params) == CSINN_TRUE) { + csinn_select(condition, input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/select_i8.c b/tests/validation/select_i8.c index 219d72ec..c4b67e33 100644 --- a/tests/validation/select_i8.c +++ b/tests/validation/select_i8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of select i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *condition = csi_alloc_tensor(NULL); - struct select_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *condition = csinn_alloc_tensor(NULL); + struct csinn_select_params *params = + csinn_alloc_params(sizeof(struct csinn_select_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + 
input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -67,38 +68,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); float *cond_in = (float *)(buffer + 4 + 2 * in_size); - float *ref = (float *)(buffer + 4 + 3 * in_size); + float *ref = (float *)(buffer + 4 + 3 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - int8_t *cond_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *cond_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > max_error) { + if 
(error1 > max_error) { max_error = error1; } } @@ -106,23 +105,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -131,42 +130,40 @@ int main(int argc, char** argv) condition->data = cond_in; get_quant_info(condition); - for(int i = 0; i < in_size; i++) { - cond_tmp[i] = csi_ref_quantize_f32_to_i8(cond_in[i], condition->qinfo); + for (int i = 0; i < in_size; i++) { + cond_tmp[i] = shl_ref_quantize_f32_to_i8(cond_in[i], condition->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(cond_tmp[i], condition->qinfo); - if(isinf(cond_in[i]) || isnan(cond_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(cond_tmp[i], condition->qinfo); + if (isinf(cond_in[i]) || isnan(cond_in[i])) { continue; } else { - error1 = fabs(cond_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(cond_in[i] - output_tmp)/fabs(cond_in[i] + 1e-9); + error1 = 
fabs(cond_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(cond_in[i] - output_tmp) / fabs(cond_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; condition->data = cond_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_select_init(condition, input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_select(condition, input0, input1, output, ¶ms); + if (csinn_select_init(condition, input0, input1, output, params) == CSINN_TRUE) { + csinn_select(condition, input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/select_u8.c b/tests/validation/select_u8.c index c500fe9d..4ecab065 100644 --- a/tests/validation/select_u8.c +++ b/tests/validation/select_u8.c @@ -16,33 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of select u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *condition = csi_alloc_tensor(NULL); - struct select_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *condition = csinn_alloc_tensor(NULL); + struct csinn_select_params *params = + csinn_alloc_params(sizeof(struct csinn_select_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -67,38 +68,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); 
float *cond_in = (float *)(buffer + 4 + 2 * in_size); - float *ref = (float *)(buffer + 4 + 3 * in_size); + float *ref = (float *)(buffer + 4 + 3 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - uint8_t *cond_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *cond_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -106,23 +105,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || 
isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -131,43 +130,40 @@ int main(int argc, char** argv) condition->data = cond_in; get_quant_info(condition); - - for(int i = 0; i < in_size; i++) { - cond_tmp[i] = csi_ref_quantize_f32_to_u8(cond_in[i], condition->qinfo); + for (int i = 0; i < in_size; i++) { + cond_tmp[i] = shl_ref_quantize_f32_to_u8(cond_in[i], condition->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(cond_tmp[i], condition->qinfo); - if(isinf(cond_in[i]) || isnan(cond_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(cond_tmp[i], condition->qinfo); + if (isinf(cond_in[i]) || isnan(cond_in[i])) { continue; } else { - error1 = fabs(cond_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(cond_in[i] - output_tmp)/fabs(cond_in[i] + 1e-9); + error1 = fabs(cond_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(cond_in[i] - output_tmp) / fabs(cond_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; condition->data = cond_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_select_init(condition, input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_select(condition, input0, input1, output, ¶ms); + if (csinn_select_init(condition, input0, input1, output, params) == CSINN_TRUE) { + csinn_select(condition, input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/shuffle_channel_f32.c b/tests/validation/shuffle_channel_f32.c index 6f5e42fa..bf7bfcf1 100644 --- a/tests/validation/shuffle_channel_f32.c +++ b/tests/validation/shuffle_channel_f32.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,23 +48,23 @@ int main(int argc, char **argv) input->dim_count = 4; 
input->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; output->dim_count = 4; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; //out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; + params->base.api = CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if(csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_i8.c b/tests/validation/shuffle_channel_i8.c index fb68f372..8454a091 100644 --- a/tests/validation/shuffle_channel_i8.c +++ b/tests/validation/shuffle_channel_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -37,11 +38,11 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -53,9 +54,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_INT8; @@ -64,7 +64,8 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * 
output->dim[2] * output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; float *src_in_data = (float *)(buffer + 5); float *ref_data = (float *)(buffer + 5 + in_size); @@ -74,23 +75,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -103,8 +105,8 @@ int main(int argc, char** argv) output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_nchw_f32.c b/tests/validation/shuffle_channel_nchw_f32.c index cf1e3052..3f97275c 100644 --- a/tests/validation/shuffle_channel_nchw_f32.c +++ b/tests/validation/shuffle_channel_nchw_f32.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,23 +48,23 @@ int main(int argc, char **argv) input->dim_count = 4; input->dtype = 
CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; output->dim_count = 4; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; //out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; + params->base.api = CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if(csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_nchw_i8.c b/tests/validation/shuffle_channel_nchw_i8.c index 56ab3599..649643fd 100644 --- a/tests/validation/shuffle_channel_nchw_i8.c +++ b/tests/validation/shuffle_channel_nchw_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -37,11 +38,11 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -53,9 +54,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_INT8; @@ -64,7 +64,8 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * 
output->dim[2] * output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; float *src_in_data = (float *)(buffer + 5); float *ref_data = (float *)(buffer + 5 + in_size); @@ -74,23 +75,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -103,8 +105,8 @@ int main(int argc, char** argv) output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_nchw_u8.c b/tests/validation/shuffle_channel_nchw_u8.c index 584e4d7d..a29246ee 100644 --- a/tests/validation/shuffle_channel_nchw_u8.c +++ b/tests/validation/shuffle_channel_nchw_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -37,11 +38,11 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + params->group = buffer[4]; output->dim[0] = 
input->dim[0]; output->dim[1] = input->dim[1]; @@ -53,9 +54,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_UINT8; @@ -64,7 +64,8 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; float *src_in_data = (float *)(buffer + 5); float *ref_data = (float *)(buffer + 5 + in_size); @@ -74,23 +75,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) 
{ error = error1; } } @@ -103,8 +105,8 @@ int main(int argc, char** argv) output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_u8.c b/tests/validation/shuffle_channel_u8.c index cb58c074..f5fec963 100644 --- a/tests/validation/shuffle_channel_u8.c +++ b/tests/validation/shuffle_channel_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -37,11 +38,11 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; 
// height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -53,9 +54,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_UINT8; @@ -64,7 +64,8 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; float *src_in_data = (float *)(buffer + 5); float *ref_data = (float *)(buffer + 5 + in_size); @@ -74,23 +75,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 
1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -103,8 +105,8 @@ int main(int argc, char** argv) output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/sigmoid_f32.c b/tests/validation/sigmoid_f32.c index 0ce02e38..be2388ad 100644 --- a/tests/validation/sigmoid_f32.c +++ b/tests/validation/sigmoid_f32.c @@ -16,27 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +49,15 @@ 
int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sigmoid(input, output, ¶ms); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/sigmoid_i8.c b/tests/validation/sigmoid_i8.c index 3fff55a1..b19a7dac 100644 --- a/tests/validation/sigmoid_i8.c +++ b/tests/validation/sigmoid_i8.c @@ -16,30 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize 
error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -92,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sigmoid(input, output, ¶ms); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/sigmoid_u8.c b/tests/validation/sigmoid_u8.c index c31868aa..ea11290e 100644 --- a/tests/validation/sigmoid_u8.c +++ b/tests/validation/sigmoid_u8.c @@ -16,30 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize 
error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -92,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sigmoid(input, output, ¶ms); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/sign_f32.c b/tests/validation/sign_f32.c index 3e378a8a..8fbcb8cb 100644 --- a/tests/validation/sign_f32.c +++ b/tests/validation/sign_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sign f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_sign_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sign(input, output, ¶ms); + if (csinn_sign_init(input, output, params) == CSINN_TRUE) { + csinn_sign(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/sin_f32.c b/tests/validation/sin_f32.c index a24c5311..2c3399bf 100644 --- a/tests/validation/sin_f32.c +++ b/tests/validation/sin_f32.c @@ -16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sin f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + 
input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sin(input, output, ¶ms); + if (csinn_sin_init(input, output, params) == CSINN_TRUE) { + csinn_sin(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/sin_i8.c b/tests/validation/sin_i8.c index 3c3dd779..fc1f3b80 100644 --- a/tests/validation/sin_i8.c +++ b/tests/validation/sin_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sin i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,14 +96,12 @@ int main(int argc, char** argv) // max error: 0.018 for input [-3.14, 3.14] float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_sin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sin(input, output, ¶ms); + if (csinn_sin_init(input, output, params) == CSINN_TRUE) { + csinn_sin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); - free(buffer); free(output->data); free(input_data); diff --git a/tests/validation/sin_u8.c b/tests/validation/sin_u8.c index bab3d3ff..7c5e1299 100644 --- a/tests/validation/sin_u8.c +++ b/tests/validation/sin_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sin u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float 
*src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,14 +96,12 @@ int main(int argc, char** argv) // max error: 0.018 for input [-3.14, 3.14] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sin(input, output, ¶ms); + if (csinn_sin_init(input, output, params) == CSINN_TRUE) { + csinn_sin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); - free(buffer); free(output->data); free(input_data); diff --git a/tests/validation/sinh_f32.c b/tests/validation/sinh_f32.c index 2b7fe837..a038a18f 100644 --- a/tests/validation/sinh_f32.c +++ b/tests/validation/sinh_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sinh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_sinh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sinh(input, output, ¶ms); + if (csinn_sinh_init(input, output, params) == CSINN_TRUE) { + csinn_sinh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/sinh_i8.c b/tests/validation/sinh_i8.c index 05605e37..49771206 100644 --- a/tests/validation/sinh_i8.c +++ b/tests/validation/sinh_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sinh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = 
(float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,18 +86,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sinh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sinh(input, output, ¶ms); + if (csinn_sinh_init(input, output, params) == CSINN_TRUE) { + csinn_sinh(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/sinh_u8.c b/tests/validation/sinh_u8.c index 66c0f384..ed5cfa65 100644 --- a/tests/validation/sinh_u8.c +++ b/tests/validation/sinh_u8.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sinh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,18 +86,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sinh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sinh(input, output, ¶ms); + if (csinn_sinh_init(input, output, params) == CSINN_TRUE) { + csinn_sinh(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/slice_f32.c b/tests/validation/slice_f32.c index eb5014e0..94889dc5 100644 --- a/tests/validation/slice_f32.c +++ b/tests/validation/slice_f32.c @@ -16,63 +16,62 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of slice f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_slice_params *params = csinn_alloc_params(sizeof(struct csinn_slice_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - input->data = (float *)(buffer + 12); - params.slice_num = 4; - params.begin = (int *)malloc(4 * sizeof(int)); - params.end = (int *)malloc(4 * sizeof(int)); - for(int i = 0; i < 4; i++) { - params.begin[i] = buffer[4+i]; - params.end[i] = buffer[8+i]; + input->data = (float *)(buffer + 12); + params->slice_num = 4; + params->begin = (int *)malloc(4 * sizeof(int)); + params->end = (int *)malloc(4 * sizeof(int)); + for (int i = 0; i < 4; i++) { + params->begin[i] = buffer[4 + i]; + params->end[i] = buffer[8 + i]; } - output->dim[0] = params.end[0] - params.begin[0]; - output->dim[1] = params.end[1] - params.begin[1]; - output->dim[2] = params.end[2] - params.begin[2]; - output->dim[3] = params.end[3] - params.begin[3]; + output->dim[0] = params->end[0] - params->begin[0]; + output->dim[1] = params->end[1] - params->begin[1]; + output->dim[2] = params->end[2] - 
params->begin[2]; + output->dim[3] = params->end[3] - params->begin[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; + reference->data = (float *)(buffer + 12 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_slice_init(input, output, ¶ms) == CSINN_TRUE) { - csi_slice(input, output, ¶ms); + if (csinn_slice_init(input, output, params) == CSINN_TRUE) { + csinn_slice(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); free(output->data); - free(params.begin); - free(params.end); + free(params->begin); + free(params->end); return done_testing(); } diff --git a/tests/validation/slice_i8.c b/tests/validation/slice_i8.c index 7d02fb3d..294f03cb 100644 --- a/tests/validation/slice_i8.c +++ b/tests/validation/slice_i8.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of slice i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_slice_params *params = csinn_alloc_params(sizeof(struct csinn_slice_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.slice_num = 4; - params.begin = (int *)malloc(4 * sizeof(int)); - params.end = (int *)malloc(4 * sizeof(int)); - for(int i = 0; i < 4; i++) { - params.begin[i] = buffer[4+i]; - params.end[i] = buffer[8+i]; + params->slice_num = 4; + params->begin = (int *)malloc(4 * sizeof(int)); + params->end = (int *)malloc(4 * sizeof(int)); + for (int i = 0; i < 4; i++) { + params->begin[i] = buffer[4 + i]; + params->end[i] = buffer[8 + i]; } - output->dim[0] = params.end[0] - params.begin[0]; - output->dim[1] = params.end[1] - params.begin[1]; - output->dim[2] = params.end[2] - params.begin[2]; - output->dim[3] = params.end[3] - params.begin[3]; + output->dim[0] = params->end[0] - params->begin[0]; + output->dim[1] = params->end[1] - params->begin[1]; + output->dim[2] = 
params->end[2] - params->begin[2]; + output->dim[3] = params->end[3] - params->begin[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; input->dim_count = 4; @@ -67,47 +67,46 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 12); - float *ref = (float *)(buffer + 12 + in_size); + float *src_in = (float *)(buffer + 12); + float *ref = (float *)(buffer + 12 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_slice_init(input, output, ¶ms) == CSINN_TRUE) { - csi_slice(input, output, ¶ms); + if (csinn_slice_init(input, output, params) == CSINN_TRUE) { + csinn_slice(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); @@ -115,7 +114,7 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.begin); - free(params.end); + free(params->begin); + free(params->end); return done_testing(); } diff --git a/tests/validation/slice_u8.c b/tests/validation/slice_u8.c index f1b0ecdf..2af82b69 100644 --- a/tests/validation/slice_u8.c +++ b/tests/validation/slice_u8.c @@ -16,44 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of slice u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_slice_params *params = csinn_alloc_params(sizeof(struct csinn_slice_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.slice_num = 4; - params.begin = (int 
*)malloc(4 * sizeof(int)); - params.end = (int *)malloc(4 * sizeof(int)); - for(int i = 0; i < 4; i++) { - params.begin[i] = buffer[4+i]; - params.end[i] = buffer[8+i]; + params->slice_num = 4; + params->begin = (int *)malloc(4 * sizeof(int)); + params->end = (int *)malloc(4 * sizeof(int)); + for (int i = 0; i < 4; i++) { + params->begin[i] = buffer[4 + i]; + params->end[i] = buffer[8 + i]; } - output->dim[0] = params.end[0] - params.begin[0]; - output->dim[1] = params.end[1] - params.begin[1]; - output->dim[2] = params.end[2] - params.begin[2]; - output->dim[3] = params.end[3] - params.begin[3]; + output->dim[0] = params->end[0] - params->begin[0]; + output->dim[1] = params->end[1] - params->begin[1]; + output->dim[2] = params->end[2] - params->begin[2]; + output->dim[3] = params->end[3] - params->begin[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; input->dim_count = 4; @@ -67,47 +67,46 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 12); - float *ref = (float *)(buffer + 12 + in_size); + float *src_in = (float *)(buffer + 12); + float *ref = (float *)(buffer + 12 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) 
|| isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_slice_init(input, output, ¶ms) == CSINN_TRUE) { - csi_slice(input, output, ¶ms); + if (csinn_slice_init(input, output, params) == CSINN_TRUE) { + csinn_slice(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); @@ -115,7 +114,7 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.begin); - free(params.end); + free(params->begin); + free(params->end); return done_testing(); } diff --git a/tests/validation/softmax_f32.c b/tests/validation/softmax_f32.c index a3ef1fd1..0ae76cda 100644 --- a/tests/validation/softmax_f32.c +++ b/tests/validation/softmax_f32.c @@ -16,51 +16,51 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_softmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softmax(input, output, ¶ms); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/softmax_i8.c b/tests/validation/softmax_i8.c index d3b9d115..6717f5fc 100644 --- a/tests/validation/softmax_i8.c +++ b/tests/validation/softmax_i8.c @@ -16,37 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; input->dim_count = 4; output->dim_count = 4; @@ -59,37 +60,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; 
output->is_const = 0; output->quant_channel = 1; - - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -97,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softmax(input, output, ¶ms); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softmax_u8.c b/tests/validation/softmax_u8.c index 5e335e60..87d7355a 100644 --- a/tests/validation/softmax_u8.c +++ b/tests/validation/softmax_u8.c @@ -16,37 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; input->dim_count = 4; output->dim_count = 4; @@ -59,37 +60,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; 
output->is_const = 0; output->quant_channel = 1; - - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -97,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softmax(input, output, ¶ms); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softplus_f32.c b/tests/validation/softplus_f32.c index a021c80c..85b2a1d8 100644 --- a/tests/validation/softplus_f32.c +++ b/tests/validation/softplus_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softplus f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_softplus_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softplus(input, output, ¶ms); + if (csinn_softplus_init(input, output, params) == CSINN_TRUE) { + csinn_softplus(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/softplus_i8.c b/tests/validation/softplus_i8.c index b7a20a55..cc6e6955 100644 --- a/tests/validation/softplus_i8.c +++ b/tests/validation/softplus_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softplus i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = 
buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -52,42 +52,39 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + 
input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_softplus_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softplus(input, output, ¶ms); + if (csinn_softplus_init(input, output, params) == CSINN_TRUE) { + csinn_softplus(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softplus_u8.c b/tests/validation/softplus_u8.c index 81f4cfab..2b1fe2fa 100644 --- a/tests/validation/softplus_u8.c +++ b/tests/validation/softplus_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softplus u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,37 +57,34 @@ int main(int argc, char** argv) output->layout = 
CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softplus_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softplus(input, output, ¶ms); + if (csinn_softplus_init(input, output, params) == CSINN_TRUE) { + csinn_softplus(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softrelu_f32.c b/tests/validation/softrelu_f32.c index 2e7db9d2..c188d308 100644 --- a/tests/validation/softrelu_f32.c +++ b/tests/validation/softrelu_f32.c @@ -16,49 +16,48 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softrelu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_softrelu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softrelu(input, output, ¶ms); + if (csinn_softrelu_init(input, output, params) == CSINN_TRUE) { + csinn_softrelu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/softrelu_i8.c b/tests/validation/softrelu_i8.c index 69cb40c7..6546ec2e 100644 --- a/tests/validation/softrelu_i8.c +++ b/tests/validation/softrelu_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softrelu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -46,7 +46,7 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = 
CSINN_DTYPE_INT8; @@ -60,56 +60,51 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - - csi_quantize_multiplier(params.n, &quantized_multiplier, &shift); - params.n_multiplier = quantized_multiplier; - params.n_shift = shift; - + shl_quantize_multiplier(params->n, &quantized_multiplier, &shift); + params->n_multiplier = quantized_multiplier; + params->n_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float 
difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_softrelu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softrelu(input, output, ¶ms); + if (csinn_softrelu_init(input, output, params) == CSINN_TRUE) { + csinn_softrelu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softrelu_u8.c b/tests/validation/softrelu_u8.c index 7e45da63..efd97c10 100644 --- a/tests/validation/softrelu_u8.c +++ b/tests/validation/softrelu_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softrelu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -46,7 +46,7 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -59,56 +59,51 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + 
in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - - csi_quantize_multiplier(params.n, &quantized_multiplier, &shift); - params.n_multiplier = quantized_multiplier; - params.n_shift = shift; - + shl_quantize_multiplier(params->n, &quantized_multiplier, &shift); + params->n_multiplier = quantized_multiplier; + params->n_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softrelu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softrelu(input, output, ¶ms); + if (csinn_softrelu_init(input, output, params) == CSINN_TRUE) { + csinn_softrelu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softsign_f32.c b/tests/validation/softsign_f32.c index ff19bdf2..d5b0f63d 100644 --- a/tests/validation/softsign_f32.c +++ b/tests/validation/softsign_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softsign f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_softsign_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softsign(input, output, ¶ms); + if (csinn_softsign_init(input, output, params) == CSINN_TRUE) { + csinn_softsign(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/softsign_i8.c b/tests/validation/softsign_i8.c index 86dad06a..9a46fadc 100644 --- a/tests/validation/softsign_i8.c +++ b/tests/validation/softsign_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softsign i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = 
buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,35 +57,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * 
sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_softsign_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softsign(input, output, ¶ms); + if (csinn_softsign_init(input, output, params) == CSINN_TRUE) { + csinn_softsign(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softsign_u8.c b/tests/validation/softsign_u8.c index c9e26f5b..08ecf256 100644 --- a/tests/validation/softsign_u8.c +++ b/tests/validation/softsign_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softsign u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -52,40 +52,39 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; 
output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softsign_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softsign(input, output, ¶ms); + if (csinn_softsign_init(input, output, params) == CSINN_TRUE) { + csinn_softsign(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/space_to_batch_f32.c b/tests/validation/space_to_batch_f32.c index 617f9a25..838ac4de 100644 --- a/tests/validation/space_to_batch_f32.c +++ b/tests/validation/space_to_batch_f32.c @@ -16,40 +16,41 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_batch_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_batch_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_batch_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; - params.pad_top = buffer[5]; - params.pad_bottom = buffer[6]; - params.pad_left = buffer[7]; - params.pad_right = buffer[8]; + params->block_size = buffer[4]; + params->pad_top = buffer[5]; + 
params->pad_bottom = buffer[6]; + params->pad_left = buffer[7]; + params->pad_right = buffer[8]; - output->dim[0] = input->dim[0] * params.block_size * params.block_size; + output->dim[0] = input->dim[0] * params->block_size * params->block_size; output->dim[1] = input->dim[1]; - output->dim[2] = (input->dim[2] + params.pad_top + params.pad_bottom) / params.block_size; - output->dim[3] = (input->dim[3] + params.pad_left + params.pad_right) / params.block_size; + output->dim[2] = (input->dim[2] + params->pad_top + params->pad_bottom) / params->block_size; + output->dim[3] = (input->dim[3] + params->pad_left + params->pad_right) / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -58,16 +59,15 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 9); reference->data = (float *)(buffer + 9 + in_size); output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_space_to_batch_init(input, output, ¶ms) == CSINN_TRUE) { - csi_space_to_batch(input, output, ¶ms); + if (csinn_space_to_batch_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_batch(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/space_to_batch_i8.c b/tests/validation/space_to_batch_i8.c index 8dc5db12..51f4f48f 100644 --- a/tests/validation/space_to_batch_i8.c +++ b/tests/validation/space_to_batch_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_batch_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_batch_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_batch_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,21 +39,21 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; - params.pad_top = buffer[5]; - params.pad_bottom = buffer[6]; - params.pad_left = buffer[7]; - params.pad_right = buffer[8]; + params->block_size = buffer[4]; + params->pad_top = buffer[5]; + params->pad_bottom = buffer[6]; + params->pad_left = buffer[7]; + params->pad_right = buffer[8]; - output->dim[0] = input->dim[0] * params.block_size * params.block_size; + output->dim[0] = input->dim[0] * params->block_size * params->block_size; output->dim[1] = input->dim[1]; - output->dim[2] = (input->dim[2] + params.pad_top + params.pad_bottom) / params.block_size; - output->dim[3] = (input->dim[3] + params.pad_left + params.pad_right) / params.block_size; + output->dim[2] = (input->dim[2] + 
params->pad_top + params->pad_bottom) / params->block_size; + output->dim[3] = (input->dim[3] + params->pad_left + params->pad_right) / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -68,33 +69,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 9); - float *ref = (float *)(buffer + 9 + in_size); + float *src_in = (float *)(buffer + 9); + float *ref = (float *)(buffer + 9 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -102,14 +102,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float 
difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_space_to_batch_init(input, output, ¶ms) == CSINN_TRUE) { - csi_space_to_batch(input, output, ¶ms); + if (csinn_space_to_batch_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_batch(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/space_to_batch_u8.c b/tests/validation/space_to_batch_u8.c index 905a88cb..331ac81c 100644 --- a/tests/validation/space_to_batch_u8.c +++ b/tests/validation/space_to_batch_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_batch_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_batch_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_batch_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,21 +39,21 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; - params.pad_top = buffer[5]; - params.pad_bottom = buffer[6]; - 
params.pad_left = buffer[7]; - params.pad_right = buffer[8]; + params->block_size = buffer[4]; + params->pad_top = buffer[5]; + params->pad_bottom = buffer[6]; + params->pad_left = buffer[7]; + params->pad_right = buffer[8]; - output->dim[0] = input->dim[0] * params.block_size * params.block_size; + output->dim[0] = input->dim[0] * params->block_size * params->block_size; output->dim[1] = input->dim[1]; - output->dim[2] = (input->dim[2] + params.pad_top + params.pad_bottom) / params.block_size; - output->dim[3] = (input->dim[3] + params.pad_left + params.pad_right) / params.block_size; + output->dim[2] = (input->dim[2] + params->pad_top + params->pad_bottom) / params->block_size; + output->dim[3] = (input->dim[3] + params->pad_left + params->pad_right) / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -68,33 +69,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 9); - float *ref = (float *)(buffer + 9 + in_size); + float *src_in = (float *)(buffer + 9); + float *ref = (float *)(buffer + 9 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { 
continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -102,14 +102,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_space_to_batch_init(input, output, ¶ms) == CSINN_TRUE) { - csi_space_to_batch(input, output, ¶ms); + if (csinn_space_to_batch_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_batch(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/space_to_depth_f32.c b/tests/validation/space_to_depth_f32.c index da3fd537..927de98e 100644 --- a/tests/validation/space_to_depth_f32.c +++ b/tests/validation/space_to_depth_f32.c @@ -16,36 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_depth_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_depth_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] * params.block_size * params.block_size; - output->dim[2] = input->dim[2] / params.block_size; - output->dim[3] = input->dim[3] / params.block_size; + output->dim[1] = input->dim[1] * params->block_size * params->block_size; + output->dim[2] = input->dim[2] / params->block_size; + output->dim[3] = input->dim[3] / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -54,19 +55,17 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_space_to_depth_init(input, output, ¶ms) == CSINN_TRUE) { - csi_space_to_depth(input, output, ¶ms); + if (csinn_space_to_depth_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_depth(input, output, params); } - result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/space_to_depth_i8.c b/tests/validation/space_to_depth_i8.c index 27f3ad66..98edee38 100644 --- a/tests/validation/space_to_depth_i8.c +++ b/tests/validation/space_to_depth_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_depth_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_depth_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + 
input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] * params.block_size * params.block_size; - output->dim[2] = input->dim[2] / params.block_size; - output->dim[3] = input->dim[3] / params.block_size; + output->dim[1] = input->dim[1] * params->block_size * params->block_size; + output->dim[2] = input->dim[2] / params->block_size; + output->dim[3] = input->dim[3] / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -64,55 +65,49 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - 
output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_space_to_depth_init(input, output, ¶ms) == CSINN_TRUE) { - csi_space_to_depth(input, output, ¶ms); + if (csinn_space_to_depth_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_depth(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/space_to_depth_u8.c b/tests/validation/space_to_depth_u8.c index 57c0284d..09778992 100644 --- a/tests/validation/space_to_depth_u8.c +++ b/tests/validation/space_to_depth_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_depth_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_depth_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; 
//in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] * params.block_size * params.block_size; - output->dim[2] = input->dim[2] / params.block_size; - output->dim[3] = input->dim[3] / params.block_size; + output->dim[1] = input->dim[1] * params->block_size * params->block_size; + output->dim[2] = input->dim[2] / params->block_size; + output->dim[3] = input->dim[3] / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -56,7 +57,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; @@ -64,55 +65,49 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); 
- if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_space_to_depth_init(input, output, ¶ms) == CSINN_TRUE) { - csi_space_to_depth(input, output, ¶ms); + if (csinn_space_to_depth_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_depth(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/split_f32.c b/tests/validation/split_f32.c index 8238f640..bc38137e 100644 --- a/tests/validation/split_f32.c +++ b/tests/validation/split_f32.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of split f32.\n"); @@ -31,35 +30,34 @@ int main(int argc, char** argv) int axis = buffer[4]; int output_cnt = buffer[5]; int32_t *split_index = (int32_t *)malloc(output_cnt * sizeof(int32_t)); - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { split_index[i] = buffer[axis] / output_cnt; } - struct csi_tensor *reference[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - reference[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *reference[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + reference[i] = csinn_alloc_tensor(NULL); } int in_size = 0; int out_size[output_cnt]; int acc_out_size = 0; - - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); input->dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *output[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - output[i] = csi_alloc_tensor(NULL); - for(int j = 0; j < 4; j++) { - if(j == axis) { + struct csinn_tensor *output[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + output[i] = csinn_alloc_tensor(NULL); + for (int j = 0; j < 4; j++) { + if (j == axis) { output[i]->dim[j] = split_index[i]; } else { output[i]->dim[j] = input->dim[j]; @@ -69,42 +67,40 @@ int main(int argc, char** 
argv) out_size[i] = output[i]->dim[0] * output[i]->dim[1] * output[i]->dim[2] * output[i]->dim[3]; reference[i]->data = (float *)(buffer + 6 + in_size + acc_out_size); - output[i]->data = malloc(out_size[i] * sizeof(float)); + output[i]->data = malloc(out_size[i] * sizeof(float)); acc_out_size += out_size[i]; output[i]->is_const = 0; } - struct split_params params; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; - params.output_num = output_cnt; + struct csinn_split_params *params = csinn_alloc_params(sizeof(struct csinn_split_params), NULL); + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->output_num = output_cnt; int temp = 0; - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { temp += split_index[i]; split_index[i] = temp; printf("%d\n", split_index[i]); } - params.split_index = split_index; - + params->split_index = split_index; - if (csi_split_init(input, (struct csi_tensor **)&output, ¶ms) == CSINN_TRUE) { - csi_split(input, (struct csi_tensor **)&output, ¶ms); + if (csinn_split_init(input, (struct csinn_tensor **)&output, params) == CSINN_TRUE) { + csinn_split(input, (struct csinn_tensor **)&output, params); } /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - for(int i = 0; i < output_cnt; i++) { - result_verify_f32(reference[i]->data, output[i]->data, input->data, difference, out_size[i], false); + for (int i = 0; i < output_cnt; i++) { + result_verify_f32(reference[i]->data, output[i]->data, input->data, difference, out_size[i], + false); } - /* free alloced memory */ free(buffer); free(split_index); - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { free(output[i]->data); } return done_testing(); diff --git a/tests/validation/sqrt_f32.c b/tests/validation/sqrt_f32.c index 8b588aa0..bb2f023c 100644 --- a/tests/validation/sqrt_f32.c +++ b/tests/validation/sqrt_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sqrt f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = 
CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sqrt_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sqrt(input, output, ¶ms); + if (csinn_sqrt_init(input, output, params) == CSINN_TRUE) { + csinn_sqrt(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/sqrt_i8.c b/tests/validation/sqrt_i8.c index ec531a37..ce6497d0 100644 --- a/tests/validation/sqrt_i8.c +++ b/tests/validation/sqrt_i8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sqrt i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,37 +57,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = 
csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sqrt_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sqrt(input, output, ¶ms); + if (csinn_sqrt_init(input, output, params) == CSINN_TRUE) { + csinn_sqrt(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/sqrt_u8.c b/tests/validation/sqrt_u8.c index be0c6d63..aba43aba 100644 --- a/tests/validation/sqrt_u8.c +++ b/tests/validation/sqrt_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sqrt u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,36 +58,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = 
shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sqrt_init(input, output, &params) == CSINN_TRUE) { - csi_sqrt(input, output, &params); + if (csinn_sqrt_init(input, output, params) == CSINN_TRUE) { + csinn_sqrt(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/square_f32.c b/tests/validation/square_f32.c index f3a01439..45967b2c 100644 --- a/tests/validation/square_f32.c +++ b/tests/validation/square_f32.c @@ -16,27 +16,27 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of square f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_square_init(input, output, &params) == CSINN_TRUE) { - csi_square(input, output, &params); + if (csinn_square_init(input, output, params) == CSINN_TRUE) { + csinn_square(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/squeeze_f32.c b/tests/validation/squeeze_f32.c index aede108a..16a9e51d 100644 --- a/tests/validation/squeeze_f32.c +++ b/tests/validation/squeeze_f32.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct squeeze_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; - for(int i = 0; i < axis_len; i++) { - params.axis[i] = buffer[4 + i]; + for (int i = 0; i < axis_len; i++) { + params->axis[i] = buffer[4 + i]; } input->dim_count = 6; @@ -50,19 +51,18 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.axis_num = 
axis_len; -
params.base.layout = CSINN_LAYOUT_NCHW; + params->axis_num = axis_len; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3); - reference->data = (float *)(buffer + 3 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 3); + reference->data = (float *)(buffer + 3 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_squeeze_init(input, output, &params) == CSINN_TRUE) { - csi_squeeze(input, output, &params); + if (csinn_squeeze_init(input, output, params) == CSINN_TRUE) { + csinn_squeeze(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/squeeze_i8.c b/tests/validation/squeeze_i8.c index dfc6120c..f74a2c56 100644 --- a/tests/validation/squeeze_i8.c +++ b/tests/validation/squeeze_i8.c @@ -16,20 +16,21 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct squeeze_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,15 +38,15 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; - for(int i = 0; i < axis_len; i++) { - params.axis[i] = buffer[4 + i]; + for (int i = 0; i < axis_len; i++) { + params->axis[i] = buffer[4 + i]; } input->dim_count = 6; @@ -60,37 +61,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.axis_num = axis_len; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->axis_num = axis_len; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2]; + 
params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 3); - float *ref = (float *)(buffer + 3 + in_size); + float *src_in = (float *)(buffer + 3); + float *ref = (float *)(buffer + 3 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,18 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_squeeze_init(input, output, &params) == CSINN_TRUE) { - csi_squeeze(input, output, &params); + if (csinn_squeeze_init(input, output, params) == CSINN_TRUE) { + csinn_squeeze(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, in_size, false); free(buffer); diff --git a/tests/validation/squeeze_u8.c b/tests/validation/squeeze_u8.c index 1575b447..9009f1f6 100644 --- a/tests/validation/squeeze_u8.c +++ b/tests/validation/squeeze_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct squeeze_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,15 +38,15 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; - for(int i = 0; i < axis_len; i++) { - params.axis[i] = buffer[4 + i]; + for (int i = 0; i <
axis_len; i++) { + params->axis[i] = buffer[4 + i]; } input->dim_count = 6; @@ -61,36 +62,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.axis_num = axis_len; - params.base.layout = CSINN_LAYOUT_NCHW; + params->axis_num = axis_len; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 3); - float *ref = (float *)(buffer + 3 + in_size); + float *src_in = (float *)(buffer + 3); + float *ref = (float *)(buffer + 3 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,18 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_squeeze_init(input, output, &params) == CSINN_TRUE) { - csi_squeeze(input, output, &params); + if (csinn_squeeze_init(input, output, params) == CSINN_TRUE) { + csinn_squeeze(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, in_size, false); free(buffer); diff --git a/tests/validation/stack_f32.c b/tests/validation/stack_f32.c index cefe4a95..052a9989 100644 --- a/tests/validation/stack_f32.c +++ b/tests/validation/stack_f32.c @@ -16,59 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - init_testsuite("Testing function of stack f32.\n"); + init_testsuite("Testing function of stack f32.\n"); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - struct stack_params params; + struct csinn_stack_params *params = csinn_alloc_params(sizeof(struct csinn_stack_params), NULL); - params.inputs_count = buffer[0]; - params.axis = buffer[1]; + params->inputs_count = buffer[0]; + params->axis = buffer[1]; - struct csi_tensor *input[params.inputs_count]; - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); input[i]->dim_count = buffer[2] - 1; input[i]->dtype = CSINN_DTYPE_FLOAT32; for (int j = 0; j < input[i]->dim_count; j++) { - if (j < params.axis) { - input[i]->dim[j] = buffer[3+j]; // input[i]->dim[j] = output->dim[j] + if (j < params->axis) { + input[i]->dim[j] = buffer[3 + j]; // input[i]->dim[j] = output->dim[j] } else { - input[i]->dim[j] = buffer[3+j+1]; // input[i]->dim[j] = output->dim[j + 1] + input[i]->dim[j] = buffer[3 + j + 1]; // input[i]->dim[j] = output->dim[j + 1] } } } - struct
csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); output->dim_count = buffer[2]; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = buffer[3+i]; + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = buffer[3 + i]; out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { input[i]->data = (float *)(buffer + 3 + output->dim_count + in_size * i); } - reference->data = (float *)(buffer + 3 + output->dim_count + in_size * params.inputs_count); - output->data = (float *)malloc(out_size * sizeof(float)); + reference->data = (float *)(buffer + 3 + output->dim_count + in_size * params->inputs_count); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_stack_init(input, output, &params) == CSINN_TRUE) { - csi_stack(input, output, &params); + if (csinn_stack_init(input, output, params) == CSINN_TRUE) { + csinn_stack(input, output, params); } result_verify_f32(reference->data, output->data, input[0]->data, difference, out_size, false); diff --git a/tests/validation/stack_i8.c b/tests/validation/stack_i8.c index eb64e567..3bbbd8dc 100644 --- a/tests/validation/stack_i8.c +++ b/tests/validation/stack_i8.c @@ -16,13 +16,13 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of stack i8.\n"); @@ -34,77 +34,75 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - struct stack_params params; + struct csinn_stack_params *params = csinn_alloc_params(sizeof(struct csinn_stack_params), NULL); - params.inputs_count = buffer[0]; - params.axis = buffer[1]; + params->inputs_count = buffer[0]; + params->axis = buffer[1]; - struct csi_tensor *input[params.inputs_count]; - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); input[i]->dim_count = buffer[2] - 1; input[i]->layout = CSINN_LAYOUT_NCHW; input[i]->is_const = 0; input[i]->dtype = CSINN_DTYPE_INT8; for (int j = 0; j < input[i]->dim_count; j++) { - if (j < params.axis) { - input[i]->dim[j] = buffer[3+j]; // input[i]->dim[j] = output->dim[j] + if (j < params->axis) { + input[i]->dim[j] = buffer[3 + j]; // input[i]->dim[j] = output->dim[j] } else { - input[i]->dim[j] = buffer[3+j+1]; // input[i]->dim[j] = output->dim[j + 1] + input[i]->dim[j] = buffer[3 + j + 1]; // input[i]->dim[j] = output->dim[j + 1] } } } - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); output->dim_count = buffer[2]; - float *src_in[params.inputs_count]; + float *src_in[params->inputs_count]; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = buffer[3+i]; + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = buffer[3 + i]; out_size *= output->dim[i]; } - in_size = 
out_size / params.inputs_count; + in_size = out_size / params->inputs_count; output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - int8_t *src_tmp[params.inputs_count]; + int8_t *src_tmp[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { src_in[i] = (float *)(buffer + 3 + output->dim_count + in_size * i); src_tmp[i] = malloc(in_size * sizeof(char)); } - float *ref = (float *)(buffer + 3 + output->dim_count + in_size * params.inputs_count); + float *ref = (float *)(buffer + 3 + output->dim_count + in_size * params->inputs_count); - for(int j = 0; j < params.inputs_count; j++) { + for (int j = 0; j < params->inputs_count; j++) { input[j]->data = src_in[j]; get_quant_info(input[j]); - for(int i = 0; i < in_size; i++) { - src_tmp[j][i] = csi_ref_quantize_f32_to_i8(src_in[j][i], input[j]->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[j][i] = shl_ref_quantize_f32_to_i8(src_in[j][i], input[j]->qinfo); } input[j]->data = src_tmp[j]; - } + } output->data = ref; get_quant_info(output); reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_stack_init(input, output, &params) == CSINN_TRUE) { - csi_stack(input, output, &params); + if (csinn_stack_init(input, output, params) == CSINN_TRUE) { + csinn_stack(input, output, params); } result_verify_8(reference->data, output, input[0]->data, difference, out_size, false); free(buffer); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { free(src_tmp[i]); } free(output->data); diff --git a/tests/validation/stack_u8.c b/tests/validation/stack_u8.c index dd3f6248..831891c1 100644 --- a/tests/validation/stack_u8.c +++ b/tests/validation/stack_u8.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of stack u8.\n"); @@ -34,78 +34,75 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - struct stack_params params; + struct csinn_stack_params *params = csinn_alloc_params(sizeof(struct csinn_stack_params), NULL); - params.inputs_count = buffer[0]; - params.axis = buffer[1]; + params->inputs_count = buffer[0]; + params->axis = buffer[1]; - struct csi_tensor *input[params.inputs_count]; - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); input[i]->dim_count = buffer[2] - 1; input[i]->layout = CSINN_LAYOUT_NCHW; input[i]->is_const = 0; input[i]->dtype = CSINN_DTYPE_UINT8; for (int j = 0; j < input[i]->dim_count; j++) { - if (j < params.axis) { - input[i]->dim[j] = buffer[3+j]; // input[i]->dim[j] = output->dim[j] + if (j < params->axis) { + input[i]->dim[j] = buffer[3 + j]; // input[i]->dim[j] = output->dim[j] } else { - input[i]->dim[j] =
buffer[3+j+1]; // input[i]->dim[j] = output->dim[j + 1] + input[i]->dim[j] = buffer[3 + j + 1]; // input[i]->dim[j] = output->dim[j + 1] } } } - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); output->dim_count = buffer[2]; - float *src_in[params.inputs_count]; + float *src_in[params->inputs_count]; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = buffer[3+i]; + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = buffer[3 + i]; out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; + in_size = out_size / params->inputs_count; output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - int8_t *src_tmp[params.inputs_count]; + int8_t *src_tmp[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { src_in[i] = (float *)(buffer + 3 + output->dim_count + in_size * i); src_tmp[i] = malloc(in_size * sizeof(char)); + } - } - - float *ref = (float *)(buffer + 3 + output->dim_count + in_size * params.inputs_count); + float *ref = (float *)(buffer + 3 + output->dim_count + in_size * params->inputs_count); - for(int j = 0; j < params.inputs_count; j++) { + for (int j = 0; j < params->inputs_count; j++) { input[j]->data = src_in[j]; get_quant_info(input[j]); - for(int i = 0; i < in_size; i++) { - src_tmp[j][i] = csi_ref_quantize_f32_to_u8(src_in[j][i], input[j]->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[j][i] = shl_ref_quantize_f32_to_u8(src_in[j][i], input[j]->qinfo); } input[j]->data = src_tmp[j]; - } + } output->data = ref; get_quant_info(output); reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = 
malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_stack_init(input, output, &params) == CSINN_TRUE) { - csi_stack(input, output, &params); + if (csinn_stack_init(input, output, params) == CSINN_TRUE) { + csinn_stack(input, output, params); } result_verify_8(reference->data, output, input[0]->data, difference, out_size, false); free(buffer); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { free(src_tmp[i]); } free(output->data); diff --git a/tests/validation/strided_slice_f32.c b/tests/validation/strided_slice_f32.c index c32268bb..2dd01882 100644 --- a/tests/validation/strided_slice_f32.c +++ b/tests/validation/strided_slice_f32.c @@ -16,68 +16,69 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct strided_slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct csinn_strided_slice_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.slice_count = buffer[1+input->dim_count]; - params.begin = (int *)malloc(params.slice_count * sizeof(int)); - params.end = (int
*)malloc(params.slice_count * sizeof(int)); - params.stride = (int *)malloc(params.slice_count * sizeof(int)); - for(int i = 0; i < params.slice_count; i++) { - params.begin[i] = buffer[2+input->dim_count+3*i]; - params.end[i] = buffer[3+input->dim_count+3*i]; - params.stride[i] = buffer[4+input->dim_count+3*i]; + params->slice_count = buffer[1 + input->dim_count]; + params->begin = (int *)malloc(params->slice_count * sizeof(int)); + params->end = (int *)malloc(params->slice_count * sizeof(int)); + params->stride = (int *)malloc(params->slice_count * sizeof(int)); + for (int i = 0; i < params->slice_count; i++) { + params->begin[i] = buffer[2 + input->dim_count + 3 * i]; + params->end[i] = buffer[3 + input->dim_count + 3 * i]; + params->stride[i] = buffer[4 + input->dim_count + 3 * i]; } output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { - if(i < params.slice_count) { - output->dim[i] = ceil((float)(params.end[i] - params.begin[i]) / params.stride[i]); + for (int i = 0; i < output->dim_count; i++) { + if (i < params->slice_count) { + output->dim[i] = ceil((float)(params->end[i] - params->begin[i]) / params->stride[i]); } else { output->dim[i] = input->dim[i]; } } - out_size = buffer[2+input->dim_count+3*params.slice_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = buffer[2 + input->dim_count + 3 * params->slice_count]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count); - reference->data = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count + in_size); //input->data + in_size + input->data = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count); + reference->data = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count + + in_size); // input->data + in_size input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - output->data = (float *)malloc(out_size * sizeof(float)); + 
output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_strided_slice_init(input, output, &params) == CSINN_TRUE) { - csi_strided_slice(input, output, &params); + if (csinn_strided_slice_init(input, output, params) == CSINN_TRUE) { + csinn_strided_slice(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); free(output->data); - free(params.begin); - free(params.end); - free(params.stride); + free(params->begin); + free(params->end); + free(params->stride); return done_testing(); } diff --git a/tests/validation/strided_slice_i8.c b/tests/validation/strided_slice_i8.c index a06ac00b..f5884914 100644 --- a/tests/validation/strided_slice_i8.c +++ b/tests/validation/strided_slice_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct strided_slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct csinn_strided_slice_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,30 +39,29 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i
+ 1]; in_size *= input->dim[i]; } - params.slice_count = buffer[1+input->dim_count]; - params.begin = (int *)malloc(params.slice_count * sizeof(int)); - params.end = (int *)malloc(params.slice_count * sizeof(int)); - params.stride = (int *)malloc(params.slice_count * sizeof(int)); - for(int i = 0; i < params.slice_count; i++) { - params.begin[i] = buffer[2+input->dim_count+3*i]; - params.end[i] = buffer[3+input->dim_count+3*i]; - params.stride[i] = buffer[4+input->dim_count+3*i]; + params->slice_count = buffer[1 + input->dim_count]; + params->begin = (int *)malloc(params->slice_count * sizeof(int)); + params->end = (int *)malloc(params->slice_count * sizeof(int)); + params->stride = (int *)malloc(params->slice_count * sizeof(int)); + for (int i = 0; i < params->slice_count; i++) { + params->begin[i] = buffer[2 + input->dim_count + 3 * i]; + params->end[i] = buffer[3 + input->dim_count + 3 * i]; + params->stride[i] = buffer[4 + input->dim_count + 3 * i]; } output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { - if(i < params.slice_count) { - output->dim[i] = ceil((float)(params.end[i] - params.begin[i]) / params.stride[i]); + for (int i = 0; i < output->dim_count; i++) { + if (i < params->slice_count) { + output->dim[i] = ceil((float)(params->end[i] - params->begin[i]) / params->stride[i]); } else { output->dim[i] = input->dim[i]; } } - out_size = buffer[2+input->dim_count+3*params.slice_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = buffer[2 + input->dim_count + 3 * params->slice_count]; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -72,46 +72,45 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - float *src_in = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count); - float *ref = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count + in_size); 
//input->data + in_size + float *src_in = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count); + float *ref = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count + + in_size); // input->data + in_size int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_strided_slice_init(input, output, ¶ms) == CSINN_TRUE) { - csi_strided_slice(input, output, ¶ms); + if (csinn_strided_slice_init(input, output, params) == CSINN_TRUE) { + csinn_strided_slice(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); @@ -119,8 +118,8 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.begin); - free(params.end); - free(params.stride); + free(params->begin); + free(params->end); + free(params->stride); return done_testing(); } diff --git a/tests/validation/strided_slice_u8.c b/tests/validation/strided_slice_u8.c index 5bf79d97..721c52fe 100644 --- a/tests/validation/strided_slice_u8.c +++ b/tests/validation/strided_slice_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct strided_slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct csinn_strided_slice_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,30 +39,29 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= 
input->dim[i]; } - params.slice_count = buffer[1+input->dim_count]; - params.begin = (int *)malloc(params.slice_count * sizeof(int)); - params.end = (int *)malloc(params.slice_count * sizeof(int)); - params.stride = (int *)malloc(params.slice_count * sizeof(int)); - for(int i = 0; i < params.slice_count; i++) { - params.begin[i] = buffer[2+input->dim_count+3*i]; - params.end[i] = buffer[3+input->dim_count+3*i]; - params.stride[i] = buffer[4+input->dim_count+3*i]; + params->slice_count = buffer[1 + input->dim_count]; + params->begin = (int *)malloc(params->slice_count * sizeof(int)); + params->end = (int *)malloc(params->slice_count * sizeof(int)); + params->stride = (int *)malloc(params->slice_count * sizeof(int)); + for (int i = 0; i < params->slice_count; i++) { + params->begin[i] = buffer[2 + input->dim_count + 3 * i]; + params->end[i] = buffer[3 + input->dim_count + 3 * i]; + params->stride[i] = buffer[4 + input->dim_count + 3 * i]; } output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { - if(i < params.slice_count) { - output->dim[i] = ceil((float)(params.end[i] - params.begin[i]) / params.stride[i]); + for (int i = 0; i < output->dim_count; i++) { + if (i < params->slice_count) { + output->dim[i] = ceil((float)(params->end[i] - params->begin[i]) / params->stride[i]); } else { output->dim[i] = input->dim[i]; } } - out_size = buffer[2+input->dim_count+3*params.slice_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = buffer[2 + input->dim_count + 3 * params->slice_count]; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -72,46 +72,45 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - float *src_in = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count); - float *ref = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count + in_size); //input->data + in_size + 
float *src_in = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count); + float *ref = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count + + in_size); // input->data + in_size uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_strided_slice_init(input, output, ¶ms) == CSINN_TRUE) { - csi_strided_slice(input, output, ¶ms); + if (csinn_strided_slice_init(input, output, params) == CSINN_TRUE) { + csinn_strided_slice(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); @@ -119,8 +118,8 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.begin); - free(params.end); - free(params.stride); + free(params->begin); + free(params->end); + free(params->stride); return done_testing(); } diff --git a/tests/validation/sub_f32.c b/tests/validation/sub_f32.c index d950a3c7..433685c5 100644 --- a/tests/validation/sub_f32.c +++ b/tests/validation/sub_f32.c @@ -16,34 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = 
buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -57,17 +57,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sub_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_sub(input0, input1, output, ¶ms); + if (csinn_sub_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_sub(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/sub_i8.c b/tests/validation/sub_i8.c index c04ee3cb..3dd43d1d 100644 --- a/tests/validation/sub_i8.c +++ b/tests/validation/sub_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,19 +38,17 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; - + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; - output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; @@ -73,36 +71,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float 
*)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -110,23 +106,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,16 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sub_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_sub(input0, input1, output, ¶ms); + if (csinn_sub_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_sub(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/sub_u8.c b/tests/validation/sub_u8.c index 6847c74e..e1bc25df 100644 --- a/tests/validation/sub_u8.c +++ b/tests/validation/sub_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,19 +38,17 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; - + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; - output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; @@ -73,36 +71,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float 
*)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -110,23 +106,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,16 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sub_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_sub(input0, input1, output, ¶ms); + if (csinn_sub_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_sub(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/sum_stride_f32.c b/tests/validation/sum_stride_f32.c index d61e2649..dacb804d 100644 --- a/tests/validation/sum_stride_f32.c +++ b/tests/validation/sum_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sum f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_sum_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sum(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_sum_init(input, output, params) == CSINN_TRUE) { + csinn_sum(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/sum_stride_u8.c b/tests/validation/sum_stride_u8.c index f7fb2fbf..a70322b1 100644 --- a/tests/validation/sum_stride_u8.c +++ 
b/tests/validation/sum_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sum u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_sum_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sum(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_sum_init(input, output, params) == CSINN_TRUE) { + csinn_sum(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tan_f32.c b/tests/validation/tan_f32.c index 4a1124a5..00de7619 100644 --- a/tests/validation/tan_f32.c +++ b/tests/validation/tan_f32.c @@ 
-16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tan f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_tan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tan(input, output, ¶ms); + if (csinn_tan_init(input, output, params) == CSINN_TRUE) { + csinn_tan(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/tan_i8.c b/tests/validation/tan_i8.c index 903386c0..8b4a9c07 100644 --- a/tests/validation/tan_i8.c +++ b/tests/validation/tan_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tan i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + 
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,9 +96,8 @@ int main(int argc, char** argv) // max error: 10000 for input [-1.57, 1.57] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_tan_init(input, output, &params) == CSINN_TRUE) { - csi_tan(input, output, &params); + if (csinn_tan_init(input, output, params) == CSINN_TRUE) { + csinn_tan(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tan_u8.c b/tests/validation/tan_u8.c index ff854708..f5f0c9b8 100644 --- a/tests/validation/tan_u8.c +++ b/tests/validation/tan_u8.c @@ -16,20 +16,20 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tan u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,9 +96,8 @@ int main(int argc, char** argv) // max error: 10000 for input [-1.57, 1.57] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_tan_init(input, output, &params) == CSINN_TRUE) { - csi_tan(input, output, &params); + if (csinn_tan_init(input, output, params) == CSINN_TRUE) { + csinn_tan(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tanh_f32.c b/tests/validation/tanh_f32.c index 58aed6f1..ae7c8576 100644 --- a/tests/validation/tanh_f32.c +++ b/tests/validation/tanh_f32.c @@ -16,25 +16,25 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -43,16 +43,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_tanh_init(input, output, &params) == CSINN_TRUE) { - csi_tanh(input, output, &params); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/tanh_i8.c b/tests/validation/tanh_i8.c index 7f990b3d..74a101e1 100644 --- a/tests/validation/tanh_i8.c +++ b/tests/validation/tanh_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,8 +54,7 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 +
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -65,23 +64,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,9 +95,8 @@ int main(int argc, char** argv) // max error: 0.4 for input [-100, 100] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_tanh_init(input, output, &params) == CSINN_TRUE) { - csi_tanh(input, output, &params); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tanh_u8.c b/tests/validation/tanh_u8.c index 18bdc788..99022d16 100644 --- a/tests/validation/tanh_u8.c +++ b/tests/validation/tanh_u8.c @@ -16,20 +16,20 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -49,13 +49,12 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -65,23 +64,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], 
input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,9 +95,8 @@ int main(int argc, char** argv) // max error: 0.4 for input [-100, 100] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_tanh_init(input, output, &params) == CSINN_TRUE) { - csi_tanh(input, output, &params); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/threshold_relu_f32.c b/tests/validation/threshold_relu_f32.c index 8f5ebcda..1d2d13f0 100644 --- a/tests/validation/threshold_relu_f32.c +++ b/tests/validation/threshold_relu_f32.c @@ -16,27 +16,27 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of threshold relu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.n = *(float *)&buffer[4]; // theta + params->n = *(float *)&buffer[4]; // theta in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_threshold_relu_init(input, output, &params) == CSINN_TRUE) { - csi_threshold_relu(input, output, &params); + if (csinn_threshold_relu_init(input, output, params) == CSINN_TRUE) { + csinn_threshold_relu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/threshold_relu_i8.c b/tests/validation/threshold_relu_i8.c index f2d14bdb..640df0be 100644 --- a/tests/validation/threshold_relu_i8.c +++ b/tests/validation/threshold_relu_i8.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of threshold relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,34 +55,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.n = *(float *)&buffer[4]; // theta + params->base.api =
CSINN_API; + params->n = *(float *)&buffer[4]; // theta in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -90,15 +90,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + input->data = input_tmp; + reference->data = ref; + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_threshold_relu_init(input, output, &params) == CSINN_TRUE) { - csi_threshold_relu(input, output, &params); + if (csinn_threshold_relu_init(input, output, params) == CSINN_TRUE) { + csinn_threshold_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/threshold_relu_u8.c b/tests/validation/threshold_relu_u8.c index 17deff20..a766c9e8 100644 --- a/tests/validation/threshold_relu_u8.c +++ b/tests/validation/threshold_relu_u8.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of threshold relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,34 +55,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.n = *(float *)&buffer[4];
// theta + params->base.api = CSINN_API; + params->n = *(float *)&buffer[4]; // theta in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -90,15 +90,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + input->data = input_tmp; + reference->data = ref; + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_threshold_relu_init(input, output, &params) == CSINN_TRUE) { - csi_threshold_relu(input, output, &params); + if (csinn_threshold_relu_init(input, output, params) == CSINN_TRUE) { + csinn_threshold_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/tile_f32.c b/tests/validation/tile_f32.c index 8ef5641a..5c4e3344 100644 --- a/tests/validation/tile_f32.c +++ b/tests/validation/tile_f32.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tile f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct tile_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tile_params *params = csinn_alloc_params(sizeof(struct csinn_tile_params), NULL); int in_size = 1; int out_size = 1; @@ -37,30 +37,29 @@ int main(int argc, char** argv) input->dim_count = buffer[0]; output->dim_count = input->dim_count; - params.reps_num = buffer[0]; + params->reps_num = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.reps = (int *)malloc(params.reps_num * sizeof(int)); - for(int i = 0; i < params.reps_num; i++) { - params.reps[i] = buffer[i+1+input->dim_count]; - output->dim[i] = input->dim[i] * params.reps[i]; - out_size *= params.reps[i]; + params->reps =
(int *)malloc(params->reps_num * sizeof(int)); + for (int i = 0; i < params->reps_num; i++) { + params->reps[i] = buffer[i + 1 + input->dim_count]; + output->dim[i] = input->dim[i] * params->reps[i]; + out_size *= params->reps[i]; } out_size = out_size * in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 1 + input->dim_count + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); input->dtype = CSINN_DTYPE_FLOAT32; - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_tile_init(input, output, &params) == CSINN_TRUE) { - csi_tile(input, output, &params); + if (csinn_tile_init(input, output, params) == CSINN_TRUE) { + csinn_tile(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/tile_i8.c b/tests/validation/tile_i8.c index 9c0a8031..f724d0a1 100644 --- a/tests/validation/tile_i8.c +++ b/tests/validation/tile_i8.c @@ -16,44 +16,42 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tile i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct tile_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tile_params *params = csinn_alloc_params(sizeof(struct csinn_tile_params), NULL); int in_size = 1; int out_size = 1; float max_error = 0.0f; - int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - params.reps_num = buffer[0]; + params->reps_num = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.reps = (int *)malloc(params.reps_num * sizeof(int)); - for(int i = 0; i < params.reps_num; i++) { - params.reps[i] = buffer[i+1+input->dim_count]; - output->dim[i] = input->dim[i] * params.reps[i]; - out_size *= params.reps[i]; + params->reps = (int *)malloc(params->reps_num * sizeof(int)); + for (int i = 0; i < params->reps_num; i++) { + params->reps[i] = buffer[i + 1 + input->dim_count]; + output->dim[i] = input->dim[i] * params->reps[i]; + out_size *= params->reps[i]; } out_size = out_size * in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -64,30 +62,30 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - float *src_in = (float 
*)(buffer + 1 + input->dim_count + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,14 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_tile_init(input, output, &params) == CSINN_TRUE) { - csi_tile(input, output, &params); + if (csinn_tile_init(input, output, params) == CSINN_TRUE) { + csinn_tile(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tile_u8.c b/tests/validation/tile_u8.c index a1bd90ec..e97e52f5 100644 --- a/tests/validation/tile_u8.c +++ b/tests/validation/tile_u8.c @@ -16,78 +16,76 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tile u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct tile_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tile_params *params = csinn_alloc_params(sizeof(struct csinn_tile_params), NULL); int in_size = 1; int out_size = 1; float max_error = 0.0f; - int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - params.reps_num = buffer[0]; + params->reps_num = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.reps = (int *)malloc(params.reps_num * sizeof(int)); - for(int i = 0; i < params.reps_num; i++) { - params.reps[i] = buffer[i+1+input->dim_count]; - output->dim[i] = input->dim[i] * params.reps[i]; - out_size *= params.reps[i]; + params->reps = (int *)malloc(params->reps_num * sizeof(int)); + for (int i = 0; i <
params->reps_num; i++) { + params->reps[i] = buffer[i + 1 + input->dim_count]; + output->dim[i] = input->dim[i] * params->reps[i]; + out_size *= params->reps[i]; } out_size = out_size * in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - float *src_in = (float *)(buffer + 1 + input->dim_count + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,14 +93,14 @@ int main(int argc, char** argv) output->data = ref; 
get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_tile_init(input, output, &params) == CSINN_TRUE) { - csi_tile(input, output, &params); + if (csinn_tile_init(input, output, params) == CSINN_TRUE) { + csinn_tile(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/topk_f32.c b/tests/validation/topk_f32.c index 3db6a576..5fef67c0 100644 --- a/tests/validation/topk_f32.c +++ b/tests/validation/topk_f32.c @@ -16,59 +16,59 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of topk f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output1 = csi_alloc_tensor(NULL); - struct csi_tensor *output2 = csi_alloc_tensor(NULL); - struct csi_tensor *reference1 = csi_alloc_tensor(NULL); - struct csi_tensor *reference2 = csi_alloc_tensor(NULL); - struct topk_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output2 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference2 = csinn_alloc_tensor(NULL); + struct csinn_topk_params *params = csinn_alloc_params(sizeof(struct csinn_topk_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - params.k = buffer[0]; + params->k = buffer[0]; input->dim_count = buffer[1]; output1->dim_count = input->dim_count; output2->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++)
{ + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output1->dim[i] = input->dim[i]; output2->dim[i] = input->dim[i]; in_size *= input->dim[i]; } - output1->dim[output1->dim_count - 1] = params.k; // values last dim = k - output2->dim[output2->dim_count - 1] = params.k; // indices last dim = k + output1->dim[output1->dim_count - 1] = params->k; // values last dim = k + output2->dim[output2->dim_count - 1] = params->k; // indices last dim = k - out_size = in_size / input->dim[input->dim_count - 1] * params.k; + out_size = in_size / input->dim[input->dim_count - 1] * params->k; input->dtype = CSINN_DTYPE_FLOAT32; output1->dtype = CSINN_DTYPE_FLOAT32; output2->dtype = CSINN_DTYPE_INT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count); reference1->data = (float *)(buffer + 2 + input->dim_count + in_size); reference2->data = (int *)(buffer + 2 + input->dim_count + in_size + out_size); - output1->data = (float *)malloc(out_size * sizeof(float)); - output2->data = (int *)malloc(out_size * sizeof(int)); + output1->data = (float *)malloc(out_size * sizeof(float)); + output2->data = (int *)malloc(out_size * sizeof(int)); float difference1 = argc > 2 ? atof(argv[2]) : 1e-6; float difference2 = argc > 3 ? 
atof(argv[3]) : 0; - if (csi_topk_init(input, output1, output2, &params) == CSINN_TRUE) { - csi_topk(input, output1, output2, &params); + if (csinn_topk_init(input, output1, output2, params) == CSINN_TRUE) { + csinn_topk(input, output1, output2, params); } - result_verify_f32((float *)reference1->data, output1->data, input->data, difference1, out_size, false); + result_verify_f32((float *)reference1->data, output1->data, input->data, difference1, out_size, + false); result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, false); free(buffer); diff --git a/tests/validation/topk_i8.c b/tests/validation/topk_i8.c index 83eabced..ab56d0e6 100644 --- a/tests/validation/topk_i8.c +++ b/tests/validation/topk_i8.c @@ -16,38 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of topk i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output1 = csi_alloc_tensor(NULL); - struct csi_tensor *output2 = csi_alloc_tensor(NULL); - struct csi_tensor *reference1 = csi_alloc_tensor(NULL); - struct csi_tensor *reference2 = csi_alloc_tensor(NULL); - struct topk_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output2 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference2 = csinn_alloc_tensor(NULL); + struct csinn_topk_params *params = csinn_alloc_params(sizeof(struct csinn_topk_params), NULL); int in_size = 1, out_size = 1; float error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - params.k = buffer[0]; + params->k = buffer[0]; input->dim_count = buffer[1]; output1->dim_count = input->dim_count;
output2->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output1->dim[i] = input->dim[i]; output2->dim[i] = input->dim[i]; in_size *= input->dim[i]; } - out_size = in_size / input->dim[input->dim_count - 1] * params.k; + out_size = in_size / input->dim[input->dim_count - 1] * params->k; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -62,41 +62,41 @@ int main(int argc, char** argv) output2->layout = CSINN_LAYOUT_NCHW; output2->is_const = 0; output2->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 2 + input->dim_count); float *ref_data1 = (float *)(buffer + 2 + input->dim_count + in_size); - int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); + int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); int8_t *input_data = (int8_t *)malloc(in_size * sizeof(int8_t)); input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - 
output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - // if (input->dim_count == 1 && params.k == 1) Follow the input scale and zero_point - if(input->dim_count != 1 || params.k != 1) { + // if (input->dim_count == 1 && params->k == 1) Follow the input scale and zero_point + if (input->dim_count != 1 || params->k != 1) { output1->data = ref_data1; get_quant_info(output1); } else { @@ -113,8 +113,8 @@ int main(int argc, char** argv) float difference2 = argc > 3 ? atof(argv[3]) : 0; printf("The max error is %.6lf.\n", error); - if (csi_topk_init(input, output1, output2, &params) == CSINN_TRUE) { - csi_topk(input, output1, output2, &params); + if (csinn_topk_init(input, output1, output2, params) == CSINN_TRUE) { + csinn_topk(input, output1, output2, params); } result_verify_8(reference1->data, output1, input->data, difference1, out_size, false); @@ -123,7 +123,8 @@ int main(int argc, char** argv) they all quantized by [200, 200] so their output_indices are reversed */ - // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, false); + // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, + // false); free(buffer); free(output1->data); diff --git a/tests/validation/topk_u8.c b/tests/validation/topk_u8.c index dbcb317d..a8916e93 100644 --- a/tests/validation/topk_u8.c +++ b/tests/validation/topk_u8.c @@ -16,38 +16,38 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of topk u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output1 = csi_alloc_tensor(NULL); - struct csi_tensor *output2 = csi_alloc_tensor(NULL); - struct csi_tensor *reference1 = csi_alloc_tensor(NULL); - struct csi_tensor *reference2 = csi_alloc_tensor(NULL); - struct topk_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output2 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference2 = csinn_alloc_tensor(NULL); + struct csinn_topk_params *params = csinn_alloc_params(sizeof(struct csinn_topk_params), NULL); int in_size = 1, out_size = 1; float error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - params.k = buffer[0]; + params->k = buffer[0]; input->dim_count = buffer[1]; output1->dim_count = input->dim_count; output2->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output1->dim[i] = input->dim[i]; output2->dim[i] = input->dim[i]; in_size *= input->dim[i]; } - out_size = in_size / input->dim[input->dim_count - 1] * params.k; + out_size = in_size / input->dim[input->dim_count - 1] * params->k; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,47 +58,46 @@ int main(int argc, char** argv) output1->is_const = 0; output1->quant_channel = 1; - output2->dtype = CSINN_DTYPE_INT32; output2->layout = CSINN_LAYOUT_NCHW; output2->is_const = 0; output2->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; float *src_in_data = (float *)(buffer + 2 + input->dim_count); float *ref_data1 = (float *)(buffer + 2 + input->dim_count + in_size); - int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); + int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); uint8_t *input_data = (uint8_t *)malloc(in_size * sizeof(uint8_t)); input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - // if (input->dim_count == 1 && params.k == 1) Follow the input scale and zero_point - if(input->dim_count != 1 || params.k != 1) { - output1->data= ref_data1; + // if (input->dim_count == 1 && params->k == 1) Follow the input scale and zero_point + if (input->dim_count != 1 || params->k != 1) { + output1->data = ref_data1; get_quant_info(output1); } else { output1->qinfo = input->qinfo; @@ -114,8 +113,8 @@ int main(int argc, char** argv) float difference2 = argc > 3 ? 
atof(argv[3]) : 0; printf("The max error is %.6lf.\n", error); - if (csi_topk_init(input, output1, output2, &params) == CSINN_TRUE) { - csi_topk(input, output1, output2, &params); + if (csinn_topk_init(input, output1, output2, params) == CSINN_TRUE) { + csinn_topk(input, output1, output2, params); } result_verify_8(reference1->data, output1, input->data, difference1, out_size, false); @@ -124,7 +123,8 @@ int main(int argc, char** argv) they all quantized by [200, 200] so their output_indices are reversed */ - // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, false); + // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, + // false); free(buffer); free(output1->data); diff --git a/tests/validation/transpose_f32.c b/tests/validation/transpose_f32.c index 788ff335..984dd98b 100644 --- a/tests/validation/transpose_f32.c +++ b/tests/validation/transpose_f32.c @@ -16,31 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct transpose_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim_count = buffer[0]; // input->dim_count == 4 + input->dim_count = buffer[0]; // input->dim_count == 4 output->dim_count = input->dim_count; int32_t *perm =
(int32_t *)malloc(input->dim_count * sizeof(int32_t)); - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i + 1]; - perm[i] = buffer[input->dim_count + i + 1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; + perm[i] = buffer[input->dim_count + i + 1]; output->dim[i] = buffer[2 * input->dim_count + i + 1]; in_size *= input->dim[i]; } @@ -48,19 +49,18 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.permute = perm; - params.permute_num = input->dim_count; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->permute = perm; + params->permute_num = input->dim_count; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (float *)(buffer + 1 + input->dim_count * 3); + input->data = (float *)(buffer + 1 + input->dim_count * 3); reference->data = (float *)(buffer + 1 + input->dim_count * 3 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_transpose_init(input, output, &params) == CSINN_TRUE) { - csi_transpose(input, output, &params); + if (csinn_transpose_init(input, output, params) == CSINN_TRUE) { + csinn_transpose(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/transpose_i8.c b/tests/validation/transpose_i8.c index ceb62bee..a28c7b25 100644 --- a/tests/validation/transpose_i8.c +++ b/tests/validation/transpose_i8.c @@ -16,31 +16,32 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct transpose_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim_count = buffer[0]; // input->dim_count == 4 + input->dim_count = buffer[0]; // input->dim_count == 4 output->dim_count = input->dim_count; int32_t *perm = (int32_t *)malloc(input->dim_count * sizeof(int32_t)); - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i + 1]; - perm[i] = buffer[input->dim_count + i + 1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; + perm[i] = buffer[input->dim_count + i + 1]; output->dim[i] = buffer[2 * input->dim_count + i + 1]; in_size *= input->dim[i]; } @@ -55,36 +56,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.permute = perm; - params.permute_num = input->dim_count; - params.base.layout = CSINN_LAYOUT_NCHW; - - float *src_in = (float *)(buffer + 1 + input->dim_count * 3); - float *ref = (float *)(buffer + 1 + input->dim_count * 3 + in_size); + params->base.api = CSINN_API; + params->permute = perm; + params->permute_num = input->dim_count; + params->base.layout = 
CSINN_LAYOUT_NCHW; + + float *src_in = (float *)(buffer + 1 + input->dim_count * 3); + float *ref = (float *)(buffer + 1 + input->dim_count * 3 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(int8_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -92,14 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_transpose_init(input, output, &params) == CSINN_TRUE) { - csi_transpose(input, output, &params); + if (csinn_transpose_init(input, output, params) == CSINN_TRUE) { + csinn_transpose(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/transpose_u8.c b/tests/validation/transpose_u8.c index ce86a52f..47401d80 100644 --- a/tests/validation/transpose_u8.c +++ b/tests/validation/transpose_u8.c @@ -16,31 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct transpose_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim_count = buffer[0]; // input->dim_count == 4 + input->dim_count = buffer[0]; // input->dim_count == 4 output->dim_count = input->dim_count; int32_t *perm = (int32_t *)malloc(input->dim_count * sizeof(int32_t)); - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i + 1]; - perm[i] = buffer[input->dim_count + i + 1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; + perm[i] = buffer[input->dim_count + i + 1]; output->dim[i] = buffer[2 * input->dim_count + i + 1]; in_size *= input->dim[i]; } @@ -55,37 +56,35 @@ int
main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.permute = perm; - params.permute_num = input->dim_count; - params.base.layout = CSINN_LAYOUT_NCHW; - - - float *src_in = (float *)(buffer + 1 + input->dim_count * 3); - float *ref = (float *)(buffer + 1 + input->dim_count * 3 + in_size); + params->base.api = CSINN_API; + params->permute = perm; + params->permute_num = input->dim_count; + params->base.layout = CSINN_LAYOUT_NCHW; + + float *src_in = (float *)(buffer + 1 + input->dim_count * 3); + float *ref = (float *)(buffer + 1 + input->dim_count * 3 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(uint8_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,14 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = 
malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_transpose_init(input, output, &params) == CSINN_TRUE) { - csi_transpose(input, output, &params); + if (csinn_transpose_init(input, output, params) == CSINN_TRUE) { + csinn_transpose(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/trunc_f32.c b/tests/validation/trunc_f32.c index b137c48e..200ca2d7 100644 --- a/tests/validation/trunc_f32.c +++ b/tests/validation/trunc_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of trunc f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] *
input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_trunc_init(input, output, &params) == CSINN_TRUE) { - csi_trunc(input, output, &params); + if (csinn_trunc_init(input, output, params) == CSINN_TRUE) { + csinn_trunc(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/trunc_i8.c b/tests/validation/trunc_i8.c index a2c40b82..fc96f8fb 100644 --- a/tests/validation/trunc_i8.c +++ b/tests/validation/trunc_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of trunc i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0];
+ input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,36 +57,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data 
= malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_trunc_init(input, output, &params) == CSINN_TRUE) { - csi_trunc(input, output, &params); + if (csinn_trunc_init(input, output, params) == CSINN_TRUE) { + csinn_trunc(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/trunc_u8.c b/tests/validation/trunc_u8.c index 79468421..fd229b63 100644 --- a/tests/validation/trunc_u8.c +++ b/tests/validation/trunc_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of trunc u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,36 +57,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW;
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_trunc_init(input, output, &params) == CSINN_TRUE) { - csi_trunc(input, output, &params); + if (csinn_trunc_init(input, output, params) == CSINN_TRUE) { + csinn_trunc(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/unsorted_segment_max_f32.c b/tests/validation/unsorted_segment_max_f32.c index 76743186..773e5830 100644 --- a/tests/validation/unsorted_segment_max_f32.c +++ b/tests/validation/unsorted_segment_max_f32.c @@ -16,53 +16,53 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment max f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] =
buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_max_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_max(input, segment, output, &params); - } + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_max_i8.c b/tests/validation/unsorted_segment_max_i8.c index 099d168d..208f2500 100644 --- a/tests/validation/unsorted_segment_max_i8.c +++ b/tests/validation/unsorted_segment_max_i8.c @@ -16,76 +16,78 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment max i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; - input->quant_channel = 1; + input->quant_channel = 1; output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + 
params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -93,23 +95,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == -FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == -FLT_MAX) { ref[i] = min_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data 
= malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_max_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_max(input, segment, output, &params); + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_max_u8.c b/tests/validation/unsorted_segment_max_u8.c index d2eabbe4..0260f46e 100644 --- a/tests/validation/unsorted_segment_max_u8.c +++ b/tests/validation/unsorted_segment_max_u8.c @@ -16,76 +16,78 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment max u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; -
input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float 
output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -93,23 +95,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == -FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == -FLT_MAX) { ref[i] = min_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_max_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_max(input, segment, output, &params); + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_mean_f32.c b/tests/validation/unsorted_segment_mean_f32.c index ea437560..59219d0f 100644 --- a/tests/validation/unsorted_segment_mean_f32.c +++ b/tests/validation/unsorted_segment_mean_f32.c @@ -16,53 +16,53 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment mean f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_mean_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, &params); - } + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_mean_i8.c b/tests/validation/unsorted_segment_mean_i8.c index 4dbf2bb8..e074efe1 100644 --- a/tests/validation/unsorted_segment_mean_i8.c +++ b/tests/validation/unsorted_segment_mean_i8.c @@ -16,33 +16,34 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment mean i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -54,37 +55,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -92,17 +94,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_mean_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, &params); + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_mean_u8.c b/tests/validation/unsorted_segment_mean_u8.c index 5cd6241e..0aeb18ca 100644 --- a/tests/validation/unsorted_segment_mean_u8.c +++ b/tests/validation/unsorted_segment_mean_u8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment mean u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] =
buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -54,37 +55,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + 
if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -92,17 +94,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_mean_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, &params); + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_min_f32.c b/tests/validation/unsorted_segment_min_f32.c index 80e7b685..aa2a0c30 100644 --- a/tests/validation/unsorted_segment_min_f32.c +++ b/tests/validation/unsorted_segment_min_f32.c @@ -16,53 +16,53 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment min f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_min_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_min(input, segment, output, &params); - } + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_min_i8.c b/tests/validation/unsorted_segment_min_i8.c index 70dc34bb..c4f2af6e 100644 --- a/tests/validation/unsorted_segment_min_i8.c +++ b/tests/validation/unsorted_segment_min_i8.c @@ -16,33 +16,34 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment min i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -54,37 +55,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -92,23 +94,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == FLT_MAX) { ref[i] = output->qinfo->max; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = 
malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_min_u8.c b/tests/validation/unsorted_segment_min_u8.c index 60c1ce74..eed14d40 100644 --- a/tests/validation/unsorted_segment_min_u8.c +++ b/tests/validation/unsorted_segment_min_u8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment min u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = 
buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -54,37 +55,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -92,23 +94,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == FLT_MAX) { ref[i] = output->qinfo->max; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_prod_f32.c b/tests/validation/unsorted_segment_prod_f32.c index 0ee065ea..18a239c2 100644 --- a/tests/validation/unsorted_segment_prod_f32.c +++ b/tests/validation/unsorted_segment_prod_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment prod f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); - } + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_prod_i8.c b/tests/validation/unsorted_segment_prod_i8.c index a22b9a49..2316c288 100644 --- a/tests/validation/unsorted_segment_prod_i8.c +++ b/tests/validation/unsorted_segment_prod_i8.c @@ -16,33 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment prod i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -54,57 +55,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - error = error * pow(abs(input->qinfo->max), input->dim[0] - params.num_segments + 1); + error = error * pow(abs(input->qinfo->max), input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float 
difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_prod_u8.c b/tests/validation/unsorted_segment_prod_u8.c index 5d4f468c..a45d8e64 100644 --- a/tests/validation/unsorted_segment_prod_u8.c +++ b/tests/validation/unsorted_segment_prod_u8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment prod u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = 
buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -54,57 +55,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = 
fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - error = error * pow(abs(input->qinfo->max), input->dim[0] - params.num_segments + 1); + error = error * pow(abs(input->qinfo->max), input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_sum_f32.c b/tests/validation/unsorted_segment_sum_f32.c index 84c365fc..d8f3ab5b 100644 --- a/tests/validation/unsorted_segment_sum_f32.c +++ b/tests/validation/unsorted_segment_sum_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment sum f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_sum_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, ¶ms); - } + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_sum_i8.c b/tests/validation/unsorted_segment_sum_i8.c index 911dc5e8..9a920272 100644 --- a/tests/validation/unsorted_segment_sum_i8.c +++ b/tests/validation/unsorted_segment_sum_i8.c @@ -16,33 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment sum i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -54,56 +55,56 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } /* sum */ - error = error * (input->dim[0] - params.num_segments + 1); + error = error * (input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_sum_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, ¶ms); + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_sum_u8.c b/tests/validation/unsorted_segment_sum_u8.c index 8ae7236f..1d45af6b 100644 --- a/tests/validation/unsorted_segment_sum_u8.c +++ b/tests/validation/unsorted_segment_sum_u8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment sum u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = 
buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -54,56 +55,56 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + 
if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } /* sum */ - error = error * (input->dim[0] - params.num_segments + 1); + error = error * (input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_sum_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, &params); + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unstack_f32.c b/tests/validation/unstack_f32.c index a260065f..170a3717 100644 --- a/tests/validation/unstack_f32.c +++ b/tests/validation/unstack_f32.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unstack f32.\n"); @@ -30,55 +30,54 @@ int main(int argc, char** argv) int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - struct unstack_params params; - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - params.axis = buffer[0]; + struct csinn_unstack_params *params = + csinn_alloc_params(sizeof(struct csinn_unstack_params), NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + params->axis = buffer[0]; input->dim_count = buffer[1]; input->dtype = CSINN_DTYPE_FLOAT32; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } - params.outputs_count = buffer[2 + params.axis]; - struct csi_tensor *output[params.outputs_count]; - for (int i = 0; i < params.outputs_count; i++) { - output[i] = csi_alloc_tensor(NULL); + params->outputs_count = buffer[2 + params->axis]; + struct csinn_tensor *output[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + output[i] = csinn_alloc_tensor(NULL); output[i]->dim_count = input->dim_count - 1; output[i]->dtype = CSINN_DTYPE_FLOAT32; - for(int j = 0; j < input->dim_count; j++) { - if(j < params.axis) { + for (int j = 0; j < input->dim_count; j++) { + if (j < params->axis) { output[i]->dim[j] = input->dim[j]; - } else if(j > params.axis) { - output[i]->dim[j-1] = input->dim[j]; + } else if (j > params->axis) { + output[i]->dim[j - 1] = input->dim[j]; } } } - out_size = in_size / params.outputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size / params->outputs_count; + 
params->base.api = CSINN_API; input->data = (float *)(buffer + 2 + input->dim_count); reference->data = (float *)(buffer + 2 + input->dim_count + in_size); - - for(int i = 0; i < params.outputs_count; i++) { - output[i]->data = (float *)malloc(out_size * sizeof(float)); + for (int i = 0; i < params->outputs_count; i++) { + output[i]->data = (float *)malloc(out_size * sizeof(float)); } float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_unstack_init(input, output, &params) == CSINN_TRUE) { - csi_unstack(input, output, &params); + if (csinn_unstack_init(input, output, params) == CSINN_TRUE) { + csinn_unstack(input, output, params); } float *ref_addr = (float *)reference->data; - for(int i = 0; i < params.outputs_count; i++) { + for (int i = 0; i < params->outputs_count; i++) { result_verify_f32(ref_addr, output[i]->data, input->data, difference, out_size, false); ref_addr += out_size; } free(buffer); - for(int i = 0; i < params.outputs_count; i++) { + for (int i = 0; i < params->outputs_count; i++) { free(output[i]->data); output[i]->data = NULL; } diff --git a/tests/validation/unstack_i8.c b/tests/validation/unstack_i8.c index 5c7711f3..78fd8c01 100644 --- a/tests/validation/unstack_i8.c +++ b/tests/validation/unstack_i8.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unstack i8.\n"); @@ -30,84 +30,82 @@ int main(int argc, char** argv) int out_size = 1; float max_error = 0.05f; - int *buffer = read_input_data_f32(argv[1]); - struct unstack_params params; - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - params.axis = buffer[0]; + struct csinn_unstack_params *params = + csinn_alloc_params(sizeof(struct csinn_unstack_params), NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + params->axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[2+i]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } - params.outputs_count = input->dim[params.axis]; - struct csi_tensor *output[params.outputs_count]; - for (int i = 0; i < params.outputs_count; i++) { - output[i] = csi_alloc_tensor(NULL); + params->outputs_count = input->dim[params->axis]; + struct csinn_tensor *output[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + output[i] = csinn_alloc_tensor(NULL); output[i]->dim_count = input->dim_count - 1; output[i]->dtype = CSINN_DTYPE_INT8; output[i]->layout = CSINN_LAYOUT_NCHW; output[i]->is_const = 0; output[i]->quant_channel = 1; - for(int j = 0; j < input->dim_count; j++) { - if(j < params.axis) { + for (int j = 0; j < input->dim_count; j++) { + if (j < params->axis) { output[i]->dim[j] = input->dim[j]; - } else if(j > params.axis) { - output[i]->dim[j-1] = input->dim[j]; + } else if (j > params->axis) { + output[i]->dim[j - 1] = input->dim[j]; } } } - float *src_out[params.outputs_count]; + float 
*src_out[params->outputs_count]; - out_size = in_size / params.outputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size / params->outputs_count; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } - for(int i = 0; i < params.outputs_count; i++) { - src_out[i] = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); + for (int i = 0; i < params->outputs_count; i++) { + src_out[i] = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); } - - for(int j = 0; j < params.outputs_count; j++) { + for (int j = 0; j < params->outputs_count; j++) { output[j]->data = src_out[j]; get_quant_info(output[j]); output[j]->dtype = CSINN_DTYPE_INT8; - output[j]->data = malloc(out_size * sizeof(char)); - } + output[j]->data = malloc(out_size * sizeof(char)); + } - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_unstack_init(input, output, &params) == CSINN_TRUE) { - csi_unstack(input, output, &params); + if (csinn_unstack_init(input, output, params) == CSINN_TRUE) { + csinn_unstack(input, output, params); } float *ref_addr = (float *)reference->data; - for(int i = 0; i < params.outputs_count; i++) { - result_verify_8(ref_addr, output[i], input->data, difference, out_size, false); + for (int i = 0; i < params->outputs_count; i++) { + result_verify_8(ref_addr, output[i], input->data, difference, out_size, false); ref_addr += out_size; } free(buffer); - for(int i = 0; i < params.outputs_count; i++) { + for (int i = 0; i < params->outputs_count; i++) { free(output[i]->data); output[i]->data = NULL; } diff --git a/tests/validation/unstack_u8.c b/tests/validation/unstack_u8.c index 785c3376..cb716043 100644 --- a/tests/validation/unstack_u8.c +++ b/tests/validation/unstack_u8.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unstack u8.\n"); @@ -30,89 +30,86 @@ int main(int argc, char** argv) int out_size = 1; float max_error = 0.05f; - int *buffer = read_input_data_f32(argv[1]); - struct unstack_params params; - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - params.axis = buffer[0]; + struct csinn_unstack_params *params = + csinn_alloc_params(sizeof(struct csinn_unstack_params), NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + params->axis = buffer[0]; input->dim_count = buffer[1]; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[2+i]; + for (int i = 
0; i 
< input->dim_count; i++) { + input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } - params.outputs_count = input->dim[params.axis]; - struct csi_tensor *output[params.outputs_count]; - for (int i = 0; i < params.outputs_count; i++) { - output[i] = csi_alloc_tensor(NULL); + params->outputs_count = input->dim[params->axis]; + struct csinn_tensor *output[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + output[i] = csinn_alloc_tensor(NULL); output[i]->dim_count = input->dim_count - 1; output[i]->dtype = CSINN_DTYPE_UINT8; output[i]->layout = CSINN_LAYOUT_NCHW; output[i]->is_const = 0; output[i]->quant_channel = 1; - for(int j = 0; j < input->dim_count; j++) { - if(j < params.axis) { + for (int j = 0; j < input->dim_count; j++) { + if (j < params->axis) { output[i]->dim[j] = input->dim[j]; - } else if(j > params.axis) { - output[i]->dim[j-1] = input->dim[j]; + } else if (j > params->axis) { + output[i]->dim[j - 1] = input->dim[j]; } } } - float *src_out[params.outputs_count]; + float *src_out[params->outputs_count]; - out_size = in_size / params.outputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size / params->outputs_count; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - for(int i = 0; i < params.outputs_count; i++) { - 
src_out[i] = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); + for (int i = 0; i < params->outputs_count; i++) { + src_out[i] = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); } - - for(int j = 0; j < params.outputs_count; j++) { + for (int j = 0; j < params->outputs_count; j++) { output[j]->data = src_out[j]; get_quant_info(output[j]); output[j]->dtype = CSINN_DTYPE_UINT8; - output[j]->data = malloc(out_size * sizeof(char)); - } + output[j]->data = malloc(out_size * sizeof(char)); + } - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_unstack_init(input, output, &params) == CSINN_TRUE) { - csi_unstack(input, output, &params); + if (csinn_unstack_init(input, output, params) == CSINN_TRUE) { + csinn_unstack(input, output, params); } float *ref_addr = (float *)reference->data; - for(int i = 0; i < params.outputs_count; i++) { - result_verify_8(ref_addr, output[i], input->data, difference, out_size, false); + for (int i = 0; i < params->outputs_count; i++) { + result_verify_8(ref_addr, output[i], input->data, difference, out_size, false); ref_addr += out_size; } free(buffer); - for(int i = 0; i < params.outputs_count; i++) { + for (int i = 0; i < params->outputs_count; i++) { free(output[i]->data); output[i]->data = NULL; } diff --git a/tests/validation/xor_u32.c b/tests/validation/xor_u32.c index 4bfae44d..f7c18f57 100644 --- a/tests/validation/xor_u32.c +++ b/tests/validation/xor_u32.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of xor u32.\n"); - struct csi_tensor *input_0 = csi_alloc_tensor(NULL); - struct csi_tensor *input_1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input_0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input_1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input_0->dim_count = buffer[0]; input_1->dim_count = buffer[0]; output->dim_count = input_0->dim_count; - for(int i = 0; i < input_0->dim_count; i++) { + for (int i = 0; i < input_0->dim_count; i++) { input_0->dim[i] = buffer[i + 1]; input_1->dim[i] = buffer[i + 1]; output->dim[i] = input_0->dim[i]; @@ -48,17 +48,16 @@ int main(int argc, char** argv) input_0->dtype = CSINN_DTYPE_UINT32; input_1->dtype = CSINN_DTYPE_UINT32; output->dtype = CSINN_DTYPE_UINT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); - input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); + input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); + input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input_0->dim_count + 2 * in_size); - output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); + output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); float 
difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_xor_init(input_0, input_1, output, &params) == CSINN_TRUE) { - csi_xor(input_0, input_1, output, &params); + if (csinn_xor_init(input_0, input_1, output, params) == CSINN_TRUE) { + csinn_xor(input_0, input_1, output, params); } result_verify_int32(reference->data, output->data, input_0->data, difference, out_size, false); diff --git a/tests/validation/yuv_rgb_scale_f32.c b/tests/validation/yuv_rgb_scale_f32.c index df6d4b90..b180e9f2 100644 --- a/tests/validation/yuv_rgb_scale_f32.c +++ b/tests/validation/yuv_rgb_scale_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of yuv2rgb f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = 3; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = 3; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,17 +48,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3); - reference->data = (float *)(buffer + 3 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 3); + reference->data = (float *)(buffer + 3 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_yuv_rgb_scale_init(input, output, &params) == CSINN_TRUE) { - csi_yuv_rgb_scale(input, output, &params); - } + if (csinn_yuv_rgb_scale_init(input, output, params) == CSINN_TRUE) { + csinn_yuv_rgb_scale(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/yuv_rgb_scale_i8.c b/tests/validation/yuv_rgb_scale_i8.c index 58c5449b..be7d3096 100644 --- a/tests/validation/yuv_rgb_scale_i8.c +++ b/tests/validation/yuv_rgb_scale_i8.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of yuv2rgb i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = 3; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = 3; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,34 +49,32 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_INT8; output->dtype = CSINN_DTYPE_INT8; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 3); - float *ref = (float *)(buffer + 3 + in_size); + float *src_in = (float *)(buffer + 3); + float *ref = (float *)(buffer + 3 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error 
*/ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -84,16 +82,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_yuv_rgb_scale_init(input, output, &params) == CSINN_TRUE) { - csi_yuv_rgb_scale(input, output, &params); - } + if (csinn_yuv_rgb_scale_init(input, output, params) == CSINN_TRUE) { + csinn_yuv_rgb_scale(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/yuv_rgb_scale_u8.c b/tests/validation/yuv_rgb_scale_u8.c index 97e28016..a6790dde 100644 --- a/tests/validation/yuv_rgb_scale_u8.c +++ b/tests/validation/yuv_rgb_scale_u8.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of yuv2rgb u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = 3; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = 3; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,34 +49,32 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_UINT8; output->dtype = CSINN_DTYPE_UINT8; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 3); - float *ref = (float *)(buffer + 3 + in_size); + float *src_in = (float *)(buffer + 3); + float *ref = (float *)(buffer + 3 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize 
error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -84,16 +82,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_yuv_rgb_scale_init(input, output, &params) == CSINN_TRUE) { - csi_yuv_rgb_scale(input, output, &params); - } + if (csinn_yuv_rgb_scale_init(input, output, params) == CSINN_TRUE) { + csinn_yuv_rgb_scale(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation_graph/Makefile.anole b/tests/validation_graph/Makefile.anole new file mode 100644 index 00000000..9d1f5f74 --- /dev/null +++ b/tests/validation_graph/Makefile.anole @@ -0,0 +1,124 @@ +CC = csky-abiv2-linux-gcc +INCLUDE = -I../../include -I../utils +CFLAGS += -O2 -g3 -mhard-float -mcpu=c860 +CFLAGS += -DCSINN_API=5 # params->api = CSINN_API = CSINN_ANOLE = 5 +CFLAGS += -DCSINN_TEST_DTYPE=1 # tensor.dtype = CSINN_TEST_DTYPE = CSINN_DTYPE_UINT8 = 1 + +test_objs = + +test_objs += add.o +test_objs += avgpool.o +test_objs += batch_normalization.o +test_objs += concat.o +test_objs += crop.o +test_objs += depth_to_space.o 
+test_objs 
+= flatten.o +test_objs += global_avgpool.o +test_objs += global_maxpool.o +test_objs += leaky_relu.o +test_objs += lrn.o +test_objs += maximum.o +test_objs += maxpool.o +test_objs += mean.o +test_objs += minimum.o +test_objs += negative.o +test_objs += pad.o +test_objs += prelu.o +test_objs += relu.o +test_objs += relu1.o +test_objs += relu6.o +test_objs += reshape.o +test_objs += resize.o +test_objs += sigmoid.o +test_objs += space_to_depth.o +test_objs += split.o +test_objs += squeeze.o +test_objs += strided_slice.o +test_objs += sub.o +test_objs += tanh.o +test_objs += transpose.o + + +test_objs += ./anole/abs.o +test_objs += ./anole/and.o +test_objs += ./anole/argmax.o +test_objs += ./anole/argmin.o +test_objs += ./anole/batch_to_space.o +test_objs += ./anole/clip.o +test_objs += ./anole/convolution.o +test_objs += ./anole/convolution_relu.o +test_objs += ./anole/convolution_relu6.o +test_objs += ./anole/deconvolution.o +test_objs += ./anole/depthwise_convolution.o +test_objs += ./anole/depthwise_deconvolution.o +test_objs += ./anole/div.o +test_objs += ./anole/elu.o +test_objs += ./anole/equal.o +test_objs += ./anole/exp.o +test_objs += ./anole/expand_dims.o +test_objs += ./anole/floor.o +test_objs += ./anole/floor_divide.o +test_objs += ./anole/fullyconnected.o +test_objs += ./anole/gather.o +test_objs += ./anole/gather_nd.o +test_objs += ./anole/greater.o +test_objs += ./anole/greater_equal.o +test_objs += ./anole/group_convolution.o +test_objs += ./anole/l2_normalization.o +test_objs += ./anole/l2_pool.o +test_objs += ./anole/less.o +test_objs += ./anole/less_equal.o +test_objs += ./anole/log.o +test_objs += ./anole/log_softmax.o +test_objs += ./anole/matmul.o +test_objs += ./anole/max.o +test_objs += ./anole/maxpool2d_locat.o +test_objs += ./anole/min.o +test_objs += ./anole/mul.o +test_objs += ./anole/not_equal.o +test_objs += ./anole/or.o +test_objs += ./anole/pow.o +test_objs += ./anole/prod.o +test_objs += ./anole/psroipooling.o +test_objs += 
./anole/relun.o +test_objs += ./anole/reorg.o +test_objs += ./anole/reverse.o +test_objs += ./anole/roipooling.o +test_objs += ./anole/rsqrt.o +test_objs += ./anole/select.o +test_objs += ./anole/shuffle_channel.o +test_objs += ./anole/slice.o +test_objs += ./anole/sin.o +test_objs += ./anole/softmax.o +test_objs += ./anole/softplus.o +test_objs += ./anole/softrelu.o +test_objs += ./anole/space_to_batch.o +test_objs += ./anole/sqrt.o +test_objs += ./anole/square.o +test_objs += ./anole/stack.o +test_objs += ./anole/sum.o +test_objs += ./anole/tile.o +test_objs += ./anole/topk.o +test_objs += ./anole/unpooling.o +test_objs += ./anole/unstack.o + + +utils_objs = + +utils_objs += ../utils/math_snr.o +utils_objs += ../utils/test_utils.o + +all: csi + +csi: $(utils_objs) $(test_objs) + +$(utils_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + +$(test_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + $(CC) $@ $(CFLAGS) $(BOARD) $(utils_objs) -L../../lib -L../../module/acuity-driver/lib/acuity-ovxlib-dev/lib/ \ + -ljpeg -lpng -lz -lshl_openvx -Wl,-unresolved-symbols=ignore-in-shared-libs -lm -o $@.elf + +clean: + rm -rf $(test_objs) $(utils_objs) *.a *.asm *.elf *.bin *.asm diff --git a/tests/validation_graph/Makefile.pnna b/tests/validation_graph/Makefile.pnna new file mode 100644 index 00000000..e8369cda --- /dev/null +++ b/tests/validation_graph/Makefile.pnna @@ -0,0 +1,74 @@ +CC = riscv64-unknown-linux-gnu-gcc +INCLUDE = -I../../include -I../utils +CFLAGS = -O0 -g3 +CFLAGS += -DCSINN_API=7 # params->api = CSINN_API = CSINN_LIGHT = 7 +CFLAGS += -DCSINN_TEST_DTYPE=2 # tensor.dtype = CSINN_TEST_DTYPE = CSINN_DTYPE_INT8 = 2 + +test_objs = + +test_objs += add.o +test_objs += avgpool.o +test_objs += batch_normalization.o +test_objs += concat.o +test_objs += crop.o +test_objs += depth_to_space.o +test_objs += flatten.o +test_objs += global_avgpool.o +test_objs += global_maxpool.o +test_objs += leaky_relu.o +test_objs += lrn.o +test_objs += maximum.o 
+test_objs += maxpool.o +test_objs += mean.o +test_objs += minimum.o +test_objs += negative.o +test_objs += pad.o +test_objs += prelu.o +test_objs += relu.o +test_objs += relu1.o +test_objs += relu6.o +test_objs += reshape.o +test_objs += resize.o +test_objs += sigmoid.o +test_objs += space_to_depth.o +test_objs += split.o +test_objs += squeeze.o +test_objs += strided_slice.o +test_objs += sub.o +test_objs += tanh.o +test_objs += transpose.o + +test_objs += ./light/argmax.o +test_objs += ./light/batch_to_space_nd.o +test_objs += ./light/convolution.o +test_objs += ./light/deconvolution.o +test_objs += ./light/depthwise_convolution.o +test_objs += ./light/div.o +test_objs += ./light/fullyconnected.o +test_objs += ./light/group_convolution.o +test_objs += ./light/l2_normalization.o +test_objs += ./light/softmax.o +test_objs += ./light/space_to_batch_nd.o + + + +utils_objs = + +utils_objs += ../utils/math_snr.o +utils_objs += ../utils/test_utils.o + + +all: csi + +csi: $(utils_objs) $(test_objs) + +$(utils_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + +$(test_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + $(CC) $@ $(CFLAGS) $(BOARD) $(utils_objs) -L../../lib -L../../module/nna_ddk_install/light/ \ + ../../lib/libshl_pnna.a -limgdnn -lnnasession -lpthread -lssl -lcrypto -latomic -lz -lm -lstdc++ -o $@.elf + +clean: + rm -rf $(test_objs) $(utils_objs) *.a *.asm *.elf *.bin *.asm imgdnn_session_*/ *.o diff --git a/tests/validation_graph/add.c b/tests/validation_graph/add.c index 1d60f93e..fa6fc8c5 100644 --- a/tests/validation_graph/add.c +++ b/tests/validation_graph/add.c @@ -16,51 +16,51 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_add_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_add_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_add(input0, input1, output, params); + csinn_add(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + 
result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_add(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_add(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference); int main(int argc, char **argv) { @@ -69,11 +69,11 @@ int main(int argc, char **argv) int *buffer = read_input_data_f32(argv[1]); int flag = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 0, in1_size = 0, out_size = 0; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = buffer[0]; // batch input0->dim[1] = buffer[1]; // channel input0->dim[2] = buffer[2]; // height @@ -87,7 +87,7 @@ int main(int argc, char **argv) input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; @@ -107,7 +107,7 @@ int main(int argc, char **argv) input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; @@ -121,15 +121,14 @@ int main(int argc, char **argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - 
struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_add(input0, input1, output, &params, difference); + test_add(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/argmax.c b/tests/validation_graph/argmax.c index 495d31e8..f5c92a73 100644 --- a/tests/validation_graph/argmax.c +++ b/tests/validation_graph/argmax.c @@ -16,48 +16,48 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmax(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int axis = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); float min_value, max_value; int in_size = 0, out_size = 0; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); - input->dim[0] = buffer[0]; // batch ???
why must be 1 - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim[0] = buffer[0]; // batch ??? why must be 1 + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 5); - input->data = input_data; + input->data = input_data; get_quant_info(input); input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); - for(int i = 0; i < 4; i++) { - if(i == axis) { + struct csinn_tensor *output = csinn_alloc_tensor(sess); + for (int i = 0; i < 4; i++) { + if (i == axis) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; @@ -71,47 +71,48 @@ int main(int argc, char** argv) get_quant_info(output); /* operator parameter configuration */ - struct reduce_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis_count = 1; // must be 1 for light - params.axis = &axis; - - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis_count = 1; // must be 1 for light + params->axis = &axis; + + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; - if (csi_argmax_init(input, output, &params) != CSINN_TRUE) { + if (csinn_argmax_init(input, output, params) != CSINN_TRUE) { printf("argmax init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0,
input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_argmax(input, output, &params); + csinn_argmax(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 2 ?
atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -124,7 +125,7 @@ int main(int argc, char** argv) free(reference->qinfo); free(reference); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/avgpool.c b/tests/validation_graph/avgpool.c index ac8eb428..d6afbeed 100644 --- a/tests/validation_graph/avgpool.c +++ b/tests/validation_graph/avgpool.c @@ -16,75 +16,75 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_avgpool2d_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_avgpool2d_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_avgpool2d(input, output, params); + csinn_avgpool2d(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_avgpool(struct 
csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference); +void test_avgpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 15); - input->data = input_data; + input->data = input_data; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); - output->dim[0] = input->dim[0]; // batch - output->dim[1] = input->dim[1]; // in_channel - output->dim[2] = buffer[12]; // out_h = (in_h + pad_top + pad_down - kernel_h) / stride_h + 1 - output->dim[3] = buffer[13]; // out_w = (in_w + pad_left + pad_right - kernel_w) / stride_w + 1 + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + output->dim[0] = input->dim[0]; // batch + output->dim[1] = input->dim[1]; // in_channel + output->dim[2] = buffer[12]; // out_h = (in_h + pad_top + pad_down - kernel_h) / stride_h + 1 + output->dim[3] = buffer[13]; // out_w = (in_w + pad_left + pad_right - kernel_w) / stride_w + 
1 output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 15 + in_size); @@ -94,27 +94,23 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pool_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.ceil_mode = 0; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.count_include_pad = 0; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->ceil_mode = 0; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->count_include_pad = 0; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_avgpool(input, output, &params, difference); + test_avgpool(input, output, params, difference); return done_testing(); } - - - diff --git a/tests/validation_graph/batch_normalization.c b/tests/validation_graph/batch_normalization.c index 970faa36..ddcccdd3 100644 --- a/tests/validation_graph/batch_normalization.c +++ b/tests/validation_graph/batch_normalization.c @@ -16,36 +16,35 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int channel_size = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); - input->dim[0] = buffer[1]; // batch - input->dim[1] = buffer[4]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim[0] = buffer[1]; // batch + input->dim[1] = buffer[4]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -55,7 +54,7 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* mean tensor configuration */ - struct csi_tensor *mean = csi_alloc_tensor(sess); + struct csinn_tensor *mean = csinn_alloc_tensor(sess); mean->dim[0] = channel_size; mean->dim_count = 1; mean->name = "mean"; @@ -65,7 +64,7 @@ int main(int argc, char** argv) mean->dtype = CSINN_DTYPE_FLOAT32; /* variance tensor configuration */ - struct csi_tensor *variance = csi_alloc_tensor(sess); + struct csinn_tensor *variance = 
csinn_alloc_tensor(sess); variance->dim[0] = channel_size; variance->dim_count = 1; variance->name = "variance"; @@ -75,7 +74,7 @@ int main(int argc, char** argv) mean->dtype = CSINN_DTYPE_FLOAT32; /* gamma tensor configuration */ - struct csi_tensor *gamma = csi_alloc_tensor(sess); + struct csinn_tensor *gamma = csinn_alloc_tensor(sess); gamma->dim[0] = channel_size; gamma->dim_count = 1; gamma->name = "gamma"; @@ -85,7 +84,7 @@ int main(int argc, char** argv) gamma->dtype = CSINN_DTYPE_FLOAT32; /* beta tensor configuration */ - struct csi_tensor *beta = csi_alloc_tensor(sess); + struct csinn_tensor *beta = csinn_alloc_tensor(sess); beta->dim[0] = channel_size; beta->dim_count = 1; beta->name = "beta"; @@ -95,7 +94,7 @@ int main(int argc, char** argv) beta->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -107,56 +106,56 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct bn_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.epsilon = *((float *)buffer + 5); - - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->epsilon = *((float *)buffer + 5); + + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; - struct csi_tensor *mean_tensor = convert_input(mean, test_dtype); + struct csinn_tensor *mean_tensor = convert_input(mean, test_dtype); mean->dtype = sess->base_dtype; - 
struct csi_tensor *variance_tensor = convert_input(variance, test_dtype); + struct csinn_tensor *variance_tensor = convert_input(variance, test_dtype); variance->dtype = sess->base_dtype; - struct csi_tensor *gamma_tensor = convert_input(gamma, test_dtype); + struct csinn_tensor *gamma_tensor = convert_input(gamma, test_dtype); gamma->dtype = sess->base_dtype; - struct csi_tensor *beta_tensor = convert_input(beta, test_dtype); + struct csinn_tensor *beta_tensor = convert_input(beta, test_dtype); beta->dtype = sess->base_dtype; - if (csi_batch_normalization_init(input, mean, variance, gamma, beta, output, &params) != CSINN_TRUE) { + if (csinn_batch_normalization_init(input, mean, variance, gamma, beta, output, params) != + CSINN_TRUE) { printf("batch normalization init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_batch_normalization(input, mean, variance, gamma, beta, output, &params); + csinn_batch_normalization(input, mean, variance, gamma, beta, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */
float difference = argc > 2 ? atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -169,7 +168,7 @@ int main(int argc, char** argv) free(reference->qinfo); free(reference); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/batch_to_space_nd.c b/tests/validation_graph/batch_to_space_nd.c index dbadb1d0..fae0a6ce 100644 --- a/tests/validation_graph/batch_to_space_nd.c +++ b/tests/validation_graph/batch_to_space_nd.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space_nd(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); float min_value, max_value; int in_size = 1, out_size = 1; int prod_block = 1; int spatial_shape_cnt = buffer[0]; - int remain_shape_cnt = buffer[1]; + int remain_shape_cnt = buffer[1]; int32_t *block_shape = (int32_t *)malloc(spatial_shape_cnt * sizeof(int32_t)); int32_t *crops = (int32_t *)malloc(2 * spatial_shape_cnt * sizeof(int32_t)); - for(int i = 0; i < spatial_shape_cnt; i++) { + for (int i = 0; i < spatial_shape_cnt; i++) { block_shape[i] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i]; crops[2 * i] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i + 1]; crops[2 * i + 1] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i + 2]; @@ -45,17 +45,17 @@ int main(int argc, char** argv) } enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_LIGHT; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); - input->dim_count = 1 + spatial_shape_cnt + remain_shape_cnt; // batch_cnt + spatial_shape_cnt + remain_shape_cnt - for(int i = 0; i < input->dim_count; i++) { + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim_count = 1 + spatial_shape_cnt + + remain_shape_cnt; // batch_cnt + spatial_shape_cnt + 
remain_shape_cnt + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } @@ -66,15 +66,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); - output->dim_count = 1 + spatial_shape_cnt + remain_shape_cnt; // output->dim_cnt = input->dim_cnt - output->dim[0] = input->dim[0] / prod_block; // batch_out + struct csinn_tensor *output = csinn_alloc_tensor(sess); + output->dim_count = + 1 + spatial_shape_cnt + remain_shape_cnt; // output->dim_cnt = input->dim_cnt + output->dim[0] = input->dim[0] / prod_block; // batch_out output->dim[1] = input->dim[1]; - for(int i = 0; i < 2; i++) { - output->dim[2 + i] = input->dim[2 + i] * block_shape[i] - crops[2 * i] - crops[ 2 * i + 1]; + for (int i = 0; i < 2; i++) { + output->dim[2 + i] = input->dim[2 + i] * block_shape[i] - crops[2 * i] - crops[2 * i + 1]; } - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { out_size *= output->dim[i]; } reference->data = (float *)(buffer + 2 + spatial_shape_cnt * 3 + input->dim_count + in_size); @@ -82,48 +83,45 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct batch_to_space_nd_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.block_shape = block_shape; - params.crops = crops; - params.spatial_dim_cnt = spatial_shape_cnt; - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_batch_to_space_nd_params *params = csinn_alloc_params(sizeof(struct csinn_batch_to_space_nd_params), NULL); /* must be allocated before the field writes below */ + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->block_shape = block_shape; + params->crops = crops; + params->spatial_dim_cnt = spatial_shape_cnt; + struct csinn_tensor *input_tensor =
convert_input(input, test_dtype); input->dtype = sess->base_dtype; - if (csi_batch_to_space_nd_init(input, output, &params) != CSINN_TRUE) { + if (csinn_batch_to_space_nd_init(input, output, params) != CSINN_TRUE) { printf("batch_to_space_nd init fail.\n\t"); return -1; } + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); - - csi_batch_to_space_nd(input, output, &params); + csinn_batch_to_space_nd(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* FIX ME */ float difference = argc > 2 ?
atof(argv[2]) : 1e-4; - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); /* free alloced memory */ @@ -137,7 +135,7 @@ int main(int argc, char** argv) free(block_shape); free(crops); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/c906/Makefile b/tests/validation_graph/c906/Makefile index 26598f7e..592ce19f 100644 --- a/tests/validation_graph/c906/Makefile +++ b/tests/validation_graph/c906/Makefile @@ -1,6 +1,6 @@ CC = riscv64-unknown-linux-gnu-gcc INCLUDE = -I../../../include -I../../utils -CFLAGS = -O2 -g3 -march=rv64gcvxthead -mabi=lp64dv -static +CFLAGS = -O2 -g3 -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -static test_objs = @@ -66,8 +66,8 @@ $(utils_objs): %.o: %.c $(test_objs): %.o: %.c $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ $(CC) -c $(CFLAGS) $(INCLUDE) ../$< -o ../$@ - $(CC) $@ ../$@ $(CFLAGS) $(BOARD) $(utils_objs) -L../../../lib/ \ - ../../../lib/libcsi_nn2_c906.a -lpthread -lc -lm -lstdc++ -o $@.elf + $(CC) $@ ../$@ $(CFLAGS) $(BOARD) $(utils_objs) -L../../../riscv_build/ \ + ../../../riscv_build/libshl_c906.a -lpthread -lc -lm -lstdc++ -o $@.elf clean: rm -rf $(test_objs) $(utils_objs) *.a *.asm *.elf *.bin *.asm diff --git a/tests/validation_graph/c906/add.c b/tests/validation_graph/c906/add.c index 9c9ad745..e881ec0d 100644 --- a/tests/validation_graph/c906/add.c +++ b/tests/validation_graph/c906/add.c @@ -16,66 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test add f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput 
= convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test add f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void 
test_add(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_add(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); diff --git a/tests/validation_graph/c906/avgpool.c b/tests/validation_graph/c906/avgpool.c index 9190fa0c..8fcaffc9 100644 --- a/tests/validation_graph/c906/avgpool.c +++ b/tests/validation_graph/c906/avgpool.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test avgpool f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = 
CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test avgpool f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_quant_type = CSINN_QUANT_FLOAT32; sess->base_dtype = CSINN_DTYPE_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_avgpool(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_avgpool(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_pool_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/concat.c b/tests/validation_graph/c906/concat.c index 7673a6e1..9a47518e 100644 --- a/tests/validation_graph/c906/concat.c +++ b/tests/validation_graph/c906/concat.c @@ -16,69 +16,68 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - struct csi_session *sess, struct csi_tensor **real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, struct csinn_session *sess, + struct csinn_tensor **real_input, float *output_data, float diff); -void test_f16(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - float difference) +void test_f16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, float difference) { printf("test concat f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; - struct csi_tensor *qinput[params->inputs_count]; - struct csi_tensor *real_input[params->inputs_count]; + struct csinn_tensor *qinput[params->inputs_count]; + struct csinn_tensor *real_input[params->inputs_count]; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; 
- for(int i = 0; i < params->inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { qinput[i] = convert_f32_input(input[i], test_dtype, sess); real_input[i] = convert_f32_input(input[i], test_dtype, sess); } - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - float difference) +void test_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, float difference) { printf("test concat f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_quant_type = CSINN_QUANT_FLOAT32; sess->base_dtype = CSINN_DTYPE_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; - struct csi_tensor *qinput[params->inputs_count]; - struct csi_tensor *real_input[params->inputs_count]; + struct csinn_tensor *qinput[params->inputs_count]; + struct csinn_tensor *real_input[params->inputs_count]; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - for(int i = 0; i < params->inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { qinput[i] = convert_f32_input(input[i], test_dtype, sess); real_input[i] = convert_f32_input(input[i], test_dtype, sess); } - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - float difference) +void 
test_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/convolution.c b/tests/validation_graph/c906/convolution.c index 040f0637..9b47f8bb 100644 --- a/tests/validation_graph/c906/convolution.c +++ b/tests/validation_graph/c906/convolution.c @@ -16,68 +16,68 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test conv2d f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum 
csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test conv2d f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_quant_type = CSINN_QUANT_FLOAT32; sess->base_dtype = CSINN_DTYPE_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = 
convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, kernel, bias, output, params, difference); test_f32(input, kernel, bias, output, params, difference); } - diff --git a/tests/validation_graph/c906/deconvolution.c b/tests/validation_graph/c906/deconvolution.c index b87aada3..aef8ba2d 100644 --- a/tests/validation_graph/c906/deconvolution.c +++ b/tests/validation_graph/c906/deconvolution.c @@ -16,65 +16,67 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test deconv2d f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = 
convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test deconv2d f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, 
qoutput, params, sess, real_input, output->data, difference); } -void test_deconv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_deconv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, kernel, bias, output, params, difference); test_f32(input, kernel, bias, output, params, difference); } diff --git a/tests/validation_graph/c906/depth_to_space.c b/tests/validation_graph/c906/depth_to_space.c index 63d35028..7fd42c51 100644 --- a/tests/validation_graph/c906/depth_to_space.c +++ b/tests/validation_graph/c906/depth_to_space.c @@ -16,62 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct depth_to_space_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, float difference) { printf("test depth_to_space f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = 
CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, float difference) { printf("test depth_to_space f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); 
op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } - -void test_depth_to_space(struct csi_tensor *input, struct csi_tensor *output, struct depth_to_space_params *params, - float difference) +void test_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, output, params, difference); test_f32(input, output, params, difference); } diff --git a/tests/validation_graph/c906/depthwise_convolution.c b/tests/validation_graph/c906/depthwise_convolution.c index 1302dfce..043e8d9b 100644 --- a/tests/validation_graph/c906/depthwise_convolution.c +++ b/tests/validation_graph/c906/depthwise_convolution.c @@ -16,67 +16,67 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test depthwise conv2d f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = 
csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test depthwise conv2d f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; 
enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_depthwise_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, - struct csi_tensor *bias, struct csi_tensor *output, - struct conv2d_params *params, float difference) +void test_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, kernel, bias, output, params, difference); test_f32(input, kernel, bias, output, params, difference); diff --git a/tests/validation_graph/c906/div.c b/tests/validation_graph/c906/div.c index 1ca03d47..e6dd15d9 100644 --- a/tests/validation_graph/c906/div.c +++ b/tests/validation_graph/c906/div.c @@ -16,66 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test div f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput 
= convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test div f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void 
test_div(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_div(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); diff --git a/tests/validation_graph/c906/flatten.c b/tests/validation_graph/c906/flatten.c index 7587b0f0..603eb70c 100644 --- a/tests/validation_graph/c906/flatten.c +++ b/tests/validation_graph/c906/flatten.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, float difference) { printf("test flatten f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype 
= CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, float difference) { printf("test flatten f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_flatten(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - float difference) +void test_flatten(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_flatten_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, output, params, difference); test_f32(input, output, params, difference); } diff --git a/tests/validation_graph/c906/fullyconnected.c b/tests/validation_graph/c906/fullyconnected.c index 8e02cef2..245e40ee 100644 --- a/tests/validation_graph/c906/fullyconnected.c +++ b/tests/validation_graph/c906/fullyconnected.c @@ -16,67 +16,67 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, float difference) { printf("test fullyconnected f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum 
csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, float difference) { printf("test fullyconnected f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = 
convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_fc(struct csi_tensor *input, struct csi_tensor *weights, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, float difference) +void test_fc(struct csinn_tensor *input, struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, weights, bias, output, params, difference); test_f32(input, weights, bias, output, params, difference); diff --git a/tests/validation_graph/c906/global_avgpool.c b/tests/validation_graph/c906/global_avgpool.c index 336eb093..f754e53e 100644 --- a/tests/validation_graph/c906/global_avgpool.c +++ b/tests/validation_graph/c906/global_avgpool.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test global avgpool f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_pool_params *params, float difference) { printf("test global avgpool f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_global_avgpool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, float difference) +void test_global_avgpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, output, params, difference); test_f32(input, output, params, difference); } diff --git a/tests/validation_graph/c906/global_maxpool.c b/tests/validation_graph/c906/global_maxpool.c index d725f713..5c9e7d84 100644 --- a/tests/validation_graph/c906/global_maxpool.c +++ b/tests/validation_graph/c906/global_maxpool.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test global maxpool f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_pool_params *params, float difference) { printf("test global maxpool f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_global_maxpool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, float difference) +void test_global_maxpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/group_convolution.c b/tests/validation_graph/c906/group_convolution.c index 14561fa0..77004b80 100644 --- a/tests/validation_graph/c906/group_convolution.c +++ b/tests/validation_graph/c906/group_convolution.c @@ -16,67 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test group conv2d f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = 
convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test group conv2d f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, 
qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_group_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, kernel, bias, output, params, difference); test_f32(input, kernel, bias, output, params, difference); } diff --git a/tests/validation_graph/c906/leaky_relu.c b/tests/validation_graph/c906/leaky_relu.c index 74aafa5e..c5519eb2 100644 --- a/tests/validation_graph/c906/leaky_relu.c +++ b/tests/validation_graph/c906/leaky_relu.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test leaky relu f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = 
CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test leaky relu f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, 
difference); } -void test_leaky_relu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params, float difference) +void test_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/maximum.c b/tests/validation_graph/c906/maximum.c index f1dc2e28..8ee5277f 100644 --- a/tests/validation_graph/c906/maximum.c +++ b/tests/validation_graph/c906/maximum.c @@ -16,66 +16,65 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test maximum f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // 
sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test maximum f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct 
csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_maximum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); diff --git a/tests/validation_graph/c906/maxpool.c b/tests/validation_graph/c906/maxpool.c index ff314459..2562ec30 100644 --- a/tests/validation_graph/c906/maxpool.c +++ b/tests/validation_graph/c906/maxpool.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test maxpool f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, 
+ struct csinn_pool_params *params, float difference) { printf("test maxpool f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_maxpool(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_maxpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/minimum.c b/tests/validation_graph/c906/minimum.c index 087bb8f2..a516126d 100644 --- a/tests/validation_graph/c906/minimum.c +++ b/tests/validation_graph/c906/minimum.c @@ -16,66 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test minimum f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + 
struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test minimum f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, 
difference); } -void test_minimum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); diff --git a/tests/validation_graph/c906/pad.c b/tests/validation_graph/c906/pad.c index b46011aa..b0976c60 100644 --- a/tests/validation_graph/c906/pad.c +++ b/tests/validation_graph/c906/pad.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, float difference) { printf("test pad f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = 
CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, float difference) { printf("test pad f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_pad(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - float difference) +void test_pad(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_pad_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/relu.c b/tests/validation_graph/c906/relu.c index 59c8c71d..ba1613f7 100644 --- a/tests/validation_graph/c906/relu.c +++ b/tests/validation_graph/c906/relu.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor 
*qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_relu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git 
a/tests/validation_graph/c906/relu1.c b/tests/validation_graph/c906/relu1.c index 3b8165ec..27f68150 100644 --- a/tests/validation_graph/c906/relu1.c +++ b/tests/validation_graph/c906/relu1.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu1 f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, 
params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu1 f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_relu1(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/relu6.c b/tests/validation_graph/c906/relu6.c index d14f0653..0da4ba93 100644 --- a/tests/validation_graph/c906/relu6.c +++ b/tests/validation_graph/c906/relu6.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu6 f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_relu_params *params, float difference) { printf("test relu6 f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_relu6(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/reshape.c b/tests/validation_graph/c906/reshape.c index 632916ba..bf85da5b 100644 --- a/tests/validation_graph/c906/reshape.c +++ b/tests/validation_graph/c906/reshape.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, float difference) { printf("test reshape f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_reshape_params *params, float difference) { printf("test reshape f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_reshape(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params, float difference) +void test_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/resize_bilinear.c b/tests/validation_graph/c906/resize_bilinear.c index e151a4cc..1745dd31 100644 --- a/tests/validation_graph/c906/resize_bilinear.c +++ b/tests/validation_graph/c906/resize_bilinear.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference) { printf("test resize f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_resize_params *params, float difference) { printf("test resize f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; /* CSINN_RESIZE_BILINEAR */ printf("test CSINN_RESIZE_BILINEAR \n"); diff --git a/tests/validation_graph/c906/resize_nearest_neighbor.c b/tests/validation_graph/c906/resize_nearest_neighbor.c index 4b51edae..1dd1edab 100644 --- a/tests/validation_graph/c906/resize_nearest_neighbor.c +++ b/tests/validation_graph/c906/resize_nearest_neighbor.c @@ -16,62 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference) { printf("test resize f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_resize_params *params, float difference) { printf("test resize f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; printf("test CSINN_RESIZE_NEAREST_NEIGHBOR \n"); test_f16(input, output, params, difference); diff --git a/tests/validation_graph/c906/sigmoid.c b/tests/validation_graph/c906/sigmoid.c index 91d85342..75714cb6 100644 --- a/tests/validation_graph/c906/sigmoid.c +++ b/tests/validation_graph/c906/sigmoid.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, float difference) { printf("test sigmoid f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_sigmoid_params *params, float difference) { printf("test sigmoid f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params, float difference) +void test_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/space_to_depth.c b/tests/validation_graph/c906/space_to_depth.c index 9bfe7e1b..b6c9c44f 100644 --- a/tests/validation_graph/c906/space_to_depth.c +++ b/tests/validation_graph/c906/space_to_depth.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct space_to_depth_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, float difference) { printf("test space_to_depth f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, float difference) +void 
test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, float difference) { printf("test space_to_depth f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_space_to_depth(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, float difference) +void test_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/split.c b/tests/validation_graph/c906/split.c index 125b1d88..467b49e1 100644 --- a/tests/validation_graph/c906/split.c +++ b/tests/validation_graph/c906/split.c @@ -16,73 +16,72 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor **output, struct split_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float **output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float **output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor **output, struct split_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, float difference) { printf("test transpose f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; int output_cnt = params->output_num; float *output_data[output_cnt]; - struct csi_tensor *qoutput[output_cnt]; + struct csinn_tensor *qoutput[output_cnt]; for (int i = 0; i < output_cnt; i++) { output_data[i] = output[i]->data; qoutput[i] = convert_f32_input(output[i], test_dtype, sess); } - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output_data, difference); } -void test_f32(struct csi_tensor 
*input, struct csi_tensor **output, struct split_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, float difference) { printf("test transpose f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; int output_cnt = params->output_num; float *output_data[output_cnt]; - struct csi_tensor *qoutput[output_cnt]; + struct csinn_tensor *qoutput[output_cnt]; for (int i = 0; i < output_cnt; i++) { output_data[i] = output[i]->data; qoutput[i] = convert_f32_input(output[i], test_dtype, sess); } - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output_data, difference); } -void test_split(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params, float difference) +void test_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/squeeze.c b/tests/validation_graph/c906/squeeze.c index efcaad66..c283876e 100644 --- a/tests/validation_graph/c906/squeeze.c +++ b/tests/validation_graph/c906/squeeze.c @@ 
-16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, float difference) { printf("test squeeze f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params, - float difference) +void 
test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, float difference) { printf("test squeeze f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params, float difference) +void test_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/sub.c b/tests/validation_graph/c906/sub.c index 50aadf4a..c485df0b 100644 --- a/tests/validation_graph/c906/sub.c +++ b/tests/validation_graph/c906/sub.c @@ -16,66 +16,66 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test sub f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + 
struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test sub f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, 
difference); } -void test_sub(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); } diff --git a/tests/validation_graph/c906/tanh.c b/tests/validation_graph/c906/tanh.c index 7bddbef1..2a6f77f1 100644 --- a/tests/validation_graph/c906/tanh.c +++ b/tests/validation_graph/c906/tanh.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float difference) { printf("test tanh f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = 
CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float difference) { printf("test tanh f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_tanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - float difference) +void test_tanh(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_siso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/transpose.c b/tests/validation_graph/c906/transpose.c index 12d954de..3793bf59 100644 --- a/tests/validation_graph/c906/transpose.c +++ b/tests/validation_graph/c906/transpose.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct transpose_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct transpose_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, float difference) { printf("test transpose f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = 
convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct transpose_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, float difference) { printf("test transpose f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params, float difference) +void test_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, 
params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/concat.c b/tests/validation_graph/concat.c index 392cd17a..0f20930f 100644 --- a/tests/validation_graph/concat.c +++ b/tests/validation_graph/concat.c @@ -16,51 +16,51 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - struct csi_session *sess, struct csi_tensor **real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, struct csinn_session *sess, + struct csinn_tensor **real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(params->inputs_count, sess); - csi_set_output_number(1, sess); - csi_concat_init(input, output, params); - - for(int i = 0; i < params->inputs_count; i++) { - csi_set_tensor_entry(input[i], sess); - csi_set_input(i, input[i], sess); + csinn_session_init(sess); + csinn_set_input_number(params->inputs_count, sess); + csinn_set_output_number(1, sess); + csinn_concat_init(input, output, params); + + for (int i = 0; i < params->inputs_count; i++) { + csinn_set_tensor_entry(input[i], sess); + csinn_set_input(i, input[i], sess); } - csi_concat(input, output, params); + csinn_concat(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - for(int i = 0; i < params->inputs_count; i++) { - csi_update_input(i, real_input[i], sess); + for (int i = 0; i < params->inputs_count; i++) { + csinn_update_input(i, real_input[i], sess); } - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = 
csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input[0]->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input[0]->data, diff, csinn_tensor_size(output), false); // free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - float difference); +void test_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of concat(graph).\n"); @@ -68,20 +68,20 @@ int main(int argc, char** argv) int input_cnt = buffer[4]; int axis = buffer[5]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 1; /* input tensor configuration */ - struct csi_tensor *input[input_cnt]; + struct csinn_tensor *input[input_cnt]; float *input_data[input_cnt]; void **src_tmp = malloc(input_cnt * sizeof(void *)); char input_name[input_cnt][10]; - for(int i = 0; i < input_cnt; i++) { - input[i] = csi_alloc_tensor(NULL); - input[i]->dim[0] = buffer[0]; // batch - input[i]->dim[1] = buffer[1]; // in_channel - input[i]->dim[2] = buffer[2]; // height - input[i]->dim[3] = buffer[3]; // width + for (int i = 0; i < input_cnt; i++) { + input[i] = csinn_alloc_tensor(NULL); + input[i]->dim[0] = buffer[0]; // batch + input[i]->dim[1] = buffer[1]; // in_channel + input[i]->dim[2] = buffer[2]; // height + input[i]->dim[3] = buffer[3]; // width input[i]->dim_count = 4; in_size = input[i]->dim[0] * input[i]->dim[1] * input[i]->dim[2] * 
input[i]->dim[3]; @@ -92,9 +92,9 @@ int main(int argc, char** argv) } /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); - for(int i = 0; i < 4; i++) { - if(i == axis) { + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + for (int i = 0; i < 4; i++) { + if (i == axis) { output->dim[i] = input_cnt * buffer[i]; } else { output->dim[i] = buffer[i]; @@ -110,16 +110,16 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct concat_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; - params.inputs_count = input_cnt; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->inputs_count = input_cnt; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_concat(input, output, ¶ms, difference); + test_concat(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/convolution.c b/tests/validation_graph/convolution.c index 7b6c57f6..223770a9 100644 --- a/tests/validation_graph/convolution.c +++ b/tests/validation_graph/convolution.c @@ -16,45 +16,46 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_conv2d_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_conv2d_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_conv2d(input, output, kernel, bias, params); + csinn_conv2d(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + 
shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference); +void test_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference); int main(int argc, char **argv) { @@ -62,11 +63,11 @@ int main(int argc, char **argv) int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = buffer[0]; // batch input->dim[1] = buffer[1]; // in_channel input->dim[2] = buffer[2]; // height @@ -80,7 +81,7 @@ int main(int argc, char **argv) input->layout = CSINN_LAYOUT_NCHW; /* kernel tensor configuration */ - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = buffer[12]; kernel->dim[1] = buffer[1]; kernel->dim[2] = buffer[6]; @@ -95,7 +96,7 @@ int main(int argc, char **argv) kernel->layout = CSINN_LAYOUT_OIHW; /* bias tensor configuratioin */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = buffer[12]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -107,7 +108,7 @@ int main(int argc, char **argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = buffer[0]; // batch output->dim[1] = buffer[12]; // out_channel output->dim[2] = buffer[16]; // height 
@@ -121,25 +122,25 @@ int main(int argc, char **argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct conv2d_params params; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.group = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.base.name = "params"; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->group = 1; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.name = "params"; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_conv2d(input, kernel, bias, output, &params, difference); + test_conv2d(input, kernel, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/crop.c b/tests/validation_graph/crop.c index da67da9f..e25faa44 100644 --- a/tests/validation_graph/crop.c +++ b/tests/validation_graph/crop.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of crop(graph).\n"); @@ -30,26 +30,25 @@ int main(int argc, char** argv) int in_out_dim = buffer[0]; int *begin = (int *)malloc(in_out_dim * sizeof(int)); int *end = (int *)malloc(in_out_dim * sizeof(int)); - for(int i = 0; i < in_out_dim; i++) { + for (int i = 0; i < in_out_dim; i++) { begin[i] = buffer[2 + in_out_dim + 3 * i]; end[i] = buffer[2 + in_out_dim + 3 * i + 1]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 1; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); input->dim_count = in_out_dim; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -60,10 +59,10 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim_count = in_out_dim; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = end[i] - begin[i]; // end[i] - begin[i] ( stride[i] = 1 ) + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = end[i] - begin[i]; // 
end[i] - begin[i] ( stride[i] = 1 ) out_size *= output->dim[i]; } // out_size = buffer[2 + 4 * input->dim_count]; @@ -72,62 +71,59 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct crop_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = buffer[1 + input->dim_count]; - params.offset_num = input->dim_count - params.axis; - - int32_t *offset = (int32_t *)malloc((params.offset_num) * sizeof(int32_t)); - for(int i = 0; i < params.offset_num; i++) { - offset[i] = begin[i + params.axis]; + struct csinn_crop_params *params; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = buffer[1 + input->dim_count]; + params->offset_num = input->dim_count - params->axis; + + int32_t *offset = (int32_t *)malloc((params->offset_num) * sizeof(int32_t)); + for (int i = 0; i < params->offset_num; i++) { + offset[i] = begin[i + params->axis]; } - params.offset = offset; - + params->offset = offset; - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; /* light: 1. cropping on the batch axis is not supported. -->> axis >= 1 2. 
input->dim_count <= 4 */ - if (csi_crop_init(input, output, &params) != CSINN_TRUE) { + if (csinn_crop_init(input, output, params) != CSINN_TRUE) { printf("crop init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); - - csi_crop(input, output, &params); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_crop(input, output, params); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -143,7 +139,7 @@ int main(int argc, char** argv) free(end); free(offset); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/deconvolution.c b/tests/validation_graph/deconvolution.c index e72f59f6..a68927a0 100644 --- a/tests/validation_graph/deconvolution.c +++ b/tests/validation_graph/deconvolution.c @@ -16,63 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_deconv2d_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_deconv2d_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_deconv2d(input, output, kernel, bias, params); + csinn_deconv2d(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - 
csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_deconv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference); +void test_deconv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconv2d(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 17); input->data = input_data; @@ -80,14 +82,14 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* kernel tensor configuration */ - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] 
= buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w kernel->dim_count = 4; kernel->layout = CSINN_LAYOUT_OIHW; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; kernel->name = "kernel"; float *kernel_data = (float *)(buffer + 17 + in_size); kernel->data = kernel_data; @@ -96,7 +98,7 @@ int main(int argc, char** argv) kernel->layout = CSINN_LAYOUT_OIHW; /* bias tensor configuratioin */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = buffer[14]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -108,11 +110,11 @@ int main(int argc, char** argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 17 + in_size + weight_size + bias->dim[0]); @@ -122,22 +124,22 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct conv2d_params params; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.group = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = 
CSINN_RM_NPU_GRAPH; - params.base.name = "params"; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->group = 1; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.name = "params"; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_deconv2d(input, kernel, bias, output, &params, difference); + test_deconv2d(input, kernel, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/depth_to_space.c b/tests/validation_graph/depth_to_space.c index 7059a8fe..b5f22c6d 100644 --- a/tests/validation_graph/depth_to_space.c +++ b/tests/validation_graph/depth_to_space.c @@ -16,62 +16,62 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct depth_to_space_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_depth_to_space_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_depth_to_space_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_depth_to_space(input, output, params); + csinn_depth_to_space(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + 
csinn_free_session(sess); } -void test_depth_to_space(struct csi_tensor *input, struct csi_tensor *output, struct depth_to_space_params *params, - float difference); +void test_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int block_size = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -81,7 +81,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1] / (block_size * block_size); output->dim[2] = input->dim[2] * block_size; @@ -95,14 +95,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct depth_to_space_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.block_size = block_size; + struct csinn_depth_to_space_params *params = + 
csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->block_size = block_size; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_depth_to_space(input, output, &params, difference); + test_depth_to_space(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/depthwise_convolution.c b/tests/validation_graph/depthwise_convolution.c index 8472f6d7..93fad010 100644 --- a/tests/validation_graph/depthwise_convolution.c +++ b/tests/validation_graph/depthwise_convolution.c @@ -16,46 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_conv2d_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_conv2d_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_conv2d(input, output, kernel, bias, params); + csinn_conv2d(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - 
csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_depthwise_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, - struct csi_tensor *bias, struct csi_tensor *output, - struct conv2d_params *params, float difference); +void test_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference); int main(int argc, char **argv) { @@ -64,11 +65,11 @@ int main(int argc, char **argv) int *buffer = read_input_data_f32(argv[1]); int group = buffer[1]; // group = in_channel - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = buffer[0]; // batch input->dim[1] = buffer[1]; // in_channel input->dim[2] = buffer[2]; // height @@ -83,7 +84,7 @@ int main(int argc, char **argv) input->layout = CSINN_LAYOUT_NCHW; /* kernel tensor configuration */ - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); 
kernel->dim[0] = buffer[1]; // i kernel->dim[1] = buffer[12] / group; // o kernel->dim[2] = buffer[6]; // h @@ -98,7 +99,7 @@ int main(int argc, char **argv) kernel->layout = CSINN_LAYOUT_OIHW; /* bias tensor configuratioin */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = buffer[12]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -110,7 +111,7 @@ int main(int argc, char **argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = buffer[0]; // batch output->dim[1] = buffer[12]; // out_channel output->dim[2] = buffer[15]; // height @@ -124,24 +125,24 @@ int main(int argc, char **argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct conv2d_params params; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.group = group; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.base.name = "params"; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->group = group; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.name = "params"; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; float 
difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_depthwise_conv2d(input, kernel, bias, output, &params, difference); + test_depthwise_conv2d(input, kernel, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/div.c b/tests/validation_graph/div.c index a88396df..3b32817d 100644 --- a/tests/validation_graph/div.c +++ b/tests/validation_graph/div.c @@ -16,51 +16,51 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_div_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_div_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_div(input0, input1, output, params); + csinn_div(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - 
csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_div(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_div(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference); int main(int argc, char **argv) { @@ -68,11 +68,11 @@ int main(int argc, char **argv) int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 0, in1_size = 0, out_size = 0; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = buffer[0]; // batch input0->dim[1] = buffer[1]; // channel input0->dim[2] = buffer[2]; // height @@ -86,7 +86,7 @@ int main(int argc, char **argv) input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = buffer[0]; // batch input1->dim[1] = buffer[1]; // channel input1->dim[2] = buffer[2]; // 
height @@ -100,7 +100,7 @@ int main(int argc, char **argv) input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; @@ -114,13 +114,12 @@ int main(int argc, char **argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_div(input0, input1, output, &params, difference); + test_div(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/flatten.c b/tests/validation_graph/flatten.c index fd640534..54660958 100644 --- a/tests/validation_graph/flatten.c +++ b/tests/validation_graph/flatten.c @@ -16,59 +16,59 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_flatten_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_flatten_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_flatten(input, output, params); + csinn_flatten(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_flatten(struct csi_tensor 
*input, struct csi_tensor *output, struct flatten_params *params, - float difference); +void test_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int input_dims = buffer[0]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - for(int i = 0; i < input_dims; i++) { + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + for (int i = 0; i < input_dims; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = in_size; output->dim_count = 1; out_size = in_size; @@ -91,13 +91,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct flatten_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_flatten(input, output, &params, difference); + test_flatten(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/fullyconnected.c b/tests/validation_graph/fullyconnected.c index 8ba7d304..6bbbb4e2 100644 --- a/tests/validation_graph/fullyconnected.c +++ b/tests/validation_graph/fullyconnected.c @@ -16,59 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_fullyconnected_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_fullyconnected_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_fullyconnected(input, output, kernel, bias, params); + csinn_fullyconnected(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); 
- struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_fc(struct csi_tensor *input, struct csi_tensor *weights, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, float difference); +void test_fc(struct csinn_tensor *input, struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, weights_size = 0, bias_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_nodes + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_nodes input->dim_count = 2; in_size = input->dim[0] * input->dim[1]; input->name = "input"; @@ -78,9 +79,9 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* weight tensor configuration */ - struct csi_tensor *weights = csi_alloc_tensor(NULL); - weights->dim[0] = buffer[2]; // out_nodes - weights->dim[1] = buffer[1]; // in_nodes + struct csinn_tensor *weights = csinn_alloc_tensor(NULL); + weights->dim[0] = 
buffer[2]; // out_nodes + weights->dim[1] = buffer[1]; // in_nodes weights->dim_count = 2; weights_size = weights->dim[0] * weights->dim[1]; weights->name = "weights"; @@ -90,10 +91,9 @@ int main(int argc, char** argv) weights->dtype = CSINN_DTYPE_FLOAT32; weights->layout = CSINN_LAYOUT_OIHW; - /* bias tensor configuration */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = buffer[2]; // out_nodes + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = buffer[2]; // out_nodes bias->dim_count = 1; bias_size = bias->dim[0]; bias->name = "bias"; @@ -104,9 +104,9 @@ int main(int argc, char** argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[2]; // out_nodes + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[2]; // out_nodes output->dim_count = 2; out_size = output->dim[0] * output->dim[1]; reference->data = (float *)(buffer + 3 + in_size + weights_size + bias_size); @@ -116,15 +116,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct fc_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.units = buffer[2]; // out_nodes + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->units = buffer[2]; // out_nodes /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_fc(input, weights, bias, output, &params, difference); + test_fc(input, weights, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/global_avgpool.c b/tests/validation_graph/global_avgpool.c index a14a31b3..2144f2dd 100644 --- a/tests/validation_graph/global_avgpool.c +++ b/tests/validation_graph/global_avgpool.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_global_avgpool2d_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_global_avgpool2d_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_global_avgpool2d(input, output, params); + csinn_global_avgpool2d(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, 
diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_global_avgpool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, float difference); +void test_global_avgpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global_avgpool(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,11 +80,11 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; - output->dim[2] = buffer[4]; // 1 - output->dim[3] = buffer[5]; // 1 + output->dim[2] = buffer[4]; // 1 + 
output->dim[3] = buffer[5]; // 1 output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 6 + in_size); @@ -94,15 +94,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pool_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.count_include_pad = 0; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->count_include_pad = 0; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_global_avgpool(input, output, &params, difference); + test_global_avgpool(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/global_maxpool.c b/tests/validation_graph/global_maxpool.c index 834fc5ee..65e9e444 100644 --- a/tests/validation_graph/global_maxpool.c +++ b/tests/validation_graph/global_maxpool.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_global_maxpool2d_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_global_maxpool2d_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_global_maxpool2d(input, output, params); + csinn_global_maxpool2d(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void 
test_global_maxpool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, float difference); +void test_global_maxpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global_maxpool2d(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,11 +80,11 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; - output->dim[2] = buffer[4]; // 1 - output->dim[3] = buffer[5]; // 1 + output->dim[2] = buffer[4]; // 1 + output->dim[3] = buffer[5]; // 1 output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 6 + in_size); @@ -94,14 +94,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pool_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode 
= CSINN_RM_NPU_GRAPH; - params.count_include_pad = 0; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->count_include_pad = 0; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_global_maxpool(input, output, &params, difference); + test_global_maxpool(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/group_convolution.c b/tests/validation_graph/group_convolution.c index 1bbed89b..a58d5359 100644 --- a/tests/validation_graph/group_convolution.c +++ b/tests/validation_graph/group_convolution.c @@ -16,58 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_conv2d_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_conv2d_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_conv2d(input, output, kernel, bias, params); + csinn_conv2d(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - 
csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_group_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference); +void test_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group conv2d(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int group = buffer[17]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = buffer[0]; // batch input->dim[1] = buffer[1]; // in_channel input->dim[2] = buffer[2]; // height @@ -81,7 +83,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* kernel tensor configuration */ - 
struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = buffer[12]; // o kernel->dim[1] = buffer[1] / group; // i kernel->dim[2] = buffer[6]; // h @@ -96,7 +98,7 @@ int main(int argc, char** argv) kernel->layout = CSINN_LAYOUT_OIHW; /* bias tensor configuratioin */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = buffer[12]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -108,7 +110,7 @@ int main(int argc, char** argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = buffer[0]; // batch output->dim[1] = buffer[12]; // out_channel output->dim[2] = buffer[16]; // height @@ -122,24 +124,24 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct conv2d_params params; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.group = group; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.base.name = "params"; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->group = group; + params->base.layout = CSINN_LAYOUT_NCHW; + 
params->base.name = "params"; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_group_conv2d(input, kernel, bias, output, &params, difference); + test_group_conv2d(input, kernel, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/l2_normalization.c b/tests/validation_graph/l2_normalization.c index 97c5786f..f25077f0 100644 --- a/tests/validation_graph/l2_normalization.c +++ b/tests/validation_graph/l2_normalization.c @@ -16,35 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); - input->dim[0] = buffer[2]; // batch - input->dim[1] = buffer[3]; // channel - input->dim[2] = buffer[4]; // height - input->dim[3] = buffer[5]; // width + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim[0] = buffer[2]; // batch + input->dim[1] = buffer[3]; // channel + input->dim[2] = buffer[4]; // height + input->dim[3] = buffer[5]; // width input->dim_count = 4; in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -54,7 +53,7 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -66,52 +65,60 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct l2n_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; - params.epsilon = *((float *)buffer + 1); + params->axis = axis; + params->n = 1; + params->epsilon = *((float *)buffer + 1); - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; - - if (csi_l2_normalization_init(input, output, &params) != CSINN_TRUE) { + /* + light: + software layer, across_spatial = true channel_shared_ = true (scale = 1.0f). + in fact, axis = (1, 2, 3) because across_spatial = true. + it means normalize with (channel * height * width), so params axis and epsilon are invalid + by test: axis can be (1) (2) (3) or (1,2) (2,3) + can not be (0) (4) (1,2,3) .... 
+ anole: + l2_norm compute init set axis = 2 (channel axis), so axis would be ignored here + */ + if (csinn_l2_normalization_init(input, output, params) != CSINN_TRUE) { printf("l2 normalization init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_l2_normalization(input, output, &params); + csinn_l2_normalization(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -124,7 +131,7 @@ int main(int argc, char** argv) free(reference->qinfo); free(reference); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/leaky_relu.c b/tests/validation_graph/leaky_relu.c index 38b0d469..a0a279b2 100644 --- a/tests/validation_graph/leaky_relu.c +++ b/tests/validation_graph/leaky_relu.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_leaky_relu_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_leaky_relu_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_leaky_relu(input, output, params); + csinn_leaky_relu(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_leaky_relu(struct 
csi_tensor *input, struct csi_tensor *output, - struct relu_params *params, float difference); +void test_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -93,18 +93,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->dtype = CSINN_DTYPE_FLOAT32; - /* operator parameter configuration */ - struct relu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.n = *((float *)buffer + 4); // alpha + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->n = 
*((float *)buffer + 4); // alpha /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_leaky_relu(input, output, &params, difference); + test_leaky_relu(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/lrn.c b/tests/validation_graph/lrn.c index af46f035..3e1b6a84 100644 --- a/tests/validation_graph/lrn.c +++ b/tests/validation_graph/lrn.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_lrn_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_lrn_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_lrn(input, output, params); + csinn_lrn(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + 
struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_lrn(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params, - float difference); +void test_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,19 +94,18 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct 
lrn_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.range = buffer[4] * 2 + 1; // size = 2 * depth_radius + 1 - params.bias = *((float *)buffer + 5); - params.alpha = *((float *)buffer + 6); - params.beta = *((float *)buffer + 7); - params.norm_region = CSINN_LRN_ACROSS_CHANNELS; + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->range = buffer[4] * 2 + 1; // size = 2 * depth_radius + 1 + params->bias = *((float *)buffer + 5); + params->alpha = *((float *)buffer + 6); + params->beta = *((float *)buffer + 7); + params->norm_region = CSINN_LRN_ACROSS_CHANNELS; // FIXME: only anole support lrn mode /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_lrn(input, output, &params, difference); + test_lrn(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/maximum.c b/tests/validation_graph/maximum.c index dc83520b..0406738d 100644 --- a/tests/validation_graph/maximum.c +++ b/tests/validation_graph/maximum.c @@ -16,65 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_maximum_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_maximum_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_maximum(input0, input1, output, params); + csinn_maximum(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_maximum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 1, in1_size = 1, out_size = 1; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim_count = buffer[0]; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; in0_size *= input0->dim[i]; } @@ -84,9 +84,9 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim_count = input0->dim_count; - for(int i = 0; i < input1->dim_count; i++) { + for (int i = 0; i < input1->dim_count; i++) { input1->dim[i] = input0->dim[i]; in1_size *= input1->dim[i]; } @@ -96,10 +96,12 @@ int main(int argc, char** argv) input1->dtype = CSINN_DTYPE_FLOAT32; input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ 
- struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim_count = input0->dim_count; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = csi_ref_max_internal_s32(input0->dim[i], input1->dim[i]); // in fact, ouput->dim[i] are always equal to input0->dim[i] + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = shl_ref_max_internal_s32( + input0->dim[i], + input1->dim[i]); // in fact, ouput->dim[i] are always equal to input0->dim[i] out_size *= output->dim[i]; } reference->data = (float *)(buffer + 1 + input0->dim_count + in0_size + in1_size); @@ -109,15 +111,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_maximum(input0, input1, output, &params, difference); + test_maximum(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/maxpool.c b/tests/validation_graph/maxpool.c index d228b980..070150aa 100644 --- a/tests/validation_graph/maxpool.c +++ b/tests/validation_graph/maxpool.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_maxpool2d_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_maxpool2d_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_maxpool2d(input, output, params); + csinn_maxpool2d(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_maxpool(struct 
csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference); +void test_maxpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool2d(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,11 +80,11 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; - output->dim[2] = buffer[12]; // out_h = (in_h + pad_top + pad_down - kernel_h) / stride_h + 1 - output->dim[3] = buffer[13]; // out_w = (in_w + pad_left + pad_right - kernel_w) / stride_w + 1 + output->dim[2] = buffer[12]; // out_h = (in_h + pad_top + pad_down - kernel_h) / stride_h + 1 + output->dim[3] = buffer[13]; // out_w = (in_w + pad_left + pad_right - kernel_w) / stride_w + 1 output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 14 + in_size); @@ -94,23 +94,22 @@ int main(int argc, 
char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pool_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.ceil_mode = 0; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.count_include_pad = 0; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->ceil_mode = 0; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->count_include_pad = 0; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_maxpool(input, output, &params, difference); + test_maxpool(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/mean.c b/tests/validation_graph/mean.c index 9e6f5c15..b53755b8 100644 --- a/tests/validation_graph/mean.c +++ b/tests/validation_graph/mean.c @@ -16,57 +16,57 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_mean_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_mean_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_mean(input, output, params); + csinn_mean(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_mean(struct csi_tensor *input, struct 
csi_tensor *output, struct reduce_params *params, - float difference); +void test_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, float difference); bool find_axis(int *axis, int axis_cnt, int index) { - for(int i = 0; i < axis_cnt; i++) { - if(axis[i] == index) { + for (int i = 0; i < axis_cnt; i++) { + if (axis[i] == index) { return true; } } return false; } -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mean(graph).\n"); @@ -74,19 +74,19 @@ int main(int argc, char** argv) bool keep_dim = buffer[4]; int axis_count = buffer[5]; int *axis = (int *)malloc(axis_count * sizeof(int)); - for(int i = 0; i < axis_count; i++) { + for (int i = 0; i < axis_count; i++) { axis[i] = buffer[6 + i]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 1; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -96,15 +96,15 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - if(keep_dim) { + if (keep_dim) { output->dim_count = input->dim_count; - output->dim[0] = input->dim[0]; // can not reduce on batch and channel axis + output->dim[0] = 
input->dim[0]; // can not reduce on batch and channel axis output->dim[1] = input->dim[1]; - for(int i = 2; i < output->dim_count; i++) { - if(find_axis(axis, axis_count, i) == true) { + for (int i = 2; i < output->dim_count; i++) { + if (find_axis(axis, axis_count, i) == true) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; @@ -112,17 +112,17 @@ int main(int argc, char** argv) } } else { output->dim_count = input->dim_count - axis_count; - output->dim[0] = input->dim[0]; // can not reduce on batch and channel axis + output->dim[0] = input->dim[0]; // can not reduce on batch and channel axis output->dim[1] = input->dim[1]; int j = 2; - for(int i = 2; i < input->dim_count; i++) { - if(find_axis(axis, axis_count, i) == false) { + for (int i = 2; i < input->dim_count; i++) { + if (find_axis(axis, axis_count, i) == false) { output->dim[j] = input->dim[i]; j++; } } } - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { out_size *= output->dim[i]; } reference->data = (float *)(buffer + 6 + axis_count + in_size); @@ -132,18 +132,18 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct reduce_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; - params.axis_count = axis_count; - params.keepdims = keep_dim; + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->axis_count = axis_count; + params->keepdims = keep_dim; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_mean(input, output, &params, difference); + test_mean(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/minimum.c b/tests/validation_graph/minimum.c index 85bbcba5..24d8f8a0 100644 --- a/tests/validation_graph/minimum.c +++ b/tests/validation_graph/minimum.c @@ -16,65 +16,65 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_minimum_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_minimum_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_minimum(input0, input1, output, params); + csinn_minimum(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + 
csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_minimum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 1, in1_size = 1, out_size = 1; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim_count = buffer[0]; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; in0_size *= input0->dim[i]; } @@ -85,9 +85,9 @@ int main(int argc, char** argv) input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); 
input1->dim_count = input0->dim_count; - for(int i = 0; i < input1->dim_count; i++) { + for (int i = 0; i < input1->dim_count; i++) { input1->dim[i] = input0->dim[i]; in1_size *= input1->dim[i]; } @@ -98,10 +98,12 @@ int main(int argc, char** argv) input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim_count = input0->dim_count; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = csi_ref_max_internal_s32(input0->dim[i], input1->dim[i]); // in fact, ouput->dim[i] are always equal to input0->dim[i] + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = shl_ref_max_internal_s32( + input0->dim[i], + input1->dim[i]); // in fact, ouput->dim[i] are always equal to input0->dim[i] out_size *= output->dim[i]; } reference->data = (float *)(buffer + 1 + input0->dim_count + in0_size + in1_size); @@ -111,14 +113,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_minimum(input0, input1, output, &params, difference); + test_minimum(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/mul.c b/tests/validation_graph/mul.c index e4913c2b..8824e087 100644 --- a/tests/validation_graph/mul.c +++ b/tests/validation_graph/mul.c @@ -16,79 +16,79 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, - float *output_data, float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_mul_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_mul_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_mul(input0, input1, output, params); + csinn_mul(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, - csi_tensor_size(output), false); + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), + false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_mul(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mul(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; + int flag = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 0, in1_size = 0, out_size = 0; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->name = "input0"; float *input0_data = (float *)(buffer + 5); - input0->data = input0_data; + input0->data = input0_data; input0->dtype = CSINN_DTYPE_FLOAT32; input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - if(flag) { + struct csinn_tensor *input1 = 
csinn_alloc_tensor(NULL); + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in1_size = input1->dim[0]; @@ -102,34 +102,33 @@ int main(int argc, char** argv) } input1->name = "input1"; float *input1_data = (float *)(buffer + 5 + in0_size); - input1->data = input1_data; + input1->data = input1_data; input1->dtype = CSINN_DTYPE_FLOAT32; input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; output->dim[3] = input0->dim[3]; output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - reference->data= (float *)(buffer + 5 + in0_size + in1_size); + reference->data = (float *)(buffer + 5 + in0_size + in1_size); output->data = reference->data; output->name = "output"; output->layout = CSINN_LAYOUT_NCHW; output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_mul(input0, input1, output, &params, difference); + test_mul(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/negative.c b/tests/validation_graph/negative.c index 0b477fb1..36d553a1 100644 --- a/tests/validation_graph/negative.c +++ b/tests/validation_graph/negative.c @@ -16,33 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 1; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -53,9 +52,9 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = input->dim[i]; out_size *= output->dim[i]; } @@ -64,48 +63,47 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct siso_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_siso_params *params = 
csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; /* light: unsupport negative now*/ - if (csi_negative_init(input, output, &params) != CSINN_TRUE) { + if (csinn_negative_init(input, output, params) != CSINN_TRUE) { printf("negative init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_negative(input, output, &params); + csinn_negative(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 2 ?
atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -118,7 +116,7 @@ int main(int argc, char** argv) free(reference->qinfo); free(reference); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/pad.c b/tests/validation_graph/pad.c index 9c7ce11a..3cbf67b2 100644 --- a/tests/validation_graph/pad.c +++ b/tests/validation_graph/pad.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_pad_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_pad_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_pad(input, output, params); + csinn_pad(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_pad(struct csi_tensor *input, struct csi_tensor 
*output, struct pad_params *params, - float difference); +void test_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2] + buffer[6] + buffer[7]; @@ -94,25 +94,24 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pad_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; 
int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, 0, pad_top, pad_left}; // NCHW - int32_t pad_after[4] = {0, 0, pad_down, pad_right}; // NCHW - params.pad_before = pad_before; - params.pad_after = pad_after; - params.pad_num = input->dim_count; + int32_t pad_before[4] = {0, 0, pad_top, pad_left}; // NCHW + int32_t pad_after[4] = {0, 0, pad_down, pad_right}; // NCHW + params->pad_before = pad_before; + params->pad_after = pad_after; + params->pad_num = input->dim_count; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_pad(input, output, &params, difference); + test_pad(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/prelu.c b/tests/validation_graph/prelu.c index bc33153f..a14548ce 100644 --- a/tests/validation_graph/prelu.c +++ b/tests/validation_graph/prelu.c @@ -16,62 +16,63 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_prelu_init(input, alpha, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_prelu_init(input, alpha, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); +
csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_prelu(input, alpha, output, params); + csinn_prelu(input, alpha, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_prelu(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params, float difference); +void test_prelu(struct csinn_tensor *input, struct csinn_tensor *alpha, struct csinn_tensor *output, + struct csinn_prelu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, alpha_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + 
input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -81,8 +82,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* alpha tensor configuration */ - struct csi_tensor *alpha = csi_alloc_tensor(NULL); - alpha->dim[0] = buffer[1]; // channel + struct csinn_tensor *alpha = csinn_alloc_tensor(NULL); + alpha->dim[0] = buffer[1]; // channel alpha->dim_count = 1; alpha_size = alpha->dim[0]; alpha->name = "alpha"; @@ -93,7 +94,7 @@ int main(int argc, char** argv) alpha->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -107,15 +108,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct prelu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = 1; // channel dim + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = 1; // channel dim /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_prelu(input, alpha, output, &params, difference); + test_prelu(input, alpha, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/relu.c b/tests/validation_graph/relu.c index ec336f6c..5a8f2449 100644 --- a/tests/validation_graph/relu.c +++ b/tests/validation_graph/relu.c @@ -16,61 +16,61 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_relu_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_relu_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_relu(input, output, params); + csinn_relu(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_relu(struct csi_tensor *input, struct 
csi_tensor *output, struct relu_params *params, - float difference); +void test_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,15 +94,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct relu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_relu(input, output, &params, difference); + test_relu(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/relu1.c b/tests/validation_graph/relu1.c index dd9c51cf..8e20d44a 100644 --- a/tests/validation_graph/relu1.c +++ b/tests/validation_graph/relu1.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_relu1_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_relu1_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_relu1(input, output, params); + csinn_relu1(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); +
result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_relu1(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference); +void test_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // - input->dim[2] = buffer[2]; // - input->dim[3] = buffer[3]; // + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // + input->dim[2] = buffer[2]; // + input->dim[3] = buffer[3]; // input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,14 +94,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct relu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - 
params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.n = 1.0f; // clamp max_value + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->n = 1.0f; // clamp max_value float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_relu1(input, output, &params, difference); + test_relu1(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/relu6.c b/tests/validation_graph/relu6.c index 53594b5a..6ad70bfc 100644 --- a/tests/validation_graph/relu6.c +++ b/tests/validation_graph/relu6.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_relu6_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_relu6_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_relu6(input, output, params); + csinn_relu6(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); +
csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_relu6(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference); +void test_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // - input->dim[2] = buffer[2]; // - input->dim[3] = buffer[3]; // + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // + input->dim[2] = buffer[2]; // + input->dim[3] = buffer[3]; // input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); 
output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,15 +94,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct relu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.n = 6.0f; // clamp max_value + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->n = 6.0f; // clamp max_value /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_relu6(input, output, &params, difference); + test_relu6(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/reshape.c b/tests/validation_graph/reshape.c index 44721bcd..ad23b070 100644 --- a/tests/validation_graph/reshape.c +++ b/tests/validation_graph/reshape.c @@ -16,66 +16,66 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_reshape_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_reshape_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_reshape(input, output, params); + csinn_reshape(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_reshape(struct csi_tensor 
*input, struct csi_tensor *output, - struct reshape_params *params, float difference); +void test_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reshape(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int reshape_count = buffer[4]; int *reshape = (int *)malloc(reshape_count * sizeof(int)); - for(int i = 0; i < reshape_count; i++) { + for (int i = 0; i < reshape_count; i++) { reshape[i] = buffer[5 + i]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 1; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -84,9 +84,9 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim_count = reshape_count; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = reshape[i]; out_size *= output->dim[i]; } @@ -97,15 +97,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct reshape_params params; - params.base.name = "params"; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.shape = reshape; - params.shape_num = output->dim_count; + struct csinn_reshape_params *params = + csinn_alloc_params(sizeof(struct csinn_reshape_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->shape = reshape; + params->shape_num = output->dim_count; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_reshape(input, output, &params, difference); + test_reshape(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/resize_bilinear.c b/tests/validation_graph/resize_bilinear.c index 100198b5..be71e6b4 100644 --- a/tests/validation_graph/resize_bilinear.c +++ b/tests/validation_graph/resize_bilinear.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_resize_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_resize_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_resize(input, output, params); + csinn_resize(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output,
sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference); +void test_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) 
input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = buffer[4]; @@ -94,15 +94,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct resize_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_resize(input, output, &params, difference); + test_resize(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/resize_nearest_neighbor.c b/tests/validation_graph/resize_nearest_neighbor.c index 86ddad93..b7d0f1a8 100644 --- a/tests/validation_graph/resize_nearest_neighbor.c +++ b/tests/validation_graph/resize_nearest_neighbor.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_resize_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_resize_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_resize(input, output, params); + csinn_resize(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_resize(struct csi_tensor *input, 
struct csi_tensor *output, struct resize_params *params, - float difference); +void test_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = buffer[4]; @@ -94,15 +94,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct resize_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->resize_mode = 
CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_resize(input, output, &params, difference); + test_resize(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/sigmoid.c b/tests/validation_graph/sigmoid.c index 6cbe9c64..53a46bfe 100644 --- a/tests/validation_graph/sigmoid.c +++ b/tests/validation_graph/sigmoid.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_sigmoid_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_sigmoid_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_sigmoid(input, output, params); + csinn_sigmoid(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, 
foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params, float difference); +void test_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -79,7 +79,7 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -93,14 +93,14 @@ int 
main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct sigmoid_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_sigmoid(input, output, &params, difference); + test_sigmoid(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/softmax.c b/tests/validation_graph/softmax.c index ccd060fd..d0675212 100644 --- a/tests/validation_graph/softmax.c +++ b/tests/validation_graph/softmax.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct softmax_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_softmax_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_softmax_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_softmax(input, output, params); + csinn_softmax(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + 
csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_softmax(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params, float difference); +void test_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int axis = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = buffer[0]; input->dim[1] = buffer[1]; input->dim[2] = buffer[2]; @@ -81,7 +81,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -95,15 
+95,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct softmax_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_softmax(input, output, &params, difference); + test_softmax(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/space_to_batch_nd.c b/tests/validation_graph/space_to_batch_nd.c index 85e0c07f..5257a98b 100644 --- a/tests/validation_graph/space_to_batch_nd.c +++ b/tests/validation_graph/space_to_batch_nd.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_batch_nd(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); float min_value, max_value; int in_size = 1, out_size = 1; int prod_block = 1; int spatial_shape_cnt = buffer[0]; - int remain_shape_cnt = buffer[1]; + int remain_shape_cnt = buffer[1]; int32_t *block_shape = (int32_t *)malloc(spatial_shape_cnt * sizeof(int32_t)); int32_t *paddings = (int32_t *)malloc(2 * spatial_shape_cnt * sizeof(int32_t)); enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; - for(int i = 0; i < spatial_shape_cnt; i++) { + for (int i = 0; i < spatial_shape_cnt; i++) { block_shape[i] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i]; 
paddings[2 * i] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i + 1]; paddings[2 * i + 1] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i + 2]; @@ -45,17 +45,16 @@ int main(int argc, char** argv) } /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_LIGHT; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); input->dim_count = 1 + spatial_shape_cnt + remain_shape_cnt; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } @@ -66,16 +65,18 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); - output->dim_count = 1 + spatial_shape_cnt + remain_shape_cnt; // output->dim_cnt = input->dim_cnt - output->dim[0] = input->dim[0] * prod_block; // batch_out - for(int i = 0; i < spatial_shape_cnt; i++) { - output->dim[1 + i] = (input->dim[1 + i] + paddings[2 * i] + paddings[ 2 * i + 1]) / block_shape[i]; + struct csinn_tensor *output = csinn_alloc_tensor(sess); + output->dim_count = + 1 + spatial_shape_cnt + remain_shape_cnt; // output->dim_cnt = input->dim_cnt + output->dim[0] = input->dim[0] * prod_block; // batch_out + for (int i = 0; i < spatial_shape_cnt; i++) { + output->dim[1 + i] = + (input->dim[1 + i] + paddings[2 * i] + paddings[2 * i + 1]) / block_shape[i]; } - for(int i = 0; i < remain_shape_cnt; i++) { + for (int i = 0; i < remain_shape_cnt; i++) { output->dim[1 + spatial_shape_cnt + i] = input->dim[1 + spatial_shape_cnt + i]; } - for(int i = 0; i < 
output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { out_size *= output->dim[i]; } reference->data = (float *)(buffer + 2 + spatial_shape_cnt * 3 + input->dim_count + in_size); @@ -83,47 +84,45 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct space_to_batch_nd_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.block_shape = block_shape; - params.paddings = paddings; - params.spatial_dim_cnt = spatial_shape_cnt; - - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_space_to_batch_nd_params *params; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->block_shape = block_shape; + params->paddings = paddings; + params->spatial_dim_cnt = spatial_shape_cnt; + + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; - if (csi_space_to_batch_nd_init(input, output, &params) != CSINN_TRUE) { + if (csinn_space_to_batch_nd_init(input, output, params) != CSINN_TRUE) { printf("spce_to_batch_nd init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_space_to_batch_nd(input, output, &params); + csinn_space_to_batch_nd(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; 
output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* FIX ME */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); /* free alloced memory */ @@ -137,7 +136,7 @@ int main(int argc, char** argv) free(block_shape); free(paddings); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/space_to_depth.c b/tests/validation_graph/space_to_depth.c index d48ec375..02f859b7 100644 --- a/tests/validation_graph/space_to_depth.c +++ b/tests/validation_graph/space_to_depth.c @@ -16,62 +16,62 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct space_to_depth_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_space_to_depth_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_space_to_depth_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_space_to_depth(input, output, params); + csinn_space_to_depth(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + 
csinn_free_session(sess); } -void test_space_to_depth(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, float difference); +void test_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int block_size = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -81,7 +81,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1] * block_size * block_size; output->dim[2] = input->dim[2] / block_size; @@ -95,15 +95,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct space_to_depth_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.block_size = block_size; + struct csinn_space_to_depth_params *params = + 
csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->block_size = block_size; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_space_to_depth(input, output, &params, difference); + test_space_to_depth(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/split.c b/tests/validation_graph/split.c index 793dada9..032d71f2 100644 --- a/tests/validation_graph/split.c +++ b/tests/validation_graph/split.c @@ -16,50 +16,50 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor **output, struct split_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float **output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float **output_data, float diff) { int output_cnt = params->output_num; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(output_cnt, sess); - csi_split_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(output_cnt, sess); + csinn_split_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_split(input, output, params); + csinn_split(input, output, params); - for(int i = 0; i < output_cnt; i++) { - csi_set_output(i, output[i], sess); + for (int i = 0; i < output_cnt; i++) { + csinn_set_output(i, output[i], sess); } - csi_session_setup(sess); - - csi_update_input(0, real_input, sess); - csi_session_run(sess); - 
for(int i = 0; i < output_cnt; i++) { - csi_get_output(i, output[i], sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output[i]); - result_verify_f32(output_data[i], foutput->data, input->data, diff, csi_tensor_size(output[i]), - false); + csinn_session_setup(sess); + + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + for (int i = 0; i < output_cnt; i++) { + csinn_get_output(i, output[i], sess); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output[i]); + result_verify_f32(output_data[i], foutput->data, input->data, diff, + csinn_tensor_size(output[i]), false); } free_input(real_input); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_split(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params, float difference); +void test_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of split(graph).\n"); @@ -67,25 +67,25 @@ int main(int argc, char** argv) int axis = buffer[4]; int output_cnt = buffer[5]; int32_t *split_index = (int32_t *)malloc(output_cnt * sizeof(int32_t)); - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { split_index[i] = buffer[axis] / output_cnt; } - struct csi_tensor *reference[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - reference[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *reference[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + reference[i] = csinn_alloc_tensor(NULL); } float min_value, max_value; int in_size = 0; int out_size[output_cnt]; - int acc_out_size = 0; // in fact, different output tensor may has different out_size + int acc_out_size = 0; // in fact, different output tensor may has different out_size /* input tensor configuration */ - struct csi_tensor *input 
= csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -95,12 +95,12 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output[output_cnt]; + struct csinn_tensor *output[output_cnt]; char output_name[output_cnt][10]; - for(int i = 0; i < output_cnt; i++) { - output[i] = csi_alloc_tensor(NULL); - for(int j = 0; j < 4; j++) { - if(j == axis) { + for (int i = 0; i < output_cnt; i++) { + output[i] = csinn_alloc_tensor(NULL); + for (int j = 0; j < 4; j++) { + if (j == axis) { output[i]->dim[j] = split_index[i]; } else { output[i]->dim[j] = input->dim[j]; @@ -119,22 +119,21 @@ int main(int argc, char** argv) } /* operator parameter configuration */ - struct split_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; - params.output_num = output_cnt; + struct csinn_split_params *params = csinn_alloc_params(sizeof(struct csinn_split_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->output_num = output_cnt; int temp = 0; - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { temp += split_index[i]; split_index[i] = temp; } - params.split_index = split_index; + params->split_index = split_index; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_split(input, output, &params, difference); + test_split(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/squeeze.c b/tests/validation_graph/squeeze.c index 164ac35e..f9951ecc 100644 --- a/tests/validation_graph/squeeze.c +++ b/tests/validation_graph/squeeze.c @@ -16,65 +16,65 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_squeeze_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_squeeze_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_squeeze(input, output, params); + csinn_squeeze(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params, float difference); +void test_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; int32_t *axis = (int32_t *)malloc(axis_len * sizeof(int32_t)); - for(int i = 0; i < axis_len; i++) { - axis[i] = buffer[4+i]; + for (int i = 0; i < axis_len; i++) { + axis[i] = buffer[4 + i]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; @@ -87,7 +87,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[2]; @@ -100,16 +100,16 @@ int main(int argc, char** argv) output->dtype = 
CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct squeeze_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; - params.axis_num = axis_len; + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->axis_num = axis_len; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_squeeze(input, output, ¶ms, difference); + test_squeeze(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/strided_slice.c b/tests/validation_graph/strided_slice.c index 18174212..afbdbc3a 100644 --- a/tests/validation_graph/strided_slice.c +++ b/tests/validation_graph/strided_slice.c @@ -16,44 +16,43 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int in_out_dim = buffer[0]; int slice_count = buffer[1 + in_out_dim]; - int *begin = (int *)malloc(slice_count * sizeof(int)); - int *end = (int *)malloc(slice_count * sizeof(int)); - int *stride = (int *)malloc(slice_count * sizeof(int)); + int *begin = (int *)malloc(slice_count * sizeof(int)); + int *end = (int *)malloc(slice_count * sizeof(int)); + int *stride = (int *)malloc(slice_count * sizeof(int)); - for(int i = 0; i < slice_count; i++) { + for (int i = 0; i < slice_count; i++) { begin[i] = buffer[2 + in_out_dim + 3 * i]; end[i] = buffer[2 + in_out_dim + 3 * i + 1]; stride[i] = buffer[2 + in_out_dim + 3 * i + 2]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor 
*reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 1; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); input->dim_count = in_out_dim; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -64,11 +63,11 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim_count = in_out_dim; - for(int i = 0; i < output->dim_count; i++) { - if(i < slice_count) { - output->dim[i] = ceil( (float)(end[i] - begin[i]) / stride[i] ); + for (int i = 0; i < output->dim_count; i++) { + if (i < slice_count) { + output->dim[i] = ceil((float)(end[i] - begin[i]) / stride[i]); } else { output->dim[i] = input->dim[i]; } @@ -80,55 +79,55 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct strided_slice_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.begin = begin; - params.end = end; - params.stride = stride; - params.slice_count = slice_count; - - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct 
csinn_strided_slice_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->begin = begin; + params->end = end; + params->stride = stride; + params->slice_count = slice_count; + + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; /* - Cropping on the batch axis is not supported. --> begin[0] = 0, end[0] = batch( input->dim[0] ), stride[0] = 1 - slice_count == input->dim_count + Cropping on the batch axis is not supported. --> begin[0] = 0, end[0] = batch( input->dim[0] + ), stride[0] = 1 slice_count == input->dim_count */ - if (csi_strided_slice_init(input, output, ¶ms) != CSINN_TRUE) { + if (csinn_strided_slice_init(input, output, params) != CSINN_TRUE) { printf("strided_slice init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_strided_slice(input, output, ¶ms); + csinn_strided_slice(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 
2 ? atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -144,7 +143,7 @@ int main(int argc, char** argv) free(end); free(stride); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/sub.c b/tests/validation_graph/sub.c index bab9ff54..5abd8f75 100644 --- a/tests/validation_graph/sub.c +++ b/tests/validation_graph/sub.c @@ -16,67 +16,67 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_sub_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_sub_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_sub(input0, input1, output, params); + csinn_sub(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_sub(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 0, in1_size = 0, out_size = 0; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->name = "input0"; @@ -86,11 +86,11 @@ int main(int argc, char** argv) input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - input1->dim[0] = buffer[0]; // batch - input1->dim[1] = buffer[1]; // channel - input1->dim[2] = buffer[2]; // height - input1->dim[3] = buffer[3]; // width + struct csinn_tensor *input1 = 
csinn_alloc_tensor(NULL); + input1->dim[0] = buffer[0]; // batch + input1->dim[1] = buffer[1]; // channel + input1->dim[2] = buffer[2]; // height + input1->dim[3] = buffer[3]; // width input1->dim_count = 4; in1_size = input1->dim[0] * input1->dim[1] * input1->dim[2] * input1->dim[3]; input1->name = "input1"; @@ -100,13 +100,14 @@ int main(int argc, char** argv) input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; output->dim[3] = input0->dim[3]; output->dim_count = 4; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3];; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; + ; reference->data = (float *)(buffer + 4 + in0_size + in1_size); output->data = reference->data; output->name = "output"; @@ -114,14 +115,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_sub(input0, input1, output, ¶ms, difference); + test_sub(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/tanh.c b/tests/validation_graph/tanh.c index 91e9c244..42ac1ca3 100644 --- a/tests/validation_graph/tanh.c +++ b/tests/validation_graph/tanh.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_tanh_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_tanh_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_tanh(input, output, params); + csinn_tanh(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_tanh(struct csi_tensor *input, struct 
csi_tensor *output, struct siso_params *params, - float difference); +void test_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[1]; // batch - input->dim[1] = buffer[2]; // in_channel - input->dim[2] = buffer[3]; // height - input->dim[3] = buffer[4]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[1]; // batch + input->dim[1] = buffer[2]; // in_channel + input->dim[2] = buffer[3]; // height + input->dim[3] = buffer[4]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,14 +94,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct siso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_tanh(input, output, ¶ms, difference); + test_tanh(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/transpose.c b/tests/validation_graph/transpose.c index d3ecab37..b21db9a4 100644 --- a/tests/validation_graph/transpose.c +++ b/tests/validation_graph/transpose.c @@ -16,63 +16,63 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct transpose_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_transpose_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_transpose_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_transpose(input, output, params); + csinn_transpose(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params, float difference); +void test_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int32_t *permute = (int32_t *)malloc(buffer[0] * sizeof(int32_t)); - for(int i = 0; i < buffer[0]; i++) { + for (int i = 0; i < buffer[0]; i++) { permute[i] = buffer[1 + buffer[0] + i]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 1; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -82,29 +82,29 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = input->dim[permute[i]]; out_size *= output->dim[i]; } reference->data = (float *)(buffer + 1 + 3 * 
input->dim_count + in_size); - output->data= reference->data; + output->data = reference->data; output->name = "output"; output->layout = CSINN_LAYOUT_NCHW; output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct transpose_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.permute = permute; - params.permute_num = input->dim_count; + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->permute = permute; + params->permute_num = input->dim_count; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_transpose(input, output, ¶ms, difference); + test_transpose(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_layer/Makefile.c906 b/tests/validation_layer/Makefile.c906 index f31a4774..92900fb9 100644 --- a/tests/validation_layer/Makefile.c906 +++ b/tests/validation_layer/Makefile.c906 @@ -4,7 +4,7 @@ CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections CFLAGS += -DCSINN_API=3 -LIB_NAME = csi_nn2_c906 +LIB_NAME = shl_c906 CC = riscv64-unknown-linux-gnu-gcc test_objs = diff --git a/tests/validation_layer/Makefile.c908 b/tests/validation_layer/Makefile.c908 new file mode 100644 index 00000000..9b02ac6d --- /dev/null +++ b/tests/validation_layer/Makefile.c908 @@ -0,0 +1,44 @@ +LIB_DIR = ../../riscv_build +INCLUDE = -I../../include -I../utils -I./layer +CFLAGS = -O0 -g3 -static +CFLAGS += -march=rv64gcv_zfh_xtheadc_xtheadv -mabi=lp64d +CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections +CFLAGS += -DCSINN_API=12 +LIB_NAME = shl_c908 +CC = riscv64-unknown-linux-gnu-gcc +CPLUS = riscv64-unknown-linux-gnu-g++ +TYPE=? 
+ +test_objs = + + + +test_objs += averagepool.o +test_objs += convolution.o +test_objs += depthwise_convolution.o +test_objs += fullyconnected.o +test_objs += global_avgpool.o +test_objs += global_maxpool.o +test_objs += maxpool.o + + + +utils_objs = + +utils_objs += ../utils/math_snr.o +utils_objs += ../utils/test_utils.o +# template_objs += ./layer/common.o + +all: csi + +csi: $(utils_objs) $(test_objs) + +$(utils_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + +$(test_objs): %.o: %.cpp + $(CPLUS) -c $(CFLAGS) $(INCLUDE) -D DTYPE=$(TYPE) $< -o $@ + $(CPLUS) $@ $(CFLAGS) $(BOARD) $(utils_objs) $(template_objs) -L$(LIB_DIR) -l$(LIB_NAME) -lc -lm -o $@.elf -lgcov + +clean: + rm -rf $(test_objs) $(utils_objs) *.a *.asm *.elf *.asm diff --git a/tests/validation_layer/Makefile.rvv b/tests/validation_layer/Makefile.rvv index d5e1dc23..c3054919 100644 --- a/tests/validation_layer/Makefile.rvv +++ b/tests/validation_layer/Makefile.rvv @@ -4,7 +4,7 @@ CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections CFLAGS += -DCSINN_API=15 -LIB_NAME = csi_nn2_rvv +LIB_NAME = shl_rvv CC = riscv64-unknown-linux-gnu-gcc CPLUS = riscv64-unknown-linux-gnu-g++ RVV=1 diff --git a/tests/validation_layer/abs.cpp b/tests/validation_layer/abs.cpp index 7ef09536..ccd0383c 100644 --- a/tests/validation_layer/abs.cpp +++ b/tests/validation_layer/abs.cpp @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_utils.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -28,10 +27,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of abs(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); @@ -57,18 +58,17 @@ int main(int argc, char **argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_abs_init, csi_abs, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_UINT8_ASYM, csi_abs_init, csi_abs, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_abs_init, csinn_abs, &difference); + test_unary_op(input, output, params, CSINN_QUANT_UINT8_ASYM, csinn_abs_init, csinn_abs, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_abs_init, csi_abs, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_abs_init, csinn_abs, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/acos.c b/tests/validation_layer/acos.c index 5881b2fb..f93c17c6 100644 --- a/tests/validation_layer/acos.c +++ b/tests/validation_layer/acos.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acos(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; 
output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_acos_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_acos_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_acos_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_acos_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_acos_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_acos_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/acosh.c b/tests/validation_layer/acosh.c index f72d5042..5efb8cf4 100644 --- a/tests/validation_layer/acosh.c +++ b/tests/validation_layer/acosh.c @@ -16,27 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acosh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -51,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_acosh_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_acosh_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_acosh_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_acosh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_acosh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_acosh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/add.cpp b/tests/validation_layer/add.cpp index 5d5302fa..cc1a09f3 100644 --- a/tests/validation_layer/add.cpp +++ b/tests/validation_layer/add.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,11 +28,13 @@ int main(int argc, char **argv) { init_testsuite("Testing function of add(layer).\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); @@ -48,11 +49,24 @@ int main(int argc, char **argv) input0->dtype = CSINN_DTYPE_FLOAT32; input0->is_const = 0; input0->quant_channel = 1; - if (flag) { + if (int(flag) == 1) {
input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; - } else { + } + else if (int(flag) == 2) { + input1->dim[0] = 1; + input1->dim_count = 1; + in_size1 = input1->dim[0]; + } + else if (int(flag) == 3) { + input1->dim[0] = input0->dim[1]; + input1->dim[1] = input0->dim[2]; + input1->dim[2] = 1; + input1->dim_count = 3; + in_size1 = input1->dim[0] * input1->dim[1]; + } + else { input1->dim[0] = input0->dim[0]; input1->dim[1] = input0->dim[1]; input1->dim[2] = input0->dim[2]; @@ -76,8 +90,7 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input0->data = (float *)(buffer + 5); input1->data = (float *)(buffer + 5 + in_size0); @@ -86,18 +99,18 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.9; #if THEAD_RVV - test_binary_op(input0, input1, output, &params, CSINN_QUANT_FLOAT32, csi_add_init, - csi_nn_rvv_add_fp32, &difference); - test_binary_op(input0, input1, output, &params, CSINN_QUANT_FLOAT16, csi_add_init, - csi_nn_rvv_add_fp16, &difference); - test_binary_op(input0, input1, output, &params, CSINN_QUANT_INT8_SYM, csi_add_init, - csi_nn_rvv_add_int8, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_add_init, + shl_rvv_add_fp32, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_add_init, + shl_rvv_add_fp16, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_SYM, csinn_add_init, + shl_rvv_add_int8, &difference); #else - test_binary_op(input0, input1, output, &params, CSINN_QUANT_FLOAT32, csi_add_init, csi_add, + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_add_init, csinn_add, &difference); - test_binary_op(input0, input1, output, &params, CSINN_QUANT_UINT8_ASYM, csi_add_init, csi_add, + test_binary_op(input0,
input1, output, params, CSINN_QUANT_UINT8_ASYM, csinn_add_init, csinn_add, &difference); - test_binary_op(input0, input1, output, &params, CSINN_QUANT_INT8_SYM, csi_add_init, csi_add, + test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_SYM, csinn_add_init, csinn_add, &difference); #endif diff --git a/tests/validation_layer/and.c b/tests/validation_layer/and.c index 6511d898..184d40c0 100644 --- a/tests/validation_layer/and.c +++ b/tests/validation_layer/and.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of and u32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -57,18 +59,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api =
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (uint32_t *)(buffer + 1 + input0->dim_count); - input1->data = (uint32_t *)(buffer + 1 + input0->dim_count + in_size); + input0->data = (uint32_t *)(buffer + 1 + input0->dim_count); + input1->data = (uint32_t *)(buffer + 1 + input0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input0->dim_count + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_and_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_and_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_and_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_and_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_and_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_and_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/arange.c b/tests/validation_layer/arange.c index 196b33eb..a1388f7f 100644 --- a/tests/validation_layer/arange.c +++ b/tests/validation_layer/arange.c @@ -16,7 +16,7 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" @@ -26,35 +26,37 @@ int main(int argc, char **argv) { init_testsuite("Testing function of arange(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct arange_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_arange_params *params = + csinn_alloc_params(sizeof(struct csinn_arange_params), sess); int out_size = 1; int *buffer = read_input_data_f32(argv[1]); out_size = buffer[3]; - params.start = buffer[0]; - params.stop = buffer[1]; - params.step = buffer[2]; + params->start = buffer[0]; + params->stop = buffer[1]; + params->step = buffer[2]; output->dim_count = 1; output->dim[0] = out_size; output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = 0; reference->data = (float *)(buffer + 4); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_arange_CSINN_QUANT_FLOAT32(output, &params, &difference); - test_arange_CSINN_QUANT_UINT8_ASYM(output, &params, &difference); - test_arange_CSINN_QUANT_INT8_SYM(output, &params, &difference); + test_arange_CSINN_QUANT_FLOAT32(output, params, &difference); + test_arange_CSINN_QUANT_UINT8_ASYM(output, params, &difference); + test_arange_CSINN_QUANT_INT8_SYM(output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/argmax.c b/tests/validation_layer/argmax.c index b7d67e8f..27a8ac3e 100644 --- a/tests/validation_layer/argmax.c +++ b/tests/validation_layer/argmax.c @@ -16,51 +16,49 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - - init_testsuite("Testing function of argmax(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; -
for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -74,33 +72,30 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); // output->data = malloc(out_size * sizeof(float)); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_argmax_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_argmax_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_argmax_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_argmax_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_argmax_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_argmax_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/argmin.c b/tests/validation_layer/argmin.c index 46ec6c87..5dc4ef72 100644 --- a/tests/validation_layer/argmin.c +++ b/tests/validation_layer/argmin.c @@ -16,51 +16,49 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - - init_testsuite("Testing function of argmin(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -74,33 +72,30 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); // output->data = malloc(out_size * sizeof(float)); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_argmin_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_argmin_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_argmin_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_argmin_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_argmin_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); +
test_argmin_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/asin.c b/tests/validation_layer/asin.c index eeafb769..04f64387 100644 --- a/tests/validation_layer/asin.c +++ b/tests/validation_layer/asin.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asin(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = 
reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_asin_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_asin_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_asin_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_asin_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_asin_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_asin_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/asinh.c b/tests/validation_layer/asinh.c index 9b6f6a17..d83eab17 100644 --- a/tests/validation_layer/asinh.c +++ b/tests/validation_layer/asinh.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asinh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv)
output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_asinh_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_asinh_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_asinh_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_asinh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_asinh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_asinh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/atan.c b/tests/validation_layer/atan.c index 67d067b6..4fe305c6 100644 --- a/tests/validation_layer/atan.c +++ b/tests/validation_layer/atan.c @@ -16,26 +16,28 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atan(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_atan_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_atan_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_atan_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_atan_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_atan_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_atan_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/atanh.c b/tests/validation_layer/atanh.c index f2194bad..2da47b37 100644 --- a/tests/validation_layer/atanh.c +++ b/tests/validation_layer/atanh.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atanh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0;
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_atanh_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_atanh_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_atanh_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_atanh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_atanh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_atanh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/averagepool.cpp b/tests/validation_layer/averagepool.cpp index 821b8d03..ee309904 100644 --- a/tests/validation_layer/averagepool.cpp +++ b/tests/validation_layer/averagepool.cpp @@ -16,48 +16,48 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" #include "testutil.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = (csinn_pool_params *)csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = 
buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; @@ -72,23 +72,24 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.count_include_pad = 1; - params.ceil_mode = 0; + params->base.api = CSINN_API; + params->count_include_pad = buffer[14]; + params->ceil_mode = buffer[15]; - input->data = (float *)(buffer + 15); - reference->data = (float *)(buffer + 15 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 16); + reference->data = (float *)(buffer + 16 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_avgpool2d_init, csi_avgpool2d, +#if (DTYPE==32) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_avgpool2d_init, csinn_avgpool2d, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT16, csi_avgpool2d_init, csi_avgpool2d, +#elif (DTYPE==16) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_avgpool2d_init, csinn_avgpool2d, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_avgpool2d_init, csi_avgpool2d, +#elif (DTYPE==8) + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_avgpool2d_init, csinn_avgpool2d, &difference); - - +#endif return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/averagepool3d.c b/tests/validation_layer/averagepool3d.c index d1bc927f..7fb14203 100644 --- a/tests/validation_layer/averagepool3d.c +++ b/tests/validation_layer/averagepool3d.c @@ -16,31 +16,31 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool3d(layer).\n"); - - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -48,21 +48,21 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; + params->stride_depth = buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; 
- params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.count_include_pad = buffer[20]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->count_include_pad = buffer[20]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCDHW; @@ -77,17 +77,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 21); reference->data = (float *)(buffer + 21 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_avgpool3d_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_avgpool3d_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_avgpool3d_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_avgpool3d_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_avgpool3d_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_avgpool3d_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/batch_norm.c b/tests/validation_layer/batch_norm.c index 073a019a..406433bf 100644 --- a/tests/validation_layer/batch_norm.c +++ b/tests/validation_layer/batch_norm.c @@ -16,24 +16,25 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization(layer).\n"); - - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *mean = csi_alloc_tensor(NULL); - struct csi_tensor *variance = csi_alloc_tensor(NULL); - struct csi_tensor *beta = csi_alloc_tensor(NULL); - struct csi_tensor *gamma = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct bn_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *mean = csinn_alloc_tensor(sess); + struct csinn_tensor *variance = csinn_alloc_tensor(sess); + struct csinn_tensor *beta = csinn_alloc_tensor(sess); + struct csinn_tensor *gamma = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), sess); int size = 1; int *buffer = read_input_data_f32(argv[1]); @@ -47,10 +48,10 @@ int main(int argc, char** argv) size *= input->dim[i]; } - mean->dim_count = 1; + mean->dim_count = 1; variance->dim_count = 1; - gamma->dim_count = 1; - beta->dim_count = 1; + gamma->dim_count = 1; + beta->dim_count = 1; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NHWC; @@ -76,23 +77,29 @@ int main(int argc, char** argv) beta->layout = CSINN_LAYOUT_O; beta->is_const = 0; beta->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.epsilon = *((float *)buffer + 1 + input->dim_count); - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = 
CSINN_LAYOUT_NHWC; + params->epsilon = *((float *)buffer + 1 + input->dim_count); + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); - mean->data = (float *)(buffer + 2 + input->dim_count + size); - variance->data = (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); - gamma->data = (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); - beta->data = (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); - reference->data = (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); - output->data = reference->data; + input->data = (float *)(buffer + 2 + input->dim_count); + mean->data = (float *)(buffer + 2 + input->dim_count + size); + variance->data = + (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); + gamma->data = + (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); + beta->data = + (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); + reference->data = + (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_batch_normalization_CSINN_QUANT_FLOAT32(input, mean, variance, gamma, beta, output, &params, &difference); - test_batch_normalization_CSINN_QUANT_UINT8_ASYM(input, mean, variance, gamma, beta, output, &params, &difference); - test_batch_normalization_CSINN_QUANT_INT8_SYM(input, mean, variance, gamma, beta, output, &params, &difference); + test_batch_normalization_CSINN_QUANT_FLOAT32(input, mean, variance, gamma, beta, output, params, + &difference); + test_batch_normalization_CSINN_QUANT_UINT8_ASYM(input, mean, variance, gamma, beta, output, + params, &difference); + test_batch_normalization_CSINN_QUANT_INT8_SYM(input, mean, variance, gamma, beta, output, + params, &difference); return done_testing(); } diff --git a/tests/validation_layer/batch_to_space.c b/tests/validation_layer/batch_to_space.c index ed4be8e5..c5ba0294 100644 --- a/tests/validation_layer/batch_to_space.c +++ b/tests/validation_layer/batch_to_space.c @@ -16,39 +16,42 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space(laver).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct batch_to_space_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_batch_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_batch_to_space_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //in_batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width - params.block_size = buffer[4]; - params.crop_top = buffer[5]; - params.crop_bottom = buffer[6]; - params.crop_left = buffer[7]; - params.crop_right = buffer[8]; + input->dim[0] = buffer[0]; // in_batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width + params->block_size = buffer[4]; + params->crop_top = buffer[5]; + params->crop_bottom = buffer[6]; + params->crop_left = buffer[7]; + params->crop_right = buffer[8]; - output->dim[0] = input->dim[0] / (params.block_size * params.block_size); + output->dim[0] = input->dim[0] / (params->block_size * params->block_size); output->dim[1] = input->dim[1]; - output->dim[2] = input->dim[2] * params.block_size - params.crop_top - params.crop_bottom; - output->dim[3] = input->dim[3] * params.block_size - params.crop_left - params.crop_right; + 
output->dim[2] = input->dim[2] * params->block_size - params->crop_top - params->crop_bottom; + output->dim[3] = input->dim[3] * params->block_size - params->crop_left - params->crop_right; input->dim_count = 4; output->dim_count = 4; @@ -63,18 +66,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 9); reference->data = (float *)(buffer + 9 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_batch_to_space_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_batch_to_space_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_batch_to_space_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_batch_to_space_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_batch_to_space_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_batch_to_space_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/broadcast_to.c b/tests/validation_layer/broadcast_to.c index 5180c575..66fe52e9 100644 --- a/tests/validation_layer/broadcast_to.c +++ b/tests/validation_layer/broadcast_to.c @@ -16,7 +16,7 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" @@ -26,17 +26,20 @@ int main(int argc, char **argv) { init_testsuite("Testing function of broadcast_to(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct broadcast_to_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_broadcast_to_params *params = + csinn_alloc_params(sizeof(struct csinn_broadcast_to_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - params.shape_count = buffer[1]; + params->shape_count = buffer[1]; output->dim_count = buffer[1]; for (int i = 0; i < input->dim_count; i++) { @@ -44,12 +47,12 @@ int main(int argc, char **argv) in_size = in_size * input->dim[i]; } - params.shape = (int *)malloc(params.shape_count * sizeof(int)); + params->shape = (int *)malloc(params->shape_count * sizeof(int)); - for (int i = 0; i < params.shape_count; i++) { + for (int i = 0; i < params->shape_count; i++) { output->dim[i] = buffer[2 + input->dim_count + i]; out_size = out_size * output->dim[i]; - params.shape[i] = output->dim[i]; + params->shape[i] = output->dim[i]; } input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; @@ -59,17 +62,16 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count + params.shape_count); - reference->data = (float *)(buffer + 2 + input->dim_count + 
params.shape_count + in_size); + input->data = (float *)(buffer + 2 + input->dim_count + params->shape_count); + reference->data = (float *)(buffer + 2 + input->dim_count + params->shape_count + in_size); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_broadcast_to_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_broadcast_to_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_broadcast_to_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_broadcast_to_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_broadcast_to_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_broadcast_to_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/ceil.c b/tests/validation_layer/ceil.c index 07141e14..3b21332d 100644 --- a/tests/validation_layer/ceil.c +++ b/tests/validation_layer/ceil.c @@ -16,27 +16,29 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ceil(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_ceil_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_ceil_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_ceil_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_ceil_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_ceil_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_ceil_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/clip.c b/tests/validation_layer/clip.c index b251eecb..50569f3c 100644 --- a/tests/validation_layer/clip.c +++ b/tests/validation_layer/clip.c @@ -16,29 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of clip(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct clip_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] =
input->dim[1]; @@ -58,19 +60,18 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.min_value = buffer[4]; - params.max_value = buffer[5]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->min_value = buffer[4]; + params->max_value = buffer[5]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_clip_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_clip_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_clip_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + + test_clip_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_clip_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_clip_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/concat.cpp b/tests/validation_layer/concat.cpp index 24a5e2e3..ef1f2c02 100644 --- a/tests/validation_layer/concat.cpp +++ b/tests/validation_layer/concat.cpp @@ -16,11 +16,10 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -31,35 +30,35 @@ int main(int argc, char **argv) int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_concat_params *params = csinn_alloc_params(sizeof(struct csinn_concat_params), sess); - struct concat_params params; + params->inputs_count = buffer[4]; - params.inputs_count = buffer[4]; + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *input[params->inputs_count]; - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input[params.inputs_count]; - - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(sess); } - params.axis = buffer[5]; + params->axis = buffer[5]; output->dim_count = 4; for (int i = 0; i < output->dim_count; i++) { - if (i == params.axis) { - output->dim[i] = params.inputs_count * buffer[i]; + if (i == params->axis) { + output->dim[i] = params->inputs_count * buffer[i]; } else { output->dim[i] = buffer[i]; } out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - for (int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { input[i]->data = (float *)(buffer + 6 + in_size * i); input[i]->dim[0] = buffer[0]; // batch input[i]->dim[1] = buffer[1]; // height @@ -76,24 +75,24 @@ int main(int argc, char **argv) 
output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - reference->data = (float *)(buffer + 6 + in_size * params.inputs_count); + reference->data = (float *)(buffer + 6 + in_size * params->inputs_count); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; #if THEAD_RVV - test_concat_op((struct csi_tensor **)input, output, &params, CSINN_QUANT_FLOAT32, - csi_concat_init, csi_nn_rvv_concat_fp32, &difference); - test_concat_op((struct csi_tensor **)input, output, &params, CSINN_QUANT_FLOAT16, - csi_concat_init, csi_nn_rvv_concat_fp16, &difference); - test_concat_op((struct csi_tensor **)input, output, &params, CSINN_QUANT_INT8_SYM, - csi_concat_init, csi_nn_rvv_concat_int8, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_FLOAT32, + csinn_concat_init, shl_rvv_concat_fp32, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_FLOAT16, + csinn_concat_init, shl_rvv_concat_fp16, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_INT8_SYM, + csinn_concat_init, shl_rvv_concat_int8, &difference); #else - test_concat_op((struct csi_tensor **)input, output, &params, CSINN_QUANT_FLOAT32, - csi_concat_init, csi_concat, &difference); - test_concat_op((struct csi_tensor **)input, output, &params, CSINN_QUANT_UINT8_ASYM, - csi_concat_init, csi_concat, &difference); - test_concat_op((struct csi_tensor **)input, output, &params, CSINN_QUANT_INT8_SYM, - csi_concat_init, csi_concat, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_FLOAT32, + csinn_concat_init, csinn_concat, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_UINT8_ASYM, + csinn_concat_init, csinn_concat, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_INT8_SYM, + csinn_concat_init, csinn_concat, &difference); #endif return done_testing(); diff
--git a/tests/validation_layer/convolution.cpp b/tests/validation_layer/convolution.cpp index e383b461..8caf27dc 100644 --- a/tests/validation_layer/convolution.cpp +++ b/tests/validation_layer/convolution.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,12 +28,14 @@ int main(int argc, char** argv) { init_testsuite("Testing function of convolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = (csinn_conv2d_params *)csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, kernel_size; if (argc == 1) { @@ -57,16 +58,17 @@ int main(int argc, char** argv) output->dim[2] = buffer[16]; // height output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + params->stride_height = buffer[4]; + params->stride_width = 
buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; + params->conv_extra.fuse_zp2bias = false; input->dim_count = 4; input->layout = CSINN_LAYOUT_NCHW; @@ -96,8 +98,7 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 17); kernel->data = (float *)(buffer + 17 + in_size); @@ -107,16 +108,23 @@ int main(int argc, char** argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; +#if (DTYPE==32) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, + csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE==16) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE==8) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, + csinn_conv2d_init, csinn_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, &params, CSINN_QUANT_FLOAT32, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, &params, CSINN_QUANT_FLOAT16, - csi_conv2d_init, csi_conv2d, &difference); +#endif + + // if (params->base.api != CSINN_RVV && params->base.api != CSINN_C906 && params->base.api != CSINN_C910) { + // test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_ASYM, + // csinn_conv2d_init, csinn_conv2d, &difference); + // } - if (params.base.api != CSINN_RVV && params.base.api != CSINN_C908 &&
params.base.api != CSINN_C906 && params.base.api != CSINN_C910) { - test_conv2d_op(input, output, kernel, bias, &params, CSINN_QUANT_INT8_ASYM, - csi_conv2d_init, csi_conv2d, &difference); - } return done_testing(); } diff --git a/tests/validation_layer/convolution3d.c b/tests/validation_layer/convolution3d.c index 1f77fdc8..cba44e42 100644 --- a/tests/validation_layer/convolution3d.c +++ b/tests/validation_layer/convolution3d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" @@ -26,12 +26,15 @@ int main(int argc, char **argv) { init_testsuite("Testing function of convolution3d(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), sess); int in_size, out_size, weight_size, bias_size; if (argc == 1) { @@ -60,21 +63,21 @@ int main(int argc, char **argv) output->dim[3] = buffer[10]; // out_height output->dim[4] = buffer[11]; // out_width - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - -
params.dilation_depth = buffer[21]; - params.dilation_height = buffer[22]; - params.dilation_width = buffer[23]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->dilation_depth = buffer[21]; + params->dilation_height = buffer[22]; + params->dilation_width = buffer[23]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -105,8 +108,7 @@ int main(int argc, char **argv) weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; bias_size = output->dim[1]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 24); kernel->data = (float *)(buffer + 24 + in_size); @@ -116,9 +118,9 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_conv3d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - test_conv3d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - test_conv3d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + test_conv3d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv3d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv3d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/convolution_relu.c b/tests/validation_layer/convolution_relu.c index 8c824fa0..5833bf0a 100644 --- a/tests/validation_layer/convolution_relu.c +++ b/tests/validation_layer/convolution_relu.c @@ -16,55 +16,57 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input
data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,22 +95,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; 
out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - // test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - // test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + + test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + // test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + // test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/convolution_relu6.c b/tests/validation_layer/convolution_relu6.c index 319fdcf1..ba948914 100644 --- a/tests/validation_layer/convolution_relu6.c +++ b/tests/validation_layer/convolution_relu6.c @@ -16,55 +16,57 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu6(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] 
= buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,22 +95,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + + test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/cos.c b/tests/validation_layer/cos.c index 0b42fdfc..0366d466 100644 --- a/tests/validation_layer/cos.c +++ b/tests/validation_layer/cos.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cos(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] =
buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_cos_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_cos_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_cos_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_cos_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_cos_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_cos_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/cosh.c b/tests/validation_layer/cosh.c index 5da416d2..674e4081 100644 --- a/tests/validation_layer/cosh.c +++ b/tests/validation_layer/cosh.c @@ -16,26 +16,28 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cosh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_cosh_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_cosh_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_cosh_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_cosh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_cosh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_cosh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/cumprod.c b/tests/validation_layer/cumprod.c index 91e84914..135891bd 100644 --- a/tests/validation_layer/cumprod.c +++ b/tests/validation_layer/cumprod.c @@ -16,36 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumprod(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumprod_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_cumprod_params *params = + csinn_alloc_params(sizeof(struct csinn_cumprod_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] =
input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -59,17 +62,16 @@ int main(int argc, char** argv) output->quant_channel = 1; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_cumprod_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_cumprod_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_cumprod_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_cumprod_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_cumprod_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_cumprod_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/cumsum.c b/tests/validation_layer/cumsum.c index 35e51d46..5134c371 100644 --- a/tests/validation_layer/cumsum.c +++ b/tests/validation_layer/cumsum.c @@ -16,36 +16,39 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumsum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumsum_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_cumsum_params *params = + csinn_alloc_params(sizeof(struct csinn_cumsum_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -59,17 +62,16 @@ int main(int argc, char** argv) output->quant_channel = 1; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_cumsum_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_cumsum_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_cumsum_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_cumsum_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_cumsum_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_cumsum_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/deconvolution.c b/tests/validation_layer/deconvolution.c index 5cc696ed..c2da17a9 100644 --- a/tests/validation_layer/deconvolution.c +++ b/tests/validation_layer/deconvolution.c @@ -16,54 +16,56 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //
batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] = buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -85,25 +87,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * 
output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_deconv2d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - test_deconv2d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - test_deconv2d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + test_deconv2d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_deconv2d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_deconv2d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } - diff --git a/tests/validation_layer/deconvolution3d.c b/tests/validation_layer/deconvolution3d.c index 6620d02e..b234ff50 100644 --- a/tests/validation_layer/deconvolution3d.c +++ b/tests/validation_layer/deconvolution3d.c @@ -16,7 +16,7 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" @@ -26,12 +26,15 @@ int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution3d(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), sess); int in_size, out_size, weight_size, bias_size; if (argc == 1) { @@ -60,25 +63,25 @@ int main(int argc, char **argv) output->dim[3] = buffer[10]; // out_height output->dim[4] = buffer[11]; // out_width - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.out_pad_depth = buffer[21]; - params.out_pad_height = buffer[22]; - params.out_pad_width = buffer[23]; - - params.dilation_depth = buffer[24]; - params.dilation_height = buffer[25]; - params.dilation_width = buffer[26]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + 
params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->out_pad_depth = buffer[21]; + params->out_pad_height = buffer[22]; + params->out_pad_width = buffer[23]; + + params->dilation_depth = buffer[24]; + params->dilation_height = buffer[25]; + params->dilation_width = buffer[26]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -109,8 +112,7 @@ int main(int argc, char **argv) weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; bias_size = bias->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 27); kernel->data = (float *)(buffer + 27 + in_size); @@ -120,9 +122,9 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_deconv3d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - test_deconv3d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - test_deconv3d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + test_deconv3d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_deconv3d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_deconv3d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/depth_to_space.c b/tests/validation_layer/depth_to_space.c index 8f21bb09..6d11fe06 100644 --- a/tests/validation_layer/depth_to_space.c +++ b/tests/validation_layer/depth_to_space.c @@ -16,36 +16,39 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct depth_to_space_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_depth_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] / (params.block_size * params.block_size); - output->dim[2] = input->dim[2] * params.block_size; - output->dim[3] = input->dim[3] * params.block_size; + output->dim[1] = input->dim[1] / (params->block_size * params->block_size); + output->dim[2] = input->dim[2] * params->block_size; + output->dim[3] = input->dim[3] * params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -60,17 +63,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; 
- params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_depth_to_space_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_depth_to_space_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_depth_to_space_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_depth_to_space_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_depth_to_space_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_depth_to_space_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/depthwise_convolution.cpp b/tests/validation_layer/depthwise_convolution.cpp index 299fa5e0..a22b7c10 100644 --- a/tests/validation_layer/depthwise_convolution.cpp +++ b/tests/validation_layer/depthwise_convolution.cpp @@ -16,11 +16,10 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,12 +28,14 @@ int main(int argc, char** argv) { init_testsuite("Testing function of depthwise convolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = (csinn_conv2d_params *)csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; if (argc == 1) { @@ -61,16 +62,17 @@ int main(int argc, char** argv) output->dim[2] = buffer[15]; // height output->dim[3] = buffer[16]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = 
CSINN_LAYOUT_NCHW; + params->group = buffer[1]; + params->conv_extra.fuse_zp2bias = false; input->dim_count = 4; @@ -97,8 +99,7 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 17); kernel->data = (float *)(buffer + 17 + in_size); @@ -107,12 +108,16 @@ int main(int argc, char** argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_conv2d_op(input, output, kernel, bias, &params, CSINN_QUANT_FLOAT32, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, &params, CSINN_QUANT_FLOAT16, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, &params, CSINN_QUANT_INT8_SYM, - csi_conv2d_init, csi_conv2d, &difference); +#if (DTYPE==32) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, + csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE==16) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE==8) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, + csinn_conv2d_init, csinn_conv2d, &difference); +#endif return done_testing(); } diff --git a/tests/validation_layer/depthwise_convolution_relu.c b/tests/validation_layer/depthwise_convolution_relu.c index 033b9daa..9f96e37a 100644 --- a/tests/validation_layer/depthwise_convolution_relu.c +++ b/tests/validation_layer/depthwise_convolution_relu.c @@ -16,25 +16,27 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; @@ -42,34 +44,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - 
params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -95,22 +96,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + 
weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.90; - test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - // test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - // test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + // test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + // test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/depthwise_convolution_relu6.c b/tests/validation_layer/depthwise_convolution_relu6.c index fe108643..b38b5f2d 100644 --- a/tests/validation_layer/depthwise_convolution_relu6.c +++ b/tests/validation_layer/depthwise_convolution_relu6.c @@ -16,25 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; @@ -42,34 +44,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; 
- params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -95,22 +96,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + 
weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/depthwise_deconvolution.c b/tests/validation_layer/depthwise_deconvolution.c index 87b54af8..1159dc67 100644 --- a/tests/validation_layer/depthwise_deconvolution.c +++ b/tests/validation_layer/depthwise_deconvolution.c @@ -16,22 +16,25 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; if (argc == 1) { @@ -41,34 +44,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout 
= CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; input->layout = CSINN_LAYOUT_NCHW; @@ -91,22 +93,21 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_deconv2d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - test_deconv2d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - test_deconv2d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + test_deconv2d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_deconv2d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_deconv2d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/div.c b/tests/validation_layer/div.c index 6cff02b9..231702a2 100644 --- a/tests/validation_layer/div.c +++ b/tests/validation_layer/div.c @@ -16,29 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of div(layer).\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = 
buffer[2]; // 
width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +64,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_div_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_div_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_div_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_div_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_div_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_div_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/elu.c b/tests/validation_layer/elu.c index 80e288ee..dd198467 100644 --- a/tests/validation_layer/elu.c +++ b/tests/validation_layer/elu.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size = 0; - int out_size =0; + int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -56,17 +58,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_elu_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_elu_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_elu_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_elu_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_elu_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_elu_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/equal.c b/tests/validation_layer/equal.c index bb480fd6..45265465 100644 --- a/tests/validation_layer/equal.c +++ b/tests/validation_layer/equal.c @@ -16,21 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of equal(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1; int out_size = 1; @@ -38,7 +39,7 @@ int main(int argc, char** argv) input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for (int i = 0; i < input0->dim_count; i++ ) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; input1->dim[i] = input0->dim[i]; 
output->dim[i] = 
input0->dim[i]; @@ -59,18 +60,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_equal_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_equal_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/erf.c b/tests/validation_layer/erf.c index a5b20c4d..96205985 100644 --- a/tests/validation_layer/erf.c +++ b/tests/validation_layer/erf.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of erf(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_erf_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_erf_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_erf_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_erf_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_erf_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_erf_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/exp.c b/tests/validation_layer/exp.c index 095b88d6..0cffee83 100644 --- a/tests/validation_layer/exp.c +++ b/tests/validation_layer/exp.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of exp(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_exp_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_exp_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_exp_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_exp_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_exp_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_exp_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/expand_dims.c b/tests/validation_layer/expand_dims.c index db8f17f5..fa00c9af 100644 --- a/tests/validation_layer/expand_dims.c +++ b/tests/validation_layer/expand_dims.c @@ -16,27 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expand_dims(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct expand_dims_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_expand_dims_params *params = + csinn_alloc_params(sizeof(struct csinn_expand_dims_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); int dim_count = buffer[0]; int axis = buffer[1]; - for(int i = 0; i < dim_count; i++) { + for (int i = 0; i < dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } @@ -44,15 +47,15 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - output->dim_count = input->dim_count + 1; // axis is 0-D scalar + output->dim_count = input->dim_count + 1; // axis is 0-D scalar output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - for(int i = 0; i < output->dim_count; i++) { - if(i < axis) { + for (int i = 0; i < output->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } else if(i == axis) { + } else if (i == axis) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i - 1]; @@ -62,17 +65,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; input->data = (float *)(buffer + 2 + dim_count); reference->data = (float *)(buffer + 2 + dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_expand_dims_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_expand_dims_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_expand_dims_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_expand_dims_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_expand_dims_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_expand_dims_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/expm1.c b/tests/validation_layer/expm1.c index 89b0a01c..129aed1a 100644 --- a/tests/validation_layer/expm1.c +++ b/tests/validation_layer/expm1.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expm1(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < 
input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_expm1_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_expm1_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_expm1_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_expm1_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_expm1_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_expm1_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/flatten.c b/tests/validation_layer/flatten.c index 1718e603..3fc261f0 100644 --- a/tests/validation_layer/flatten.c +++ b/tests/validation_layer/flatten.c @@ -16,25 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct flatten_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -50,17 +53,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_flatten_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_flatten_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_flatten_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_flatten_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_flatten_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_flatten_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/floor.c b/tests/validation_layer/floor.c index c74c398c..16770749 100644 --- a/tests/validation_layer/floor.c +++ b/tests/validation_layer/floor.c @@ -16,35 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; 
output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - - input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -57,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_floor_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_floor_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_floor_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + + test_floor_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_floor_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_floor_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/floor_div.c b/tests/validation_layer/floor_div.c index d79e1f9b..db64e8c5 100644 --- a/tests/validation_layer/floor_div.c +++ b/tests/validation_layer/floor_div.c @@ -16,29 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor_divide(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer 
+ 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_floor_divide_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_floor_divide_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_floor_divide_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_floor_divide_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_floor_divide_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_floor_divide_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/floor_mod.c b/tests/validation_layer/floor_mod.c index 75ae5486..790ecb15 100644 --- a/tests/validation_layer/floor_mod.c +++ b/tests/validation_layer/floor_mod.c @@ -16,29 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor_mod(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 
4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_floor_mod_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_floor_mod_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_floor_mod_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_floor_mod_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_floor_mod_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_floor_mod_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/fullyconnected.cpp b/tests/validation_layer/fullyconnected.cpp index 7f621f17..b2f76795 100644 --- a/tests/validation_layer/fullyconnected.cpp +++ b/tests/validation_layer/fullyconnected.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,12 +28,14 @@ int main(int argc, char** argv) { init_testsuite("Testing function of fullyconnected(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *weight = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct fc_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *weight = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_fc_params *params = (csinn_fc_params *)csinn_alloc_params(sizeof(struct csinn_fc_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); @@ -61,7 +62,7 @@ int main(int argc, char** argv) weight->is_const = 1; weight->quant_channel = 1; - bias->dtype = CSINN_DTYPE_FLOAT32; + bias->dtype = CSINN_DTYPE_FLOAT32; bias->layout = CSINN_LAYOUT_O; bias->is_const = 1; bias->quant_channel = 1; @@ -70,8 +71,7 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NC; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 3); weight->data = (float *)(buffer + 3 + in_size0); @@ -81,19 +81,19 @@ int main(int argc, char** argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if THEAD_RVV - test_conv2d_op(input, output, weight, bias, &params, CSINN_QUANT_FLOAT32, csi_fullyconnected_init, - csi_nn_rvv_fullyconnected_packn_fp32, &difference); - test_conv2d_op(input, output, weight, bias, &params, CSINN_QUANT_FLOAT16, csi_fullyconnected_init, - csi_nn_rvv_fullyconnected_packn_fp16, &difference); - test_conv2d_op(input, output, weight, bias, &params, CSINN_QUANT_INT8_SYM, csi_fullyconnected_init, - csi_nn_rvv_fullyconnected_packn_int8, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT32, csinn_fullyconnected_init, + shl_rvv_fullyconnected_packn_fp32, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT16, csinn_fullyconnected_init, + shl_rvv_fullyconnected_packn_fp16, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_INT8_SYM, csinn_fullyconnected_init, + shl_rvv_fullyconnected_packn_int8, &difference); #else - test_conv2d_op(input, output, weight, bias, &params, CSINN_QUANT_FLOAT32, - csi_fullyconnected_init, csi_fullyconnected, &difference); - test_conv2d_op(input, output, weight, bias, &params, CSINN_QUANT_FLOAT16, - csi_fullyconnected_init, csi_fullyconnected, &difference); - test_conv2d_op(input, output, weight, bias, &params, CSINN_QUANT_INT8_SYM, - csi_fullyconnected_init, csi_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT32, + csinn_fullyconnected_init, csinn_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT16, + csinn_fullyconnected_init, csinn_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_INT8_SYM, + csinn_fullyconnected_init, csinn_fullyconnected, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/gather.c b/tests/validation_layer/gather.c index 4d0bef15..18e64e85 100644 --- a/tests/validation_layer/gather.c +++ b/tests/validation_layer/gather.c @@ -16,33 
+16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather(layer).\n"); - - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *indices = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_gather_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_params), sess); int in_size = 1, indices_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); int axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } indices->dim_count = buffer[2 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[3 + input->dim_count + i]; indices_size *= indices->dim[i]; } @@ -77,19 +79,19 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; + params->base.api = CSINN_API; + params->axis = axis; - input->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count); - indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); - 
reference->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); - output->data = reference->data; + input->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count); + indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); + reference->data = + (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_gather_CSINN_QUANT_FLOAT32(input, indices, output, &params, &difference); - test_gather_CSINN_QUANT_UINT8_ASYM(input, indices, output, &params, &difference); - test_gather_CSINN_QUANT_INT8_SYM(input, indices, output, &params, &difference); - + test_gather_CSINN_QUANT_FLOAT32(input, indices, output, params, &difference); + test_gather_CSINN_QUANT_UINT8_ASYM(input, indices, output, params, &difference); + test_gather_CSINN_QUANT_INT8_SYM(input, indices, output, params, &difference); + return done_testing(); } diff --git a/tests/validation_layer/gather_nd.c b/tests/validation_layer/gather_nd.c index 27db94d7..1b80a550 100644 --- a/tests/validation_layer/gather_nd.c +++ b/tests/validation_layer/gather_nd.c @@ -16,35 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather_nd(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_nd_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *indices = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_gather_nd_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_nd_params), sess); int in_size = 1, out_size = 1, indices_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 0; // init output->dim_count = 0 - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } indices->dim_count = buffer[1 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[i + 2 + input->dim_count]; indices_size *= indices->dim[i]; - if(i < indices->dim_count - 1) { + if (i < indices->dim_count - 1) { output->dim_count++; output->dim[i] = indices->dim[i]; } @@ -56,7 +59,7 @@ int main(int argc, char** argv) indices_outer_size = indices_size / indices->dim[indices->dim_count - 1]; int input_inner_size = 1; - for(int i = axis; i < input->dim_count; i++) { + for (int i = axis; i < input->dim_count; i++) { input_inner_size *= input->dim[i]; output->dim[output->dim_count] = input->dim[i]; 
output->dim_count++; @@ -75,18 +78,18 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); - input->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); - reference->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); - output->data = reference->data; + indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); + reference->data = + (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_gather_nd_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_gather_nd_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_gather_nd_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_gather_nd_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_gather_nd_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_gather_nd_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/global_avgpool.cpp b/tests/validation_layer/global_avgpool.cpp index b51f4cb6..e1140b1f 100644 --- a/tests/validation_layer/global_avgpool.cpp +++ b/tests/validation_layer/global_avgpool.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = (csinn_pool_params *)csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 0; int out_size = 0; @@ -62,27 +63,23 @@ int main(int argc, char **argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; -#if THEAD_RVV - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_global_avgpool2d_init, - csi_nn_rvv_global_avgpool2d_fp32, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT16, csi_global_avgpool2d_init, - csi_nn_rvv_global_avgpool2d_fp16, &difference); -#else - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_global_avgpool2d_init, - csi_global_avgpool2d, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_UINT8_ASYM, csi_global_avgpool2d_init, - csi_global_avgpool2d, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_global_avgpool2d_init, - csi_global_avgpool2d, &difference); +#if (DTYPE==32) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_avgpool2d_init, + csinn_global_avgpool2d, &difference); +#elif (DTYPE==16) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_avgpool2d_init, + csinn_global_avgpool2d, &difference); +#elif (DTYPE==8) + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_global_avgpool2d_init, + csinn_global_avgpool2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/global_maxpool.cpp b/tests/validation_layer/global_maxpool.cpp index 193463dc..7d813c94 100644 --- a/tests/validation_layer/global_maxpool.cpp +++ b/tests/validation_layer/global_maxpool.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = (csinn_pool_params *)csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 0; int out_size = 0; @@ -62,27 +63,23 @@ int main(int argc, char **argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; -#if THEAD_RVV - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_global_maxpool2d_init, - csi_nn_rvv_global_maxpool2d_fp32, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT16, csi_global_maxpool2d_init, - csi_nn_rvv_global_maxpool2d_fp16, &difference); -#else - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_global_maxpool2d_init, - csi_global_maxpool2d, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_UINT8_ASYM, csi_global_maxpool2d_init, - csi_global_maxpool2d, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_global_maxpool2d_init, - csi_global_maxpool2d, &difference); +#if (DTYPE==32) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_maxpool2d_init, + csinn_global_maxpool2d, &difference); +#elif (DTYPE==16) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_maxpool2d_init, + csinn_global_maxpool2d, &difference); +#elif (DTYPE==8) + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_global_maxpool2d_init, + csinn_global_maxpool2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/greater.c b/tests/validation_layer/greater.c index e1cbbac9..46baee1d 100644 --- a/tests/validation_layer/greater.c +++ b/tests/validation_layer/greater.c @@ -16,29 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 
+ in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_greater_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_greater_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_greater_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_greater_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_greater_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_greater_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/greater_equal.c b/tests/validation_layer/greater_equal.c index 15abf322..37249c9f 100644 --- a/tests/validation_layer/greater_equal.c +++ b/tests/validation_layer/greater_equal.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater_equal(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = 
read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_greater_equal_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_greater_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_greater_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_greater_equal_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_greater_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_greater_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/group_convolution.cpp b/tests/validation_layer/group_convolution.cpp index 270cb106..a3daafd7 100644 --- a/tests/validation_layer/group_convolution.cpp +++ b/tests/validation_layer/group_convolution.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,12 +28,14 @@ int main(int argc, char** argv) { init_testsuite("Testing function of group convolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = csinn_alloc_params(sizeof(struct
csinn_conv2d_params), sess); int in_size, out_size, weight_size; @@ -61,17 +62,17 @@ int main(int argc, char** argv) output->dim[1] = buffer[12]; // out_channel output->dim[2] = buffer[16]; // height output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -100,8 +101,7 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 18); kernel->data = (float *)(buffer + 18 + in_size); @@ -109,13 +109,13 @@ int main(int argc, char** argv) reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_conv2d_op(input, output, kernel, bias, &params, CSINN_QUANT_FLOAT32, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, &params, CSINN_QUANT_FLOAT16, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, &params, CSINN_QUANT_INT8_SYM, - csi_conv2d_init, csi_conv2d, &difference); + + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, + csinn_conv2d_init, csinn_conv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + csinn_conv2d_init, csinn_conv2d, &difference); + // test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_ASYM, + // csinn_conv2d_init, csinn_conv2d, &difference); return done_testing(); } diff --git a/tests/validation_layer/group_convolution_relu.c b/tests/validation_layer/group_convolution_relu.c index af5973b7..aaf603ae 100644 --- a/tests/validation_layer/group_convolution_relu.c +++ b/tests/validation_layer/group_convolution_relu.c @@ -16,56 +16,58 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - 
params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -91,22 +93,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 18); - kernel->data = (float *)(buffer + 18 + in_size); - bias->data = (float *)(buffer + 18 + in_size + weight_size); - reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + input->data = (float *)(buffer + 18); + kernel->data = (float *)(buffer + 18 + 
in_size); + bias->data = (float *)(buffer + 18 + in_size + weight_size); + reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + + test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/group_convolution_relu6.c b/tests/validation_layer/group_convolution_relu6.c index eaa345f2..a5a4e8c5 100644 --- a/tests/validation_layer/group_convolution_relu6.c +++ b/tests/validation_layer/group_convolution_relu6.c @@ -16,23 +16,25 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; if (argc == 1) { @@ -41,31 +43,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - 
params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -91,22 +93,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + input->data = (float *)(buffer + 18); + kernel->data = (float *)(buffer + 18 + in_size); + bias->data = (float *)(buffer + 18 + in_size + weight_size); + reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + output->data = reference->data; + float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - input->data = (float *)(buffer + 18); - kernel->data = (float *)(buffer + 18 + in_size); - bias->data = (float *)(buffer + 18 + in_size + weight_size); - reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); - output->data = reference->data; - float difference = argc > 2 ? atof(argv[2]):0.99; - - test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, &params, &difference); - test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, &params, &difference); - test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, &params, &difference); + test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/hard_sigmoid.c b/tests/validation_layer/hard_sigmoid.c index c2062a38..40f388c0 100644 --- a/tests/validation_layer/hard_sigmoid.c +++ b/tests/validation_layer/hard_sigmoid.c @@ -16,26 +16,29 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of hard_sigmoid(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +53,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_hard_sigmoid_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_hard_sigmoid_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_hard_sigmoid_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_hard_sigmoid_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_hard_sigmoid_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_hard_sigmoid_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/im2col.c b/tests/validation_layer/im2col.c index 2cb65ca0..8f2278f3 100644 --- a/tests/validation_layer/im2col.c +++ b/tests/validation_layer/im2col.c @@ -16,51 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of im2col(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct im2col_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_im2col_params *params = + csinn_alloc_params(sizeof(struct csinn_im2col_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width
input->dim_count = 4; - params.kernel_h = buffer[4]; - params.kernel_w = buffer[5]; - params.stride_h = buffer[6]; - params.stride_w = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; + params->kernel_h = buffer[4]; + params->kernel_w = buffer[5]; + params->stride_h = buffer[6]; + params->stride_w = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { in_size *= input->dim[i]; } - int out_h = (input->dim[2] + params.pad_top + params.pad_down - params.kernel_h) / params.stride_h + 1; - int out_w = (input->dim[3] + params.pad_left + params.pad_right - params.kernel_w) / params.stride_w + 1; + int out_h = + (input->dim[2] + params->pad_top + params->pad_down - params->kernel_h) / params->stride_h + + 1; + int out_w = (input->dim[3] + params->pad_left + params->pad_right - params->kernel_w) / + params->stride_w + + 1; - output->dim[0] = input->dim[1] * params.kernel_h * params.kernel_w; + output->dim[0] = input->dim[1] * params->kernel_h * params->kernel_w; output->dim[1] = input->dim[0] * out_h * out_w; output->dim_count = 2; - out_size = input->dim[0] * input->dim[1] * params.kernel_h * params.kernel_w * out_h * out_w; + out_size = input->dim[0] * input->dim[1] * params->kernel_h * params->kernel_w * out_h * out_w; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -69,19 +76,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 12); + input->data = (float *)(buffer + 12); 
reference->data = (float *)(buffer + 12 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_im2col_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_im2col_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_im2col_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_im2col_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_im2col_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_im2col_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } - diff --git a/tests/validation_layer/l2_norm.c b/tests/validation_layer/l2_norm.c index b4cb46b3..e731961a 100644 --- a/tests/validation_layer/l2_norm.c +++ b/tests/validation_layer/l2_norm.c @@ -16,30 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct l2n_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), sess); int size = 1; int *buffer = read_input_data_f32(argv[1]); /* get the dim para */ output->dim_count = input->dim_count = buffer[0]; - params.epsilon = *(float *)&buffer[1]; + params->epsilon = *(float *)&buffer[1]; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; - +
params->axis = axis; + params->n = 1; + for (int i = 0; i < input->dim_count; ++i) { output->dim[i] = input->dim[i] = buffer[2 + i]; } @@ -56,18 +58,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - //params.epsilon = *(float *)&buffer[1 + input->dim_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + // params->epsilon = *(float *)&buffer[1 + input->dim_count]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count); reference->data = (float *)(buffer + 2 + input->dim_count + size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_l2_normalization_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_l2_normalization_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_l2_normalization_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_l2_normalization_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_l2_normalization_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_l2_normalization_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/layer/common.c b/tests/validation_layer/layer/common.c index cabd0c37..0c425743 100644 --- a/tests/validation_layer/layer/common.c +++ b/tests/validation_layer/layer/common.c @@ -16,7 +16,7 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "common.h" @@ -27,201 +27,204 @@ #include "math_snr.h" #include "test_utils.h" -#define LAYER_TEST_DISO(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input0, struct csi_tensor *input1, \ - struct csi_tensor *output, struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ - struct csi_tensor *qinput1 = convert_f32_layer(input1, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput0, qinput1, qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput0, qinput1, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input0->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_DISO(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input0, struct csinn_tensor *input1, \ + struct csinn_tensor *output, struct SPARAMS *params, \ + float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ + struct csinn_tensor *qinput1 = convert_f32_layer(input1, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput0, qinput1, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput0, qinput1, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input0->data, *difference, \ + csinn_tensor_size(output), false); \ + 
shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_SEGMENT(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input0, struct csi_tensor *segment, \ - struct csi_tensor *output, struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput0, segment, qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput0, segment, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input0->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_SEGMENT(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input0, struct csinn_tensor *segment, \ + struct csinn_tensor *output, struct SPARAMS *params, \ + float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput0, segment, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput0, segment, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input0->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_SISO(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor *output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum 
csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput, qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_SISO(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor *output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_CONCAT(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor **input, struct csi_tensor *output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput[params->inputs_count]; \ - for (int i = 0; i < params->inputs_count; i++) { \ - qinput[i] = convert_f32_layer(input[i], test_dtype, test_api); \ - } \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init((struct csi_tensor **)qinput, qoutput, params) == CSINN_TRUE) { \ - 
csi_##OP((struct csi_tensor **)qinput, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input[0]->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_CONCAT(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor **input, struct csinn_tensor *output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput[params->inputs_count]; \ + for (int i = 0; i < params->inputs_count; i++) { \ + qinput[i] = convert_f32_layer(input[i], test_dtype, test_api); \ + } \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init((struct csinn_tensor **)qinput, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP((struct csinn_tensor **)qinput, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input[0]->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_SPLIT(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor **output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qoutput[params->output_num]; \ - int num = params->output_num; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - for (int i = 0; i < num; i++) { \ - qoutput[i] = convert_f32_layer(output[i], test_dtype, test_api); \ - } \ - if (csi_##OP##_init(qinput, (struct csi_tensor **)qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput, (struct csi_tensor **)qoutput, params); \ - } \ - for (int i = 0; i < 
num; i++) { \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput[i]); \ - result_verify_f32(output[i]->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output[i]), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ - } \ +#define LAYER_TEST_SPLIT(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor **output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qoutput[params->output_num]; \ + int num = params->output_num; \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + for (int i = 0; i < num; i++) { \ + qoutput[i] = convert_f32_layer(output[i], test_dtype, test_api); \ + } \ + if (csinn_##OP##_init(qinput, (struct csinn_tensor **)qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput, (struct csinn_tensor **)qoutput, params); \ + } \ + for (int i = 0; i < num; i++) { \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput[i]); \ + result_verify_f32(output[i]->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output[i]), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ + } \ + } + +#define LAYER_TEST_UNSTACK(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor **output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qoutput[params->outputs_count]; \ + int num = params->outputs_count; \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + for (int i = 0; i < num; i++) { \ + qoutput[i] = convert_f32_layer(output[i], test_dtype, test_api); \ + } \ + if (csinn_##OP##_init(qinput, (struct csinn_tensor **)qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput, (struct csinn_tensor 
**)qoutput, params); \ + } \ + for (int i = 0; i < num; i++) { \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput[i]); \ + result_verify_f32(output[i]->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output[i]), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ + } \ } -#define LAYER_TEST_UNSTACK(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor **output, \ +#define LAYER_TEST_CONV2D(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor *output, \ + struct csinn_tensor *kernel, struct csinn_tensor *bias, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + struct csinn_tensor *qkernel = convert_f32_layer(kernel, test_dtype, test_api); \ + struct csinn_tensor *qbias = convert_f32_layer(bias, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { \ + csinn_##OP(qinput, qoutput, qkernel, qbias, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ + } + +#define LAYER_TEST_BATCHNORM(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor *mean, \ + struct csinn_tensor *variance, struct csinn_tensor *gamma, \ + struct csinn_tensor *beta, struct csinn_tensor *output, \ struct SPARAMS *params, float *difference) \ { \ enum csinn_dtype_enum test_dtype = STYPE; \ enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qoutput[params->outputs_count]; \ - int num = 
params->outputs_count; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - for (int i = 0; i < num; i++) { \ - qoutput[i] = convert_f32_layer(output[i], test_dtype, test_api); \ - } \ - if (csi_##OP##_init(qinput, (struct csi_tensor **)qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput, (struct csi_tensor **)qoutput, params); \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + struct csinn_tensor *qmean = convert_f32_layer(mean, test_dtype, test_api); \ + struct csinn_tensor *qvariance = convert_f32_layer(variance, test_dtype, test_api); \ + struct csinn_tensor *qgamma = convert_f32_layer(gamma, test_dtype, test_api); \ + struct csinn_tensor *qbeta = convert_f32_layer(beta, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput, qmean, qvariance, qgamma, qbeta, qoutput, params) == \ + CSINN_TRUE) { \ + csinn_##OP(qinput, qmean, qvariance, qgamma, qbeta, qoutput, params); \ } \ - for (int i = 0; i < num; i++) { \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput[i]); \ - result_verify_f32(output[i]->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output[i]), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ - } \ - } - -#define LAYER_TEST_CONV2D(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor *output, \ - struct csi_tensor *kernel, struct csi_tensor *bias, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - struct csi_tensor *qkernel = convert_f32_layer(kernel, test_dtype, test_api); \ - struct csi_tensor *qbias = convert_f32_layer(bias, test_dtype, test_api); \ - 
if (csi_##OP##_init(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { \ - csi_##OP(qinput, qoutput, qkernel, qbias, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ - } - -#define LAYER_TEST_BATCHNORM(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor *mean, \ - struct csi_tensor *variance, struct csi_tensor *gamma, \ - struct csi_tensor *beta, struct csi_tensor *output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - struct csi_tensor *qmean = convert_f32_layer(mean, test_dtype, test_api); \ - struct csi_tensor *qvariance = convert_f32_layer(variance, test_dtype, test_api); \ - struct csi_tensor *qgamma = convert_f32_layer(gamma, test_dtype, test_api); \ - struct csi_tensor *qbeta = convert_f32_layer(beta, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput, qmean, qvariance, qgamma, qbeta, qoutput, params) == \ - CSINN_TRUE) { \ - csi_##OP(qinput, qmean, qvariance, qgamma, qbeta, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_TISO(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct 
csi_tensor *input0, struct csi_tensor *input1, \ - struct csi_tensor *input2, struct csi_tensor *output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ - struct csi_tensor *qinput1 = convert_f32_layer(input1, test_dtype, test_api); \ - struct csi_tensor *qinput2 = convert_f32_layer(input2, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput0, qinput1, qinput2, qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput0, qinput1, qinput2, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input1->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_TISO(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input0, struct csinn_tensor *input1, \ + struct csinn_tensor *input2, struct csinn_tensor *output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ + struct csinn_tensor *qinput1 = convert_f32_layer(input1, test_dtype, test_api); \ + struct csinn_tensor *qinput2 = convert_f32_layer(input2, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput0, qinput1, qinput2, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput0, qinput1, qinput2, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input1->data, *difference, \ + csinn_tensor_size(output), false); \ 
+ shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_ARANGE(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *output, struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, output->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_ARANGE(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *output, struct SPARAMS *params, \ + float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, output->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } LAYER_QUANT_TEST_DISO(LAYER_TEST_DISO) diff --git a/tests/validation_layer/layer/common.h b/tests/validation_layer/layer/common.h index 94fb381f..a4f2ea9f 100644 --- a/tests/validation_layer/layer/common.h +++ b/tests/validation_layer/layer/common.h @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include @@ -24,421 +24,421 @@ #include "math_snr.h" #include "test_utils.h" -#define LAYER_QUANT_TEST_SISO(MACRO) \ - MACRO(abs, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(abs, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(abs, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(acos, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(acos, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(acos, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(acosh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(acosh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(acosh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(asin, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(asin, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(asin, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(asinh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(asinh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(asinh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(atan, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(atan, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(atan, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(atanh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(atanh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(atanh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(ceil, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(ceil, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(ceil, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(cos, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(cos, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(cos, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(cosh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(cosh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(cosh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(erf, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(erf, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(erf, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(exp, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(exp, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(exp, 
CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(expm1, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(expm1, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(expm1, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(floor, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(floor, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(floor, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(log, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(log, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(log, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(log1p, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(log1p, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(log1p, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(logical_not, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(logical_not, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(logical_not, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(round, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(round, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(round, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(rsqrt, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(rsqrt, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(rsqrt, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(sign, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(sign, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(sign, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(negative, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(negative, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(negative, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(sin, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(sin, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(sin, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(sinh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(sinh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(sinh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(softplus, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(softplus, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(softplus, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(softsign, CSINN_QUANT_FLOAT32, siso_params) \ - 
MACRO(softsign, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(softsign, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(sqrt, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(sqrt, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(sqrt, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(square, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(square, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(square, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(tan, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(tan, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(tan, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(tanh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(tanh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(tanh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(trunc, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(trunc, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(trunc, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(yuv_rgb_scale, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(yuv_rgb_scale, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(yuv_rgb_scale, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(not, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(not, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(not, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(avgpool2d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(avgpool2d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(avgpool2d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(avgpool3d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(avgpool3d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(avgpool3d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(clip, CSINN_QUANT_FLOAT32, clip_params) \ - MACRO(clip, CSINN_QUANT_UINT8_ASYM, clip_params) \ - MACRO(clip, CSINN_QUANT_INT8_SYM, clip_params) \ - MACRO(batch_to_space, CSINN_QUANT_FLOAT32, batch_to_space_params) \ - MACRO(batch_to_space, CSINN_QUANT_UINT8_ASYM, batch_to_space_params) \ - MACRO(batch_to_space, CSINN_QUANT_INT8_SYM, batch_to_space_params) \ - MACRO(cumprod, CSINN_QUANT_FLOAT32, cumprod_params) \ - MACRO(cumprod, 
CSINN_QUANT_UINT8_ASYM, cumprod_params) \ - MACRO(cumprod, CSINN_QUANT_INT8_SYM, cumprod_params) \ - MACRO(cumsum, CSINN_QUANT_FLOAT32, cumsum_params) \ - MACRO(cumsum, CSINN_QUANT_UINT8_ASYM, cumsum_params) \ - MACRO(cumsum, CSINN_QUANT_INT8_SYM, cumsum_params) \ - MACRO(depth_to_space, CSINN_QUANT_FLOAT32, depth_to_space_params) \ - MACRO(depth_to_space, CSINN_QUANT_UINT8_ASYM, depth_to_space_params) \ - MACRO(depth_to_space, CSINN_QUANT_INT8_SYM, depth_to_space_params) \ - MACRO(elu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(elu, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(elu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(expand_dims, CSINN_QUANT_FLOAT32, expand_dims_params) \ - MACRO(expand_dims, CSINN_QUANT_UINT8_ASYM, expand_dims_params) \ - MACRO(expand_dims, CSINN_QUANT_INT8_SYM, expand_dims_params) \ - MACRO(flatten, CSINN_QUANT_FLOAT32, flatten_params) \ - MACRO(flatten, CSINN_QUANT_UINT8_ASYM, flatten_params) \ - MACRO(flatten, CSINN_QUANT_INT8_SYM, flatten_params) \ - MACRO(global_avgpool2d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(global_avgpool2d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(global_avgpool2d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(global_maxpool2d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(global_maxpool2d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(global_maxpool2d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(hard_sigmoid, CSINN_QUANT_FLOAT32, sigmoid_params) \ - MACRO(hard_sigmoid, CSINN_QUANT_UINT8_ASYM, sigmoid_params) \ - MACRO(hard_sigmoid, CSINN_QUANT_INT8_SYM, sigmoid_params) \ - MACRO(im2col, CSINN_QUANT_FLOAT32, im2col_params) \ - MACRO(im2col, CSINN_QUANT_UINT8_ASYM, im2col_params) \ - MACRO(im2col, CSINN_QUANT_INT8_SYM, im2col_params) \ - MACRO(l2_normalization, CSINN_QUANT_FLOAT32, l2n_params) \ - MACRO(l2_normalization, CSINN_QUANT_UINT8_ASYM, l2n_params) \ - MACRO(l2_normalization, CSINN_QUANT_INT8_SYM, l2n_params) \ - MACRO(leaky_relu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(leaky_relu, 
CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(leaky_relu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(log_softmax, CSINN_QUANT_FLOAT32, softmax_params) \ - MACRO(log_softmax, CSINN_QUANT_UINT8_ASYM, softmax_params) \ - MACRO(log_softmax, CSINN_QUANT_INT8_SYM, softmax_params) \ - MACRO(lrn, CSINN_QUANT_FLOAT32, lrn_params) \ - MACRO(lrn, CSINN_QUANT_UINT8_ASYM, lrn_params) \ - MACRO(lrn, CSINN_QUANT_INT8_SYM, lrn_params) \ - MACRO(max, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(max, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(max, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(maxpool2d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(maxpool2d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(maxpool2d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(maxpool3d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(maxpool3d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(maxpool3d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(mean, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(mean, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(mean, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(min, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(min, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(min, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(pad, CSINN_QUANT_FLOAT32, pad_params) \ - MACRO(pad, CSINN_QUANT_UINT8_ASYM, pad_params) \ - MACRO(pad, CSINN_QUANT_INT8_SYM, pad_params) \ - MACRO(prod, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(prod, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(prod, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_logsumexp, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_logsumexp, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_logsumexp, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_max, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_max, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_max, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_mean, CSINN_QUANT_FLOAT32, reduce_params) \ - 
MACRO(reduce_mean, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_mean, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_min, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_min, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_min, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_prod, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_prod, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_prod, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_sum, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_sum, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_sum, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(relu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(relu, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(relu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(relu1, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(relu1, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(relu1, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(relu6, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(relu6, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(relu6, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(relun, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(relun, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(relun, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(reshape, CSINN_QUANT_FLOAT32, reshape_params) \ - MACRO(reshape, CSINN_QUANT_UINT8_ASYM, reshape_params) \ - MACRO(reshape, CSINN_QUANT_INT8_SYM, reshape_params) \ - MACRO(resize, CSINN_QUANT_FLOAT32, resize_params) \ - MACRO(resize, CSINN_QUANT_UINT8_ASYM, resize_params) \ - MACRO(resize, CSINN_QUANT_INT8_SYM, resize_params) \ - MACRO(reverse, CSINN_QUANT_FLOAT32, reverse_params) \ - MACRO(reverse, CSINN_QUANT_UINT8_ASYM, reverse_params) \ - MACRO(reverse, CSINN_QUANT_INT8_SYM, reverse_params) \ - MACRO(shuffle_channel, CSINN_QUANT_FLOAT32, shuffle_channel_params) \ - MACRO(shuffle_channel, CSINN_QUANT_UINT8_ASYM, shuffle_channel_params) \ - MACRO(shuffle_channel, CSINN_QUANT_INT8_SYM, 
shuffle_channel_params) \ - MACRO(sigmoid, CSINN_QUANT_FLOAT32, sigmoid_params) \ - MACRO(sigmoid, CSINN_QUANT_UINT8_ASYM, sigmoid_params) \ - MACRO(sigmoid, CSINN_QUANT_INT8_SYM, sigmoid_params) \ - MACRO(slice, CSINN_QUANT_FLOAT32, slice_params) \ - MACRO(slice, CSINN_QUANT_UINT8_ASYM, slice_params) \ - MACRO(slice, CSINN_QUANT_INT8_SYM, slice_params) \ - MACRO(softmax, CSINN_QUANT_FLOAT32, softmax_params) \ - MACRO(softmax, CSINN_QUANT_UINT8_ASYM, softmax_params) \ - MACRO(softmax, CSINN_QUANT_INT8_SYM, softmax_params) \ - MACRO(softrelu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(softrelu, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(softrelu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(space_to_batch, CSINN_QUANT_FLOAT32, space_to_batch_params) \ - MACRO(space_to_batch, CSINN_QUANT_UINT8_ASYM, space_to_batch_params) \ - MACRO(space_to_batch, CSINN_QUANT_INT8_SYM, space_to_batch_params) \ - MACRO(space_to_depth, CSINN_QUANT_FLOAT32, space_to_depth_params) \ - MACRO(space_to_depth, CSINN_QUANT_UINT8_ASYM, space_to_depth_params) \ - MACRO(space_to_depth, CSINN_QUANT_INT8_SYM, space_to_depth_params) \ - MACRO(squeeze, CSINN_QUANT_FLOAT32, squeeze_params) \ - MACRO(squeeze, CSINN_QUANT_UINT8_ASYM, squeeze_params) \ - MACRO(squeeze, CSINN_QUANT_INT8_SYM, squeeze_params) \ - MACRO(strided_slice, CSINN_QUANT_FLOAT32, strided_slice_params) \ - MACRO(strided_slice, CSINN_QUANT_UINT8_ASYM, strided_slice_params) \ - MACRO(strided_slice, CSINN_QUANT_INT8_SYM, strided_slice_params) \ - MACRO(sum, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(sum, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(sum, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(threshold_relu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(threshold_relu, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(threshold_relu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(tile, CSINN_QUANT_FLOAT32, tile_params) \ - MACRO(tile, CSINN_QUANT_UINT8_ASYM, tile_params) \ - MACRO(tile, CSINN_QUANT_INT8_SYM, tile_params) \ 
- MACRO(transpose, CSINN_QUANT_FLOAT32, transpose_params) \ - MACRO(transpose, CSINN_QUANT_UINT8_ASYM, transpose_params) \ - MACRO(transpose, CSINN_QUANT_INT8_SYM, transpose_params) \ - MACRO(argmax, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(argmax, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(argmax, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(argmin, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(argmin, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(argmin, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(broadcast_to, CSINN_QUANT_FLOAT32, broadcast_to_params) \ - MACRO(broadcast_to, CSINN_QUANT_UINT8_ASYM, broadcast_to_params) \ - MACRO(broadcast_to, CSINN_QUANT_INT8_SYM, broadcast_to_params) +#define LAYER_QUANT_TEST_SISO(MACRO) \ + MACRO(abs, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(abs, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(abs, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(acos, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(acos, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(acos, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(acosh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(acosh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(acosh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(asin, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(asin, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(asin, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(asinh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(asinh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(asinh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(atan, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(atan, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(atan, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(atanh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(atanh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(atanh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(ceil, 
CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(ceil, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(ceil, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(cos, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(cos, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(cos, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(cosh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(cosh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(cosh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(erf, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(erf, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(erf, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(exp, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(exp, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(exp, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(expm1, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(expm1, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(expm1, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(floor, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(floor, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(floor, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(log, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(log, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(log, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(log1p, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(log1p, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(log1p, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(logical_not, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(logical_not, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(logical_not, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(round, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(round, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(round, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(rsqrt, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(rsqrt, CSINN_QUANT_UINT8_ASYM, 
csinn_siso_params) \ + MACRO(rsqrt, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(sign, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(sign, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(sign, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(negative, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(negative, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(negative, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(sin, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(sin, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(sin, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(sinh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(sinh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(sinh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(softplus, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(softplus, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(softplus, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(softsign, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(softsign, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(softsign, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(sqrt, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(sqrt, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(sqrt, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(square, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(square, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(square, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(tan, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(tan, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(tan, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(tanh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(tanh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(tanh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(trunc, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(trunc, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(trunc, CSINN_QUANT_INT8_SYM, 
csinn_siso_params) \ + MACRO(yuv_rgb_scale, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(yuv_rgb_scale, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(yuv_rgb_scale, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(not, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(not, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(not, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(avgpool2d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(avgpool2d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(avgpool2d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(avgpool3d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(avgpool3d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(avgpool3d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(clip, CSINN_QUANT_FLOAT32, csinn_clip_params) \ + MACRO(clip, CSINN_QUANT_UINT8_ASYM, csinn_clip_params) \ + MACRO(clip, CSINN_QUANT_INT8_SYM, csinn_clip_params) \ + MACRO(batch_to_space, CSINN_QUANT_FLOAT32, csinn_batch_to_space_params) \ + MACRO(batch_to_space, CSINN_QUANT_UINT8_ASYM, csinn_batch_to_space_params) \ + MACRO(batch_to_space, CSINN_QUANT_INT8_SYM, csinn_batch_to_space_params) \ + MACRO(cumprod, CSINN_QUANT_FLOAT32, csinn_cumprod_params) \ + MACRO(cumprod, CSINN_QUANT_UINT8_ASYM, csinn_cumprod_params) \ + MACRO(cumprod, CSINN_QUANT_INT8_SYM, csinn_cumprod_params) \ + MACRO(cumsum, CSINN_QUANT_FLOAT32, csinn_cumsum_params) \ + MACRO(cumsum, CSINN_QUANT_UINT8_ASYM, csinn_cumsum_params) \ + MACRO(cumsum, CSINN_QUANT_INT8_SYM, csinn_cumsum_params) \ + MACRO(depth_to_space, CSINN_QUANT_FLOAT32, csinn_depth_to_space_params) \ + MACRO(depth_to_space, CSINN_QUANT_UINT8_ASYM, csinn_depth_to_space_params) \ + MACRO(depth_to_space, CSINN_QUANT_INT8_SYM, csinn_depth_to_space_params) \ + MACRO(elu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(elu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(elu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(expand_dims, CSINN_QUANT_FLOAT32, 
csinn_expand_dims_params) \ + MACRO(expand_dims, CSINN_QUANT_UINT8_ASYM, csinn_expand_dims_params) \ + MACRO(expand_dims, CSINN_QUANT_INT8_SYM, csinn_expand_dims_params) \ + MACRO(flatten, CSINN_QUANT_FLOAT32, csinn_flatten_params) \ + MACRO(flatten, CSINN_QUANT_UINT8_ASYM, csinn_flatten_params) \ + MACRO(flatten, CSINN_QUANT_INT8_SYM, csinn_flatten_params) \ + MACRO(global_avgpool2d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(global_avgpool2d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(global_avgpool2d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(global_maxpool2d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(global_maxpool2d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(global_maxpool2d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(hard_sigmoid, CSINN_QUANT_FLOAT32, csinn_sigmoid_params) \ + MACRO(hard_sigmoid, CSINN_QUANT_UINT8_ASYM, csinn_sigmoid_params) \ + MACRO(hard_sigmoid, CSINN_QUANT_INT8_SYM, csinn_sigmoid_params) \ + MACRO(im2col, CSINN_QUANT_FLOAT32, csinn_im2col_params) \ + MACRO(im2col, CSINN_QUANT_UINT8_ASYM, csinn_im2col_params) \ + MACRO(im2col, CSINN_QUANT_INT8_SYM, csinn_im2col_params) \ + MACRO(l2_normalization, CSINN_QUANT_FLOAT32, csinn_l2n_params) \ + MACRO(l2_normalization, CSINN_QUANT_UINT8_ASYM, csinn_l2n_params) \ + MACRO(l2_normalization, CSINN_QUANT_INT8_SYM, csinn_l2n_params) \ + MACRO(leaky_relu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(leaky_relu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(leaky_relu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(log_softmax, CSINN_QUANT_FLOAT32, csinn_softmax_params) \ + MACRO(log_softmax, CSINN_QUANT_UINT8_ASYM, csinn_softmax_params) \ + MACRO(log_softmax, CSINN_QUANT_INT8_SYM, csinn_softmax_params) \ + MACRO(lrn, CSINN_QUANT_FLOAT32, csinn_lrn_params) \ + MACRO(lrn, CSINN_QUANT_UINT8_ASYM, csinn_lrn_params) \ + MACRO(lrn, CSINN_QUANT_INT8_SYM, csinn_lrn_params) \ + MACRO(max, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(max, 
CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(max, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(maxpool2d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(maxpool2d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(maxpool2d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(maxpool3d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(maxpool3d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(maxpool3d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(mean, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(mean, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(mean, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(min, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(min, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(min, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(pad, CSINN_QUANT_FLOAT32, csinn_pad_params) \ + MACRO(pad, CSINN_QUANT_UINT8_ASYM, csinn_pad_params) \ + MACRO(pad, CSINN_QUANT_INT8_SYM, csinn_pad_params) \ + MACRO(prod, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(prod, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(prod, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_logsumexp, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_logsumexp, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_logsumexp, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_max, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_max, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_max, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_mean, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_mean, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_mean, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_min, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_min, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_min, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + 
MACRO(reduce_prod, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_prod, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_prod, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_sum, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_sum, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_sum, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(relu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(relu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(relu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(relu1, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(relu1, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(relu1, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(relu6, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(relu6, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(relu6, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(relun, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(relun, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(relun, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(reshape, CSINN_QUANT_FLOAT32, csinn_reshape_params) \ + MACRO(reshape, CSINN_QUANT_UINT8_ASYM, csinn_reshape_params) \ + MACRO(reshape, CSINN_QUANT_INT8_SYM, csinn_reshape_params) \ + MACRO(resize, CSINN_QUANT_FLOAT32, csinn_resize_params) \ + MACRO(resize, CSINN_QUANT_UINT8_ASYM, csinn_resize_params) \ + MACRO(resize, CSINN_QUANT_INT8_SYM, csinn_resize_params) \ + MACRO(reverse, CSINN_QUANT_FLOAT32, csinn_reverse_params) \ + MACRO(reverse, CSINN_QUANT_UINT8_ASYM, csinn_reverse_params) \ + MACRO(reverse, CSINN_QUANT_INT8_SYM, csinn_reverse_params) \ + MACRO(shuffle_channel, CSINN_QUANT_FLOAT32, csinn_shuffle_channel_params) \ + MACRO(shuffle_channel, CSINN_QUANT_UINT8_ASYM, csinn_shuffle_channel_params) \ + MACRO(shuffle_channel, CSINN_QUANT_INT8_SYM, csinn_shuffle_channel_params) \ + MACRO(sigmoid, CSINN_QUANT_FLOAT32, csinn_sigmoid_params) \ + MACRO(sigmoid, CSINN_QUANT_UINT8_ASYM, 
csinn_sigmoid_params) \ + MACRO(sigmoid, CSINN_QUANT_INT8_SYM, csinn_sigmoid_params) \ + MACRO(slice, CSINN_QUANT_FLOAT32, csinn_slice_params) \ + MACRO(slice, CSINN_QUANT_UINT8_ASYM, csinn_slice_params) \ + MACRO(slice, CSINN_QUANT_INT8_SYM, csinn_slice_params) \ + MACRO(softmax, CSINN_QUANT_FLOAT32, csinn_softmax_params) \ + MACRO(softmax, CSINN_QUANT_UINT8_ASYM, csinn_softmax_params) \ + MACRO(softmax, CSINN_QUANT_INT8_SYM, csinn_softmax_params) \ + MACRO(softrelu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(softrelu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(softrelu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(space_to_batch, CSINN_QUANT_FLOAT32, csinn_space_to_batch_params) \ + MACRO(space_to_batch, CSINN_QUANT_UINT8_ASYM, csinn_space_to_batch_params) \ + MACRO(space_to_batch, CSINN_QUANT_INT8_SYM, csinn_space_to_batch_params) \ + MACRO(space_to_depth, CSINN_QUANT_FLOAT32, csinn_space_to_depth_params) \ + MACRO(space_to_depth, CSINN_QUANT_UINT8_ASYM, csinn_space_to_depth_params) \ + MACRO(space_to_depth, CSINN_QUANT_INT8_SYM, csinn_space_to_depth_params) \ + MACRO(squeeze, CSINN_QUANT_FLOAT32, csinn_squeeze_params) \ + MACRO(squeeze, CSINN_QUANT_UINT8_ASYM, csinn_squeeze_params) \ + MACRO(squeeze, CSINN_QUANT_INT8_SYM, csinn_squeeze_params) \ + MACRO(strided_slice, CSINN_QUANT_FLOAT32, csinn_strided_slice_params) \ + MACRO(strided_slice, CSINN_QUANT_UINT8_ASYM, csinn_strided_slice_params) \ + MACRO(strided_slice, CSINN_QUANT_INT8_SYM, csinn_strided_slice_params) \ + MACRO(sum, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(sum, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(sum, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(threshold_relu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(threshold_relu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(threshold_relu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(tile, CSINN_QUANT_FLOAT32, csinn_tile_params) \ + MACRO(tile, CSINN_QUANT_UINT8_ASYM, 
csinn_tile_params) \ + MACRO(tile, CSINN_QUANT_INT8_SYM, csinn_tile_params) \ + MACRO(transpose, CSINN_QUANT_FLOAT32, csinn_transpose_params) \ + MACRO(transpose, CSINN_QUANT_UINT8_ASYM, csinn_transpose_params) \ + MACRO(transpose, CSINN_QUANT_INT8_SYM, csinn_transpose_params) \ + MACRO(argmax, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(argmax, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(argmax, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(argmin, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(argmin, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(argmin, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(broadcast_to, CSINN_QUANT_FLOAT32, csinn_broadcast_to_params) \ + MACRO(broadcast_to, CSINN_QUANT_UINT8_ASYM, csinn_broadcast_to_params) \ + MACRO(broadcast_to, CSINN_QUANT_INT8_SYM, csinn_broadcast_to_params) -#define LAYER_QUANT_TEST_DISO(MACRO) \ - MACRO(add, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(add, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(add, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(div, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(div, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(div, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(equal, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(equal, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(equal, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(floor_divide, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(floor_divide, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(floor_divide, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(floor_mod, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(floor_mod, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(floor_mod, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(greater_equal, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(greater_equal, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(greater_equal, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(greater, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(greater, CSINN_QUANT_UINT8_ASYM, diso_params) 
\ - MACRO(greater, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(less_equal, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(less_equal, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(less_equal, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(less, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(less, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(less, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(logical_and, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(logical_and, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(logical_and, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(logical_or, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(logical_or, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(logical_or, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(logical_xor, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(logical_xor, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(logical_xor, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(mod, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(mod, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(mod, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(mul, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(mul, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(mul, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(not_equal, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(not_equal, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(not_equal, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(power, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(power, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(power, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(sub, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(sub, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(sub, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(maximum, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(maximum, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(maximum, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(minimum, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(minimum, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(minimum, CSINN_QUANT_INT8_SYM, 
diso_params) \ - MACRO(and, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(and, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(and, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(matmul, CSINN_QUANT_FLOAT32, matmul_params) \ - MACRO(matmul, CSINN_QUANT_UINT8_ASYM, matmul_params) \ - MACRO(matmul, CSINN_QUANT_INT8_SYM, matmul_params) \ - MACRO(prelu, CSINN_QUANT_FLOAT32, prelu_params) \ - MACRO(prelu, CSINN_QUANT_UINT8_ASYM, prelu_params) \ - MACRO(prelu, CSINN_QUANT_INT8_SYM, prelu_params) \ - MACRO(non_max_suppression, CSINN_QUANT_FLOAT32, non_max_suppression_params) \ - MACRO(non_max_suppression, CSINN_QUANT_UINT8_ASYM, non_max_suppression_params) \ - MACRO(non_max_suppression, CSINN_QUANT_INT8_SYM, non_max_suppression_params) \ - MACRO(psroipooling, CSINN_QUANT_FLOAT32, psroipooling_params) \ - MACRO(psroipooling, CSINN_QUANT_UINT8_ASYM, psroipooling_params) \ - MACRO(psroipooling, CSINN_QUANT_INT8_SYM, psroipooling_params) \ - MACRO(roi_align, CSINN_QUANT_FLOAT32, roi_align_params) \ - MACRO(roi_align, CSINN_QUANT_UINT8_ASYM, roi_align_params) \ - MACRO(roi_align, CSINN_QUANT_INT8_SYM, roi_align_params) \ - MACRO(roipool, CSINN_QUANT_FLOAT32, roi_pool_params) \ - MACRO(roipool, CSINN_QUANT_UINT8_ASYM, roi_pool_params) \ - MACRO(roipool, CSINN_QUANT_INT8_SYM, roi_pool_params) \ - MACRO(gather_nd, CSINN_QUANT_FLOAT32, gather_nd_params) \ - MACRO(gather_nd, CSINN_QUANT_UINT8_ASYM, gather_nd_params) \ - MACRO(gather_nd, CSINN_QUANT_INT8_SYM, gather_nd_params) \ - MACRO(gather, CSINN_QUANT_FLOAT32, gather_params) \ - MACRO(gather, CSINN_QUANT_UINT8_ASYM, gather_params) \ - MACRO(gather, CSINN_QUANT_INT8_SYM, gather_params) +#define LAYER_QUANT_TEST_DISO(MACRO) \ + MACRO(add, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(add, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(add, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(div, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(div, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(div, 
CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(equal, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(equal, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(equal, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(floor_divide, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(floor_divide, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(floor_divide, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(floor_mod, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(floor_mod, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(floor_mod, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(greater_equal, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(greater_equal, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(greater_equal, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(greater, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(greater, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(greater, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(less_equal, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(less_equal, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(less_equal, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(less, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(less, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(less, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(logical_and, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(logical_and, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(logical_and, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(logical_or, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(logical_or, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(logical_or, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(logical_xor, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(logical_xor, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(logical_xor, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(mod, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(mod, 
CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(mod, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(mul, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(mul, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(mul, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(not_equal, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(not_equal, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(not_equal, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(power, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(power, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(power, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(sub, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(sub, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(sub, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(maximum, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(maximum, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(maximum, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(minimum, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(minimum, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(minimum, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(and, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(and, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(and, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(matmul, CSINN_QUANT_FLOAT32, csinn_matmul_params) \ + MACRO(matmul, CSINN_QUANT_UINT8_ASYM, csinn_matmul_params) \ + MACRO(matmul, CSINN_QUANT_INT8_SYM, csinn_matmul_params) \ + MACRO(prelu, CSINN_QUANT_FLOAT32, csinn_prelu_params) \ + MACRO(prelu, CSINN_QUANT_UINT8_ASYM, csinn_prelu_params) \ + MACRO(prelu, CSINN_QUANT_INT8_SYM, csinn_prelu_params) \ + MACRO(non_max_suppression, CSINN_QUANT_FLOAT32, csinn_non_max_suppression_params) \ + MACRO(non_max_suppression, CSINN_QUANT_UINT8_ASYM, csinn_non_max_suppression_params) \ + MACRO(non_max_suppression, CSINN_QUANT_INT8_SYM, csinn_non_max_suppression_params) \ + MACRO(psroipooling, CSINN_QUANT_FLOAT32, 
csinn_psroipooling_params) \ + MACRO(psroipooling, CSINN_QUANT_UINT8_ASYM, csinn_psroipooling_params) \ + MACRO(psroipooling, CSINN_QUANT_INT8_SYM, csinn_psroipooling_params) \ + MACRO(roi_align, CSINN_QUANT_FLOAT32, csinn_roi_align_params) \ + MACRO(roi_align, CSINN_QUANT_UINT8_ASYM, csinn_roi_align_params) \ + MACRO(roi_align, CSINN_QUANT_INT8_SYM, csinn_roi_align_params) \ + MACRO(roipool, CSINN_QUANT_FLOAT32, csinn_roi_pool_params) \ + MACRO(roipool, CSINN_QUANT_UINT8_ASYM, csinn_roi_pool_params) \ + MACRO(roipool, CSINN_QUANT_INT8_SYM, csinn_roi_pool_params) \ + MACRO(gather_nd, CSINN_QUANT_FLOAT32, csinn_gather_nd_params) \ + MACRO(gather_nd, CSINN_QUANT_UINT8_ASYM, csinn_gather_nd_params) \ + MACRO(gather_nd, CSINN_QUANT_INT8_SYM, csinn_gather_nd_params) \ + MACRO(gather, CSINN_QUANT_FLOAT32, csinn_gather_params) \ + MACRO(gather, CSINN_QUANT_UINT8_ASYM, csinn_gather_params) \ + MACRO(gather, CSINN_QUANT_INT8_SYM, csinn_gather_params) -#define LAYER_QUANT_TEST_SEGMENT(MACRO) \ - MACRO(segment_max, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_max, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_max, CSINN_QUANT_INT8_SYM, segment_params) \ - MACRO(segment_mean, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_mean, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_mean, CSINN_QUANT_INT8_SYM, segment_params) \ - MACRO(segment_min, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_min, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_min, CSINN_QUANT_INT8_SYM, segment_params) \ - MACRO(segment_prod, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_prod, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_prod, CSINN_QUANT_INT8_SYM, segment_params) \ - MACRO(segment_sum, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_sum, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_sum, CSINN_QUANT_INT8_SYM, segment_params) +#define LAYER_QUANT_TEST_SEGMENT(MACRO) \ + MACRO(segment_max, 
CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_max, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_max, CSINN_QUANT_INT8_SYM, csinn_segment_params) \ + MACRO(segment_mean, CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_mean, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_mean, CSINN_QUANT_INT8_SYM, csinn_segment_params) \ + MACRO(segment_min, CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_min, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_min, CSINN_QUANT_INT8_SYM, csinn_segment_params) \ + MACRO(segment_prod, CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_prod, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_prod, CSINN_QUANT_INT8_SYM, csinn_segment_params) \ + MACRO(segment_sum, CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_sum, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_sum, CSINN_QUANT_INT8_SYM, csinn_segment_params) -#define LAYER_QUANT_TEST_BATCHNORM(MACRO) \ - MACRO(batch_normalization, CSINN_QUANT_FLOAT32, bn_params) \ - MACRO(batch_normalization, CSINN_QUANT_UINT8_ASYM, bn_params) \ - MACRO(batch_normalization, CSINN_QUANT_INT8_SYM, bn_params) +#define LAYER_QUANT_TEST_BATCHNORM(MACRO) \ + MACRO(batch_normalization, CSINN_QUANT_FLOAT32, csinn_bn_params) \ + MACRO(batch_normalization, CSINN_QUANT_UINT8_ASYM, csinn_bn_params) \ + MACRO(batch_normalization, CSINN_QUANT_INT8_SYM, csinn_bn_params) -#define LAYER_QUANT_TEST_CONCAT(MACRO) \ - MACRO(concat, CSINN_QUANT_FLOAT32, concat_params) \ - MACRO(concat, CSINN_QUANT_UINT8_ASYM, concat_params) \ - MACRO(concat, CSINN_QUANT_INT8_SYM, concat_params) \ - MACRO(stack, CSINN_QUANT_FLOAT32, stack_params) \ - MACRO(stack, CSINN_QUANT_UINT8_ASYM, stack_params) \ - MACRO(stack, CSINN_QUANT_INT8_SYM, stack_params) +#define LAYER_QUANT_TEST_CONCAT(MACRO) \ + MACRO(concat, CSINN_QUANT_FLOAT32, csinn_concat_params) \ + MACRO(concat, CSINN_QUANT_UINT8_ASYM, 
csinn_concat_params) \ + MACRO(concat, CSINN_QUANT_INT8_SYM, csinn_concat_params) \ + MACRO(stack, CSINN_QUANT_FLOAT32, csinn_stack_params) \ + MACRO(stack, CSINN_QUANT_UINT8_ASYM, csinn_stack_params) \ + MACRO(stack, CSINN_QUANT_INT8_SYM, csinn_stack_params) -#define LAYER_QUANT_TEST_CONV2D(MACRO) \ - MACRO(conv2d, CSINN_QUANT_FLOAT32, conv2d_params) \ - MACRO(conv2d, CSINN_QUANT_UINT8_ASYM, conv2d_params) \ - MACRO(conv2d, CSINN_QUANT_INT8_SYM, conv2d_params) \ - MACRO(conv3d, CSINN_QUANT_FLOAT32, conv3d_params) \ - MACRO(conv3d, CSINN_QUANT_UINT8_ASYM, conv3d_params) \ - MACRO(conv3d, CSINN_QUANT_INT8_SYM, conv3d_params) \ - MACRO(conv2d_relu, CSINN_QUANT_FLOAT32, conv2d_params) \ - MACRO(conv2d_relu, CSINN_QUANT_UINT8_ASYM, conv2d_params) \ - MACRO(conv2d_relu, CSINN_QUANT_INT8_SYM, conv2d_params) \ - MACRO(conv2d_relu6, CSINN_QUANT_FLOAT32, conv2d_params) \ - MACRO(conv2d_relu6, CSINN_QUANT_UINT8_ASYM, conv2d_params) \ - MACRO(conv2d_relu6, CSINN_QUANT_INT8_SYM, conv2d_params) \ - MACRO(deconv2d, CSINN_QUANT_FLOAT32, conv2d_params) \ - MACRO(deconv2d, CSINN_QUANT_UINT8_ASYM, conv2d_params) \ - MACRO(deconv2d, CSINN_QUANT_INT8_SYM, conv2d_params) \ - MACRO(deconv3d, CSINN_QUANT_FLOAT32, conv3d_params) \ - MACRO(deconv3d, CSINN_QUANT_UINT8_ASYM, conv3d_params) \ - MACRO(deconv3d, CSINN_QUANT_INT8_SYM, conv3d_params) \ - MACRO(fullyconnected, CSINN_QUANT_FLOAT32, fc_params) \ - MACRO(fullyconnected, CSINN_QUANT_UINT8_ASYM, fc_params) \ - MACRO(fullyconnected, CSINN_QUANT_INT8_SYM, fc_params) +#define LAYER_QUANT_TEST_CONV2D(MACRO) \ + MACRO(conv2d, CSINN_QUANT_FLOAT32, csinn_conv2d_params) \ + MACRO(conv2d, CSINN_QUANT_UINT8_ASYM, csinn_conv2d_params) \ + MACRO(conv2d, CSINN_QUANT_INT8_SYM, csinn_conv2d_params) \ + MACRO(conv3d, CSINN_QUANT_FLOAT32, csinn_conv3d_params) \ + MACRO(conv3d, CSINN_QUANT_UINT8_ASYM, csinn_conv3d_params) \ + MACRO(conv3d, CSINN_QUANT_INT8_SYM, csinn_conv3d_params) \ + MACRO(conv2d_relu, CSINN_QUANT_FLOAT32, csinn_conv2d_params) \ + 
MACRO(conv2d_relu, CSINN_QUANT_UINT8_ASYM, csinn_conv2d_params) \ + MACRO(conv2d_relu, CSINN_QUANT_INT8_SYM, csinn_conv2d_params) \ + MACRO(conv2d_relu6, CSINN_QUANT_FLOAT32, csinn_conv2d_params) \ + MACRO(conv2d_relu6, CSINN_QUANT_UINT8_ASYM, csinn_conv2d_params) \ + MACRO(conv2d_relu6, CSINN_QUANT_INT8_SYM, csinn_conv2d_params) \ + MACRO(deconv2d, CSINN_QUANT_FLOAT32, csinn_conv2d_params) \ + MACRO(deconv2d, CSINN_QUANT_UINT8_ASYM, csinn_conv2d_params) \ + MACRO(deconv2d, CSINN_QUANT_INT8_SYM, csinn_conv2d_params) \ + MACRO(deconv3d, CSINN_QUANT_FLOAT32, csinn_conv3d_params) \ + MACRO(deconv3d, CSINN_QUANT_UINT8_ASYM, csinn_conv3d_params) \ + MACRO(deconv3d, CSINN_QUANT_INT8_SYM, csinn_conv3d_params) \ + MACRO(fullyconnected, CSINN_QUANT_FLOAT32, csinn_fc_params) \ + MACRO(fullyconnected, CSINN_QUANT_UINT8_ASYM, csinn_fc_params) \ + MACRO(fullyconnected, CSINN_QUANT_INT8_SYM, csinn_fc_params) -#define LAYER_QUANT_TEST_TISO(MACRO) \ - MACRO(select, CSINN_QUANT_FLOAT32, select_params) \ - MACRO(select, CSINN_QUANT_UINT8_ASYM, select_params) \ - MACRO(select, CSINN_QUANT_INT8_SYM, select_params) +#define LAYER_QUANT_TEST_TISO(MACRO) \ + MACRO(select, CSINN_QUANT_FLOAT32, csinn_select_params) \ + MACRO(select, CSINN_QUANT_UINT8_ASYM, csinn_select_params) \ + MACRO(select, CSINN_QUANT_INT8_SYM, csinn_select_params) -#define LAYER_QUANT_TEST_SPLIT(MACRO) \ - MACRO(split, CSINN_QUANT_FLOAT32, split_params) \ - MACRO(split, CSINN_QUANT_UINT8_ASYM, split_params) \ - MACRO(split, CSINN_QUANT_INT8_SYM, split_params) +#define LAYER_QUANT_TEST_SPLIT(MACRO) \ + MACRO(split, CSINN_QUANT_FLOAT32, csinn_split_params) \ + MACRO(split, CSINN_QUANT_UINT8_ASYM, csinn_split_params) \ + MACRO(split, CSINN_QUANT_INT8_SYM, csinn_split_params) -#define LAYER_QUANT_TEST_UNSTACK(MACRO) \ - MACRO(unstack, CSINN_QUANT_FLOAT32, unstack_params) \ - MACRO(unstack, CSINN_QUANT_UINT8_ASYM, unstack_params) \ - MACRO(unstack, CSINN_QUANT_INT8_SYM, unstack_params) +#define 
LAYER_QUANT_TEST_UNSTACK(MACRO) \ + MACRO(unstack, CSINN_QUANT_FLOAT32, csinn_unstack_params) \ + MACRO(unstack, CSINN_QUANT_UINT8_ASYM, csinn_unstack_params) \ + MACRO(unstack, CSINN_QUANT_INT8_SYM, csinn_unstack_params) -#define LAYER_QUANT_TEST_ARANGE(MACRO) \ - MACRO(arange, CSINN_QUANT_FLOAT32, arange_params) \ - MACRO(arange, CSINN_QUANT_UINT8_ASYM, arange_params) \ - MACRO(arange, CSINN_QUANT_INT8_SYM, arange_params) +#define LAYER_QUANT_TEST_ARANGE(MACRO) \ + MACRO(arange, CSINN_QUANT_FLOAT32, csinn_arange_params) \ + MACRO(arange, CSINN_QUANT_UINT8_ASYM, csinn_arange_params) \ + MACRO(arange, CSINN_QUANT_INT8_SYM, csinn_arange_params) diff --git a/tests/validation_layer/leaky_relu.cpp b/tests/validation_layer/leaky_relu.cpp index ba022e6f..d4b8f6f7 100644 --- a/tests/validation_layer/leaky_relu.cpp +++ b/tests/validation_layer/leaky_relu.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); @@ -56,13 +57,12 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.n = *((float *)buffer + 4); - csi_quantize_multiplier(params.n, &(params.n_multiplier), &(params.n_shift)); + params->base.api = CSINN_API; + params->n = *((float *)buffer + 4); + shl_quantize_multiplier(params->n, &(params->n_multiplier), &(params->n_shift)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); @@ -70,18 +70,18 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.99; #if THEAD_RVV - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_leaky_relu_init, - csi_nn_rvv_leaky_relu_fp32, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT16, csi_leaky_relu_init, - csi_nn_rvv_leaky_relu_fp16, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_ASYM, csi_leaky_relu_init, - csi_nn_rvv_leaky_relu_int8, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_leaky_relu_init, + shl_rvv_leaky_relu_fp32, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_leaky_relu_init, + shl_rvv_leaky_relu_fp16, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_leaky_relu_init, + shl_rvv_leaky_relu_int8, &difference); #else - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_leaky_relu_init, csi_leaky_relu, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_leaky_relu_init, csinn_leaky_relu, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_UINT8_ASYM, csi_leaky_relu_init, - csi_leaky_relu, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_leaky_relu_init, csi_leaky_relu, + test_unary_op(input, output, params,
CSINN_QUANT_UINT8_ASYM, csinn_leaky_relu_init, + csinn_leaky_relu, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_leaky_relu_init, csinn_leaky_relu, &difference); #endif diff --git a/tests/validation_layer/less.c b/tests/validation_layer/less.c index 7023cbfb..d533b994 100644 --- a/tests/validation_layer/less.c +++ b/tests/validation_layer/less.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; 
output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_less_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_less_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_less_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_less_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_less_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_less_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/less_equal.c b/tests/validation_layer/less_equal.c index 3bcb370b..3c3e6e8d 100644 --- a/tests/validation_layer/less_equal.c +++ b/tests/validation_layer/less_equal.c @@ -16,29 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less_equal(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 
4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_less_equal_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_less_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_less_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_less_equal_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_less_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_less_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/log.c b/tests/validation_layer/log.c index 3b1ad97e..f714dd64 100644 --- a/tests/validation_layer/log.c +++ b/tests/validation_layer/log.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width -
input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_log_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_log_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_log_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_log_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_log_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_log_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/log1p.c b/tests/validation_layer/log1p.c index 1b8e0db5..8431a75d 100644 --- a/tests/validation_layer/log1p.c +++ b/tests/validation_layer/log1p.c @@ -16,27 +16,29 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log1p(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_log1p_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_log1p_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_log1p_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_log1p_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_log1p_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_log1p_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/log_softmax.c b/tests/validation_layer/log_softmax.c index 21158c59..bcc7c649 100644 --- a/tests/validation_layer/log_softmax.c +++ b/tests/validation_layer/log_softmax.c @@ -16,28 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log_softmax(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - params.axis = buffer[0]; + params->axis = buffer[0]; input->dim_count = buffer[1]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@
-52,17 +55,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); - reference->data = (float *)(buffer + 2 + input->dim_count + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 2 + input->dim_count); + reference->data = (float *)(buffer + 2 + input->dim_count + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_log_softmax_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_log_softmax_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_log_softmax_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_log_softmax_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_log_softmax_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_log_softmax_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/logical_and.c b/tests/validation_layer/logical_and.c index 365212e2..61faa1bf 100644 --- a/tests/validation_layer/logical_and.c +++ b/tests/validation_layer/logical_and.c @@ -16,29 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical_and(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer 
+ 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_logical_and_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_logical_and_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_logical_and_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_logical_and_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_logical_and_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_logical_and_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/logical_not.c b/tests/validation_layer/logical_not.c index 7ca50337..efc096bf 100644 --- a/tests/validation_layer/logical_not.c +++ b/tests/validation_layer/logical_not.c @@ -16,27 +16,29 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical_not(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_logical_not_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_logical_not_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_logical_not_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); - + test_logical_not_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_logical_not_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_logical_not_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/logical_or.c b/tests/validation_layer/logical_or.c index 84fd18cb..37734850 100644 --- a/tests/validation_layer/logical_or.c +++ b/tests/validation_layer/logical_or.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical_or(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2];
// width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_logical_or_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_logical_or_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_logical_or_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_logical_or_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_logical_or_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_logical_or_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/logical_xor.c b/tests/validation_layer/logical_xor.c index b26b2f52..1ef12b55 100644 --- a/tests/validation_layer/logical_xor.c +++ b/tests/validation_layer/logical_xor.c @@ -16,29 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical_xor(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer 
+ 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_logical_xor_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_logical_xor_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_logical_xor_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_logical_xor_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_logical_xor_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_logical_xor_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/lrn.c b/tests/validation_layer/lrn.c index d11cb98b..413d7577 100644 --- a/tests/validation_layer/lrn.c +++ b/tests/validation_layer/lrn.c @@ -16,40 +16,42 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct lrn_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] =
buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.range = buffer[4] * 2 + 1; - params.bias = *(float *)(buffer + 5); - params.alpha = *(float *)(buffer + 6); - params.beta = *(float *)(buffer + 7); + params->range = buffer[4] * 2 + 1; + params->bias = *(float *)(buffer + 5); + params->alpha = *(float *)(buffer + 6); + params->beta = *(float *)(buffer + 7); - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; @@ -64,17 +66,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_lrn_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_lrn_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_lrn_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_lrn_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_lrn_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_lrn_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/matmul.c b/tests/validation_layer/matmul.c index 9e4c492e..8b187cc4 100644 --- a/tests/validation_layer/matmul.c +++ b/tests/validation_layer/matmul.c @@ -16,28 +16,30 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of matmul(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct matmul_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_matmul_params *params = + csinn_alloc_params(sizeof(struct csinn_matmul_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[2]; output->dim_count = input0->dim_count; - params.trans_a = buffer[0]; - params.trans_b = buffer[1]; + params->trans_a = buffer[0]; + params->trans_b = buffer[1]; for (int i = 0; i < input0->dim_count; ++i) { input0->dim[i] = buffer[3 + i]; input1->dim[i] = buffer[3 + input0->dim_count + i]; @@ -71,19 +73,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 3 + 3 * input0->dim_count); - input1->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); + input0->data = (float *)(buffer + 3 + 3 * input0->dim_count); + input1->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); reference->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); - 
output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_matmul_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_matmul_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_matmul_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_matmul_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_matmul_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_matmul_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/max_stride.c b/tests/validation_layer/max_stride.c index 2633432c..f1096313 100644 --- a/tests/validation_layer/max_stride.c +++ b/tests/validation_layer/max_stride.c @@ -16,49 +16,49 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of max(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch +
input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -72,31 +72,29 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - - test_max_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_max_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_max_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_max_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_max_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_max_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/maximum.c b/tests/validation_layer/maximum.c index 6be40a02..d9de0897 100644 --- a/tests/validation_layer/maximum.c +++ b/tests/validation_layer/maximum.c @@ -16,28 +16,28 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; in_size *= input0->dim[i]; @@ -57,19 +57,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - output->data = reference->data; + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 
2 * in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_maximum_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_maximum_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_maximum_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_maximum_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_maximum_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_maximum_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/maxpool.cpp b/tests/validation_layer/maxpool.cpp index 2cbb381b..fa285256 100644 --- a/tests/validation_layer/maxpool.cpp +++ b/tests/validation_layer/maxpool.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of maxpool(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = (csinn_pool_params *)csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 1; int out_size = 1; @@ -47,16 +48,16 @@ int main(int argc, char **argv) output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; -
params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; @@ -71,20 +72,24 @@ int main(int argc, char **argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; + params->ceil_mode = buffer[14]; - input->data = (float *)(buffer + 14); - reference->data = (float *)(buffer + 14 + in_size); + + input->data = (float *)(buffer + 15); + reference->data = (float *)(buffer + 15 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_maxpool2d_init, csi_maxpool2d, +#if (DTYPE==32) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_maxpool2d_init, csinn_maxpool2d, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT16, csi_maxpool2d_init, csi_maxpool2d, +#elif (DTYPE==16) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_maxpool2d_init, csinn_maxpool2d, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_maxpool2d_init, csi_maxpool2d, +#elif (DTYPE==8) + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_maxpool2d_init, csinn_maxpool2d, &difference); - +#endif return done_testing(); } diff --git a/tests/validation_layer/maxpool3d.c b/tests/validation_layer/maxpool3d.c index 05b93f88..40ef62ab 100644 --- a/tests/validation_layer/maxpool3d.c +++ b/tests/validation_layer/maxpool3d.c @@ -16,29 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool3d(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] =
buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -46,20 +48,20 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; + params->stride_depth = buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCDHW; @@ -74,17 +76,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 20); reference->data = (float *)(buffer + 20 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_maxpool3d_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_maxpool3d_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_maxpool3d_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + + test_maxpool3d_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_maxpool3d_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_maxpool3d_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/mean_stride.c b/tests/validation_layer/mean_stride.c index 7a06dddb..3abb0f2c 100644 --- a/tests/validation_layer/mean_stride.c +++ b/tests/validation_layer/mean_stride.c @@ -16,49 +16,49 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mean(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width
input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -72,31 +72,29 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_mean_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_mean_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_mean_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_mean_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_mean_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_mean_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/min_stride.c b/tests/validation_layer/min_stride.c index 9f905dda..d38995c2 100644 --- a/tests/validation_layer/min_stride.c +++ b/tests/validation_layer/min_stride.c @@ -16,49 +16,49 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of min(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -72,31 +72,29 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_min_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_min_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_min_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_min_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_min_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_min_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return
done_testing(); } diff --git a/tests/validation_layer/minimum.c b/tests/validation_layer/minimum.c index ba7c096d..2709b0e3 100644 --- a/tests/validation_layer/minimum.c +++ b/tests/validation_layer/minimum.c @@ -16,27 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; in_size *= input0->dim[i]; @@ -56,18 +57,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - 
output->data = reference->data; + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_minimum_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_minimum_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_minimum_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_minimum_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_minimum_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_minimum_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/mod.c b/tests/validation_layer/mod.c index 30892ed9..4a253b90 100644 --- a/tests/validation_layer/mod.c +++ b/tests/validation_layer/mod.c @@ -16,36 +16,37 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mod(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->layout = CSINN_LAYOUT_NCHW; input0->dtype = CSINN_DTYPE_FLOAT32; input0->is_const = 0; input0->quant_channel = 1; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -73,18 +74,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float 
*)(buffer + 5); - input1->data = (float *)(buffer + 5 + in_size0); + input0->data = (float *)(buffer + 5); + input1->data = (float *)(buffer + 5 + in_size0); reference->data = (float *)(buffer + 5 + in_size0 + in_size1); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.9; - test_mod_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_mod_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_mod_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_mod_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_mod_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_mod_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/mul.cpp b/tests/validation_layer/mul.cpp index b6905801..dc0639ba 100644 --- a/tests/validation_layer/mul.cpp +++ b/tests/validation_layer/mul.cpp @@ -16,11 +16,10 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -28,12 +27,13 @@ int main(int argc, char **argv) { init_testsuite("Testing function of mul(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); @@ -76,8 +76,7 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input0->data = (float *)(buffer + 5); input1->data = (float *)(buffer + 5 + in_size0); @@ -86,18 +85,18 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.9; #if THEAD_RVV - test_binary_op(input0, input1, output, &params, CSINN_QUANT_FLOAT32, csi_mul_init, - csi_nn_rvv_mul_fp32, &difference); - test_binary_op(input0, input1, output, &params, CSINN_QUANT_FLOAT16, csi_mul_init, - csi_nn_rvv_mul_fp16, &difference); - test_binary_op(input0, input1, output, &params, CSINN_QUANT_INT8_SYM, csi_mul_init, - csi_nn_rvv_mul_int8, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_mul_init, + shl_rvv_mul_fp32, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_mul_init, + shl_rvv_mul_fp16, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_SYM, csinn_mul_init, + shl_rvv_mul_int8, &difference); #else - test_binary_op(input0, input1, output, &params, CSINN_QUANT_FLOAT32, csi_mul_init, csi_mul, + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_mul_init, csinn_mul, &difference); - test_binary_op(input0, input1, output, &params, CSINN_QUANT_UINT8_ASYM, csi_mul_init, csi_mul, + test_binary_op(input0, input1, output, params, CSINN_QUANT_UINT8_ASYM, csinn_mul_init, csinn_mul, &difference); - test_binary_op(input0, input1, output, &params, CSINN_QUANT_INT8_SYM, csi_mul_init, csi_mul, + test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_SYM, csinn_mul_init, csinn_mul, &difference); #endif diff --git a/tests/validation_layer/negative.c b/tests/validation_layer/negative.c index 88c1ec26..001da5cb 100644 --- a/tests/validation_layer/negative.c +++ b/tests/validation_layer/negative.c @@ -16,26 +16,28 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_negative_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_negative_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_negative_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_negative_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_negative_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_negative_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/non_max_suppression.c b/tests/validation_layer/non_max_suppression.c index ecb87d32..f30fb4a8 100644 --- a/tests/validation_layer/non_max_suppression.c +++ b/tests/validation_layer/non_max_suppression.c @@ -16,21 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of non_max_suppression(layer).\n"); - - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct non_max_suppression_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_non_max_suppression_params *params = + csinn_alloc_params(sizeof(struct csinn_non_max_suppression_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); @@ -40,20 +42,20 @@ int main(int argc, char** argv) input0->dim[1] = 4; input1->dim[0] = buffer[0]; -
params.max_output_size = buffer[1]; - params.iou_threshold = *((float *)buffer + 3); + params->max_output_size = buffer[1]; + params->iou_threshold = *((float *)buffer + 3); output->dim_count = 2; - output->dim[0] = params.max_output_size; + output->dim[0] = params->max_output_size; output->dim[1] = 4; - in_size = input0->dim[0] * 4; + in_size = input0->dim[0] * 4; out_size = buffer[2]; input0->dtype = CSINN_DTYPE_FLOAT32; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; - input0->quant_channel = 1; + input0->quant_channel = 1; input1->dtype = CSINN_DTYPE_FLOAT32; input1->layout = CSINN_LAYOUT_NCHW; input1->is_const = 0; @@ -62,18 +64,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (int *)(buffer + 4 + in_size + in_size / 4); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_non_max_suppression_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_non_max_suppression_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_non_max_suppression_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_non_max_suppression_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_non_max_suppression_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_non_max_suppression_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/not.c b/tests/validation_layer/not.c index b86fccad..756efad5 100644 --- a/tests/validation_layer/not.c +++ b/tests/validation_layer/not.c @@ -16,43 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float *output_data, float diff) { - - if (csi_not_init(input, output, params) == CSINN_TRUE) { - csi_not(input, output, params); + if (csinn_not_init(input, output, params) == CSINN_TRUE) { + csinn_not(input, output, params); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); + shl_ref_tensor_transform_free_f32(foutput); } -void test_not(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params, float 
&difference); +void test_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float &difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -67,15 +68,14 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_not(input, output, &params, &difference); + test_not(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/not_equal.c b/tests/validation_layer/not_equal.c index a5902279..bcf7f999 100644 --- a/tests/validation_layer/not_equal.c +++ b/tests/validation_layer/not_equal.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not_equal(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = 
input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_not_equal_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_not_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_not_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_not_equal_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_not_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_not_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/pad.cpp b/tests/validation_layer/pad.cpp index 841b639d..11c6f35d 100644 --- a/tests/validation_layer/pad.cpp +++ b/tests/validation_layer/pad.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of pad(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), sess); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); @@ -59,12 +60,11 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; - params.pad_num = input->dim_count; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; + params->pad_num = input->dim_count; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; @@ -74,8 +74,8 @@ int main(int argc, char **argv) int32_t pad_before[4] = {0, 0, pad_top, pad_left}; int32_t pad_after[4] = {0, 0, pad_down, pad_right}; - params.pad_before = pad_before; - params.pad_after = pad_after; + params->pad_before = pad_before; + params->pad_after = pad_after; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); @@ -85,10 +85,10 @@ int main(int argc, char **argv) #if 
THEAD_RVV return 0 #else - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_pad_init, csi_pad, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_UINT8_ASYM, csi_pad_init, csi_pad, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_pad_init, csinn_pad, &difference); + test_unary_op(input, output, params, CSINN_QUANT_UINT8_ASYM, csinn_pad_init, csinn_pad, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_pad_init, csi_pad, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_pad_init, csinn_pad, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/power.c b/tests/validation_layer/power.c index 341a45f6..61611ada 100644 --- a/tests/validation_layer/power.c +++ b/tests/validation_layer/power.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of power(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = 
input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_power_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_power_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_power_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_power_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_power_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_power_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/prelu.c b/tests/validation_layer/prelu.c index 0bfd650f..80863fcb 100644 --- a/tests/validation_layer/prelu.c +++ b/tests/validation_layer/prelu.c @@ -16,29 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(sess); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // channel - output->dim[2] = input->dim[2] = buffer[2]; // height - output->dim[3] = input->dim[3] = buffer[3]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // channel + output->dim[2] = input->dim[2] = buffer[2]; // height + output->dim[3] = input->dim[3] = buffer[3]; // width alpha_data->dim[0] = buffer[1]; input->dim_count = 4; alpha_data->dim_count = 1; @@ -57,22 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data 
= (float *)(buffer + 4); + input->data = (float *)(buffer + 4); alpha_data->data = (float *)(buffer + 4 + in_size); - reference->data = (float *)(buffer + 4 + in_size + input->dim[1]); - output->data = reference->data; + reference->data = (float *)(buffer + 4 + in_size + input->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_prelu_CSINN_QUANT_FLOAT32(input, alpha_data, output, &params, &difference); - test_prelu_CSINN_QUANT_UINT8_ASYM(input, alpha_data, output, &params, &difference); - test_prelu_CSINN_QUANT_INT8_SYM(input, alpha_data, output, &params, &difference); + test_prelu_CSINN_QUANT_FLOAT32(input, alpha_data, output, params, &difference); + test_prelu_CSINN_QUANT_UINT8_ASYM(input, alpha_data, output, params, &difference); + test_prelu_CSINN_QUANT_INT8_SYM(input, alpha_data, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/prod_stride.c b/tests/validation_layer/prod_stride.c index 35a50917..768c1679 100644 --- a/tests/validation_layer/prod_stride.c +++ b/tests/validation_layer/prod_stride.c @@ -16,49 +16,49 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prod(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -72,31 +72,29 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_prod_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_prod_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_prod_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_prod_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_prod_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_prod_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return 
done_testing(); } diff --git a/tests/validation_layer/psroipooling.c b/tests/validation_layer/psroipooling.c index f89010c9..1c1f39cb 100644 --- a/tests/validation_layer/psroipooling.c +++ b/tests/validation_layer/psroipooling.c @@ -16,30 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of psropooling(layer).\n"); - - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *spatial_scale = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct psroipooling_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *spatial_scale = csinn_alloc_tensor(sess); + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_psroipooling_params *params = + csinn_alloc_params(sizeof(struct csinn_psroipooling_params), sess); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_FLOAT32; @@ -47,9 +49,7 @@ int main(int argc, char** argv) 
input0->is_const = 0; input0->quant_channel = 1; input0->name = "input0"; - input0->data = (float *)(buffer + 10); - - + input0->data = (float *)(buffer + 10); input1->dim[0] = buffer[6]; input1->dim[1] = 5; @@ -60,17 +60,16 @@ int main(int argc, char** argv) in1_size = input1->dim[0] * input1->dim[1]; input1->dtype = CSINN_DTYPE_FLOAT32; input1->name = "input1"; - input1->data = (float *)(buffer + 10 + in0_size); - + input1->data = (float *)(buffer + 10 + in0_size); - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = buffer[7]; // output_dim + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = buffer[7]; // output_dim output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 10 + in0_size + in1_size); - output->data = reference->data; + output->data = reference->data; output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; @@ -78,18 +77,16 @@ int main(int argc, char** argv) output->quant_channel = 1; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - params.spatial_scale = *((float *)buffer + 9); - params.output_dim = buffer[7]; - params.group_size = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->spatial_scale = *((float *)buffer + 9); + params->output_dim = buffer[7]; + params->group_size = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - - test_psroipooling_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_psroipooling_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_psroipooling_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_psroipooling_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_psroipooling_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_psroipooling_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_logsumexp.c b/tests/validation_layer/reduce_logsumexp.c index 3af665ad..e8705524 100644 --- a/tests/validation_layer/reduce_logsumexp.c +++ b/tests/validation_layer/reduce_logsumexp.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_logsumexp(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data 
= (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_logsumexp_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_reduce_logsumexp_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_reduce_logsumexp_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_reduce_logsumexp_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_logsumexp_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_logsumexp_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_max.c b/tests/validation_layer/reduce_max.c index 2dbd4b5b..6a760c7d 100644 --- a/tests/validation_layer/reduce_max.c +++ b/tests/validation_layer/reduce_max.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_max(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; // batch - reference->dim[1] = input->dim[1] = buffer[1]; // height - reference->dim[2] = input->dim[2] = buffer[2]; // width - reference->dim[3] = input->dim[3] = buffer[3]; // channel + reference->dim[0] = input->dim[0] = buffer[0]; // batch + reference->dim[1] = input->dim[1] = buffer[1]; // height + reference->dim[2] = input->dim[2] = buffer[2]; // width + reference->dim[3] = input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float 
*)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_reduce_max_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_reduce_max_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_reduce_max_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + + test_reduce_max_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_max_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_max_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_mean.c b/tests/validation_layer/reduce_mean.c index b2b57c40..e76193d9 100644 --- a/tests/validation_layer/reduce_mean.c +++ b/tests/validation_layer/reduce_mean.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_mean(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; // batch - reference->dim[1] = input->dim[1] = buffer[1]; // height - reference->dim[2] = input->dim[2] = buffer[2]; // width - reference->dim[3] = input->dim[3] = buffer[3]; // channel + reference->dim[0] = input->dim[0] = buffer[0]; // batch + reference->dim[1] = input->dim[1] = buffer[1]; // height + reference->dim[2] = input->dim[2] = buffer[2]; // width + reference->dim[3] = input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float 
*)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_mean_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reduce_mean_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reduce_mean_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reduce_mean_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_mean_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_mean_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_min.c b/tests/validation_layer/reduce_min.c index 09509701..755bdc6d 100644 --- a/tests/validation_layer/reduce_min.c +++ b/tests/validation_layer/reduce_min.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_min(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; // batch - reference->dim[1] = input->dim[1] = buffer[1]; // height - reference->dim[2] = input->dim[2] = buffer[2]; // width - reference->dim[3] = input->dim[3] = buffer[3]; // channel + reference->dim[0] = input->dim[0] = buffer[0]; // batch + reference->dim[1] = input->dim[1] = buffer[1]; // height + reference->dim[2] = input->dim[2] = buffer[2]; // width + reference->dim[3] = input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float 
*)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_min_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reduce_min_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reduce_min_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reduce_min_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_min_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_min_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_prod.c b/tests/validation_layer/reduce_prod.c index 543f8e4e..755a5a78 100644 --- a/tests/validation_layer/reduce_prod.c +++ b/tests/validation_layer/reduce_prod.c @@ -16,37 +16,40 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_prod(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; - reference->dim[1] = input->dim[1] = buffer[1]; - reference->dim[2] = input->dim[2] = buffer[2]; - reference->dim[3] = input->dim[3] = buffer[3]; + reference->dim[0] = input->dim[0] = buffer[0]; + reference->dim[1] = input->dim[1] = buffer[1]; + reference->dim[2] = input->dim[2] = buffer[2]; + reference->dim[3] = input->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; - input->dtype = CSINN_DTYPE_FLOAT32; + input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); + input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size0); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_prod_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reduce_prod_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reduce_prod_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reduce_prod_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_prod_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_prod_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_sum.c b/tests/validation_layer/reduce_sum.c index a9c59978..4064b298 100644 --- a/tests/validation_layer/reduce_sum.c +++ b/tests/validation_layer/reduce_sum.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_sum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; // batch - reference->dim[1] = input->dim[1] = buffer[1]; // height - reference->dim[2] = input->dim[2] = buffer[2]; // width - reference->dim[3] = input->dim[3] = buffer[3]; // channel + reference->dim[0] = input->dim[0] = buffer[0]; // batch + reference->dim[1] = input->dim[1] = buffer[1]; // height + reference->dim[2] = input->dim[2] = buffer[2]; // width + reference->dim[3] = input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float 
*)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_sum_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reduce_sum_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reduce_sum_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reduce_sum_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_sum_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_sum_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/relu.cpp b/tests/validation_layer/relu.cpp index 7aabad6f..9b2dc120 100644 --- a/tests/validation_layer/relu.cpp +++ b/tests/validation_layer/relu.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); @@ -57,8 +58,7 @@ int main(int argc, char **argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4 + in_size); @@ -66,18 +66,18 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if THEAD_RVV - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_relu_init, csi_nn_rvv_relu_fp32, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_relu_init, shl_rvv_relu_fp32, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_relu_init, csi_nn_rvv_relu_fp16, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_relu_init, shl_rvv_relu_fp16, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_ASYM, csi_relu_init, csi_nn_rvv_relu_int8, + test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_relu_init, shl_rvv_relu_int8, &difference); #else - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_relu_init, csi_relu, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_relu_init, csinn_relu, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_UINT8_ASYM, csi_relu_init, csi_relu, + test_unary_op(input, output, params, CSINN_QUANT_UINT8_ASYM, csinn_relu_init, csinn_relu, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_relu_init, csi_relu, + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_relu_init, csinn_relu, &difference); #endif diff --git a/tests/validation_layer/relu1.c b/tests/validation_layer/relu1.c index b0417359..df07f750 100644 --- a/tests/validation_layer/relu1.c +++ b/tests/validation_layer/relu1.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -54,17 +56,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_relu1_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_relu1_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_relu1_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + + test_relu1_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_relu1_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_relu1_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/relu6.c b/tests/validation_layer/relu6.c index 4960d6dd..3bbef320 100644 --- a/tests/validation_layer/relu6.c +++ b/tests/validation_layer/relu6.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -54,17 +56,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_relu6_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_relu6_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_relu6_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_relu6_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_relu6_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_relu6_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/relun.c b/tests/validation_layer/relun.c index 9b94be51..246f618e 100644 --- a/tests/validation_layer/relun.c +++ b/tests/validation_layer/relun.c @@ -16,34 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relun(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_relun_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_relun_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_relun_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_relun_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_relun_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_relun_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reshape.c b/tests/validation_layer/reshape.c index 50b6f9f8..29fa8506 100644 --- a/tests/validation_layer/reshape.c +++ b/tests/validation_layer/reshape.c @@ -16,38 +16,41 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reshape(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reshape_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reshape_params *params = + csinn_alloc_params(sizeof(struct csinn_reshape_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); int reshape_count = buffer[4]; int *reshape = (int *)malloc(reshape_count * sizeof(int)); - for(int i = 0; i < reshape_count; i++) { + for (int i = 0; i < reshape_count; i++) { reshape[i] = buffer[5 + i]; } - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + 
input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 5 + reshape_count); - input->data = input_data; + input->data = input_data; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,7 +61,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; out_size = in_size; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = reshape[i]; // out_size *= output->dim[i]; } @@ -68,17 +71,16 @@ int main(int argc, char** argv) output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - params.shape = reshape; - params.shape_num = output->dim_count; - + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->shape = reshape; + params->shape_num = output->dim_count; + float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reshape_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reshape_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reshape_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reshape_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reshape_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reshape_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/resize_bilinear.c b/tests/validation_layer/resize_bilinear.c index 86e456d6..009620b4 100644 --- a/tests/validation_layer/resize_bilinear.c +++ b/tests/validation_layer/resize_bilinear.c @@ -16,35 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize bilinear f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // height - output->dim[2] = buffer[5]; // width - output->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // height + output->dim[2] = buffer[5]; // width + output->dim[3] = buffer[3]; // channel input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -53,20 +56,19 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - in_size = 
input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NHWC; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_resize_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_resize_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_resize_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_resize_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_resize_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_resize_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/resize_nearestneighbor.c b/tests/validation_layer/resize_nearestneighbor.c index 8758eba7..c28dcd74 100644 --- a/tests/validation_layer/resize_nearestneighbor.c +++ b/tests/validation_layer/resize_nearestneighbor.c @@ -16,39 +16,42 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), sess); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // channel - output->dim[2] = buffer[4]; // height - output->dim[3] = buffer[5]; // width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // channel + output->dim[2] = buffer[4]; // height + output->dim[3] = buffer[5]; // width input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -57,20 +60,19 @@ int 
main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_resize_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_resize_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_resize_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_resize_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_resize_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_resize_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reverse.c b/tests/validation_layer/reverse.c index cb6c5e4d..b6d38f72 100644 --- a/tests/validation_layer/reverse.c +++ b/tests/validation_layer/reverse.c @@ -16,36 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reverse(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reverse_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reverse_params *params = + csinn_alloc_params(sizeof(struct csinn_reverse_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -59,17 +62,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); + input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_reverse_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reverse_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reverse_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reverse_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reverse_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reverse_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/roialign.c b/tests/validation_layer/roialign.c index ddab2691..aaef5d17 100644 --- a/tests/validation_layer/roialign.c +++ b/tests/validation_layer/roialign.c @@ -16,29 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of roialign(layer).\n"); - - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_align_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_roi_align_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_align_params), sess); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + 
input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; @@ -48,7 +50,6 @@ int main(int argc, char** argv) input0->name = "input0"; input0->data = (float *)(buffer + 11); - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -60,9 +61,8 @@ int main(int argc, char** argv) input1->name = "input1"; input1->data = (float *)(buffer + 11 + in0_size); - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; @@ -71,23 +71,22 @@ int main(int argc, char** argv) output->quant_channel = 1; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 11 + in0_size + in1_size); - output->data = reference->data; + output->data = reference->data; output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - params.spatial_scale = *((float *)buffer + 9); - params.sample_ratio = *((int32_t *)buffer + 10); - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->spatial_scale = *((float *)buffer + 9); + params->sample_ratio = *((int32_t *)buffer + 10); + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - test_roi_align_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_roi_align_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_roi_align_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_roi_align_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_roi_align_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_roi_align_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/roipooling.c b/tests/validation_layer/roipooling.c index dd41e1a8..ae492767 100644 --- a/tests/validation_layer/roipooling.c +++ b/tests/validation_layer/roipooling.c @@ -16,29 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of roipooling(layer).\n"); - - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_roi_pool_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_pool_params), sess); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; @@ -48,7 +50,6 @@ int main(int argc, char** argv) input0->name = "input0"; input0->data = (float *)(buffer + 10); - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -60,9 +61,8 @@ int main(int argc, char** argv) input1->name = "input1"; input1->data = (float *)(buffer + 10 + in0_size); - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; 
output->dim[3] = buffer[5]; output->dim_count = 4; @@ -71,22 +71,21 @@ int main(int argc, char** argv) output->quant_channel = 1; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 10 + in0_size + in1_size); - output->data = reference->data; + output->data = reference->data; output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? atof(argv[2]) : 0.99; - params.spatial_scale = *((float *)buffer + 9); - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->spatial_scale = *((float *)buffer + 9); + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - test_roipool_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_roipool_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_roipool_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_roipool_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_roipool_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_roipool_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/round.c b/tests/validation_layer/round.c index 3d189f59..5d5e416f 100644 --- a/tests/validation_layer/round.c +++ b/tests/validation_layer/round.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of round(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_round_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_round_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_round_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_round_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_round_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_round_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/rsqrt.c b/tests/validation_layer/rsqrt.c index 40558fc7..9a05fcf7 100644 --- a/tests/validation_layer/rsqrt.c +++ b/tests/validation_layer/rsqrt.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of rsqrt(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = 
input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_rsqrt_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_rsqrt_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_rsqrt_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_rsqrt_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_rsqrt_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_rsqrt_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/segment_max.c b/tests/validation_layer/segment_max.c index 505f0566..8224bc89 100644 --- a/tests/validation_layer/segment_max.c +++ b/tests/validation_layer/segment_max.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment max(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -54,23 +57,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_segment_max_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_max_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_max_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_max_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_max_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_max_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/segment_mean.c b/tests/validation_layer/segment_mean.c index 8ab5e699..660e4fee 100644 --- a/tests/validation_layer/segment_mean.c +++ b/tests/validation_layer/segment_mean.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment mean(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -55,26 +58,23 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - in_size = 
input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_segment_mean_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_mean_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_mean_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_mean_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_mean_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_mean_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/segment_min.c b/tests/validation_layer/segment_min.c index 80434685..93a77d0d 100644 --- a/tests/validation_layer/segment_min.c +++ b/tests/validation_layer/segment_min.c @@ -16,33 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment min(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; - int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -55,25 +57,23 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; + 
params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_segment_min_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_min_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_min_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_min_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_min_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_min_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/segment_prod.c b/tests/validation_layer/segment_prod.c index 983dbbf3..862ac1a6 100644 --- a/tests/validation_layer/segment_prod.c +++ b/tests/validation_layer/segment_prod.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment prod(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -54,23 +57,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_segment_prod_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_prod_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_prod_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + + test_segment_prod_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_prod_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_prod_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/segment_sum.c b/tests/validation_layer/segment_sum.c index 8300229d..84c82f7d 100644 --- a/tests/validation_layer/segment_sum.c +++ b/tests/validation_layer/segment_sum.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment sum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -54,23 +57,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_segment_sum_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_sum_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_sum_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_sum_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_sum_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_sum_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/select.c b/tests/validation_layer/select.c index d83afe30..4f46ca3c 100644 --- a/tests/validation_layer/select.c +++ b/tests/validation_layer/select.c @@ -16,36 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of select(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *condition = csi_alloc_tensor(NULL); - struct select_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *condition = csinn_alloc_tensor(sess); + struct csinn_select_params *params = + csinn_alloc_params(sizeof(struct csinn_select_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = input1->dim[0] = buffer[0]; - input0->dim[1] = input1->dim[1] = buffer[1]; - input0->dim[2] = input1->dim[2] = buffer[2]; - input0->dim[3] = input1->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = input1->dim[0] = buffer[0]; + input0->dim[1] = input1->dim[1] = buffer[1]; + input0->dim[2] = input1->dim[2] = buffer[2]; + input0->dim[3] = input1->dim[3] = buffer[3]; - condition->dim[0] = buffer[0]; - condition->dim[1] = buffer[1]; - condition->dim[2] = buffer[2]; - condition->dim[3] = buffer[3]; + condition->dim[0] = buffer[0]; + condition->dim[1] = buffer[1]; + condition->dim[2] = buffer[2]; + condition->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -66,21 +67,18 @@ int main(int argc, char** argv) input1->layout = CSINN_LAYOUT_NCHW; condition->layout 
= CSINN_LAYOUT_NCHW; output->layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); condition->data = (float *)(buffer + 4 + 2 * in_size); reference->data = (float *)(buffer + 4 + 3 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_select_CSINN_QUANT_FLOAT32(condition, input0, input1, output, &params, &difference); - test_select_CSINN_QUANT_UINT8_ASYM(condition, input0, input1, output, &params, &difference); - test_select_CSINN_QUANT_INT8_SYM(condition, input0, input1, output, &params, &difference); + test_select_CSINN_QUANT_FLOAT32(condition, input0, input1, output, params, &difference); + test_select_CSINN_QUANT_UINT8_ASYM(condition, input0, input1, output, params, &difference); + test_select_CSINN_QUANT_INT8_SYM(condition, input0, input1, output, params, &difference); return done_testing(); } - - diff --git a/tests/validation_layer/shuffle_channel.c b/tests/validation_layer/shuffle_channel.c index 7c98bab9..bf7f5997 100644 --- a/tests/validation_layer/shuffle_channel.c +++ b/tests/validation_layer/shuffle_channel.c @@ -16,30 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), sess); int in_size = 1, out_size = 1; - int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -51,9 +53,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +63,17 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * 
output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_shuffle_channel_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_shuffle_channel_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_shuffle_channel_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_shuffle_channel_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_shuffle_channel_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_shuffle_channel_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/sigmoid.cpp b/tests/validation_layer/sigmoid.cpp index 0b87f0f3..74504d06 100644 --- a/tests/validation_layer/sigmoid.cpp +++ b/tests/validation_layer/sigmoid.cpp @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_utils.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -28,17 +27,19 @@ int main(int argc, char** argv) { init_testsuite("Testing function of sigmoid(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_sigmoid_params *params = csinn_alloc_params(sizeof(struct csinn_sigmoid_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -56,8 +57,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4 + in_size); @@ -65,12 +65,12 @@ int main(int argc, char** argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_sigmoid_init, - csi_sigmoid, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT16, csi_sigmoid_init, - csi_sigmoid, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_sigmoid_init, - csi_sigmoid, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_sigmoid_init, + csinn_sigmoid, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_sigmoid_init, + csinn_sigmoid, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_sigmoid_init, + csinn_sigmoid, &difference); return done_testing(); diff --git a/tests/validation_layer/sign.c b/tests/validation_layer/sign.c index 778a1670..eaae473a 100644 --- a/tests/validation_layer/sign.c +++ b/tests/validation_layer/sign.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sign(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < 
input->dim_count; i++) { 
input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_sign_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_sign_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_sign_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_sign_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_sign_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_sign_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/sin.c b/tests/validation_layer/sin.c index ce2dc168..21b1deae 100644 --- a/tests/validation_layer/sin.c +++ b/tests/validation_layer/sin.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sin(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_sin_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_sin_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_sin_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_sin_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_sin_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_sin_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/sinh.c b/tests/validation_layer/sinh.c index 2bbdfc5c..be2df021 100644 --- a/tests/validation_layer/sinh.c +++ b/tests/validation_layer/sinh.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sinh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 
0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_sinh_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_sinh_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_sinh_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_sinh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_sinh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_sinh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/slice.c b/tests/validation_layer/slice.c index c3c2725f..901cb53e 100644 --- a/tests/validation_layer/slice.c +++ b/tests/validation_layer/slice.c @@ -16,41 +16,43 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of slice(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct slice_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_slice_params *params = csinn_alloc_params(sizeof(struct csinn_slice_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.slice_num = 4; - params.begin = (int *)malloc(4 * sizeof(int)); - params.end = (int *)malloc(4 * sizeof(int)); - for(int i = 0; i < 4; i++) { - params.begin[i] = buffer[4+i]; - params.end[i] = buffer[8+i]; + params->slice_num = 4; + params->begin = (int *)malloc(4 * sizeof(int)); + params->end = (int *)malloc(4 * sizeof(int)); + for (int i = 0; i < 4; i++) { + params->begin[i] = buffer[4 + i]; + params->end[i] = buffer[8 + i]; } - output->dim[0] = params.end[0] - params.begin[0]; - output->dim[1] = params.end[1] - params.begin[1]; - output->dim[2] = params.end[2] - params.begin[2]; - output->dim[3] = params.end[3] - params.begin[3]; + output->dim[0] = params->end[0] - params->begin[0]; + output->dim[1] = params->end[1] - params->begin[1]; + output->dim[2] = 
params->end[2] - params->begin[2]; + output->dim[3] = params->end[3] - params->begin[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; input->dim_count = 4; @@ -64,17 +66,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 12); - reference->data = (float *)(buffer + 12 + in_size); + input->data = (float *)(buffer + 12); + reference->data = (float *)(buffer + 12 + in_size); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_slice_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_slice_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_slice_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + + test_slice_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_slice_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_slice_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/softmax.cpp b/tests/validation_layer/softmax.cpp index 280f886a..8c451e4c 100644 --- a/tests/validation_layer/softmax.cpp +++ b/tests/validation_layer/softmax.cpp @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_utils.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -28,24 +27,26 @@ int main(int argc, char** argv) { init_testsuite("Testing function of softmax(layer)\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_softmax_params *params = csinn_alloc_params(sizeof(struct csinn_softmax_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; input->dim_count = 4; output->dim_count = 4; @@ -58,24 +59,23 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.layout = CSINN_LAYOUT_NCHW; + + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_softmax_init, - csi_softmax, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT16, csi_softmax_init, - csi_softmax, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_softmax_init, - csi_softmax, &difference); + + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_softmax_init, + csinn_softmax, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_softmax_init, + csinn_softmax, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_softmax_init, + csinn_softmax, &difference); return done_testing(); } diff --git a/tests/validation_layer/softplus.c b/tests/validation_layer/softplus.c index ee9c432f..5994f71f 100644 --- a/tests/validation_layer/softplus.c +++ b/tests/validation_layer/softplus.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softplus(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = 
buffer[3]; // 
channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_softplus_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_softplus_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_softplus_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_softplus_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_softplus_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_softplus_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/softrelu.c b/tests/validation_layer/softrelu.c index 1f0877c5..bf456521 100644 --- a/tests/validation_layer/softrelu.c +++ b/tests/validation_layer/softrelu.c @@ -16,20 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softrelu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); @@ -43,7 +45,7 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -56,17 +58,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_softrelu_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_softrelu_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_softrelu_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_softrelu_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_softrelu_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_softrelu_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/softsign.c b/tests/validation_layer/softsign.c index 8b371210..dcdf2ccb 100644 --- a/tests/validation_layer/softsign.c +++ b/tests/validation_layer/softsign.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softsign(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; 
output->dim[1] = 
input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_softsign_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_softsign_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_softsign_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_softsign_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_softsign_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_softsign_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/space_to_batch.c b/tests/validation_layer/space_to_batch.c index 9c090c09..3a4529a9 100644 --- a/tests/validation_layer/space_to_batch.c +++ b/tests/validation_layer/space_to_batch.c @@ -16,40 +16,43 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_batch_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_space_to_batch_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_batch_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; - params.pad_top = buffer[5]; - params.pad_bottom = buffer[6]; - params.pad_left = buffer[7]; - params.pad_right = buffer[8]; + params->block_size = buffer[4]; + params->pad_top = buffer[5]; + params->pad_bottom = buffer[6]; + params->pad_left = buffer[7]; + params->pad_right = buffer[8]; - output->dim[0] = input->dim[0] * params.block_size * params.block_size; + output->dim[0] = input->dim[0] * params->block_size * params->block_size; output->dim[1] = input->dim[1]; - output->dim[2] = (input->dim[2] + params.pad_top + params.pad_bottom) / params.block_size; - output->dim[3] = (input->dim[3] + params.pad_left + params.pad_right) / params.block_size; + output->dim[2] = 
(input->dim[2] + params->pad_top + params->pad_bottom) / params->block_size; + output->dim[3] = (input->dim[3] + params->pad_left + params->pad_right) / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -65,17 +68,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 9); - reference->data = (float *)(buffer + 9 + in_size); + input->data = (float *)(buffer + 9); + reference->data = (float *)(buffer + 9 + in_size); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_space_to_batch_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_space_to_batch_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_space_to_batch_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_space_to_batch_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_space_to_batch_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_space_to_batch_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/space_to_depth.c b/tests/validation_layer/space_to_depth.c index d3d4aecc..03cf8cff 100644 --- a/tests/validation_layer/space_to_depth.c +++ b/tests/validation_layer/space_to_depth.c @@ -16,36 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_depth_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_space_to_depth_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] * params.block_size * params.block_size; - output->dim[2] = input->dim[2] / params.block_size; - output->dim[3] = input->dim[3] / params.block_size; + output->dim[1] = input->dim[1] * params->block_size * params->block_size; + output->dim[2] = input->dim[2] / params->block_size; + output->dim[3] = input->dim[3] / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -53,7 +56,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_FLOAT32; output->layout = 
CSINN_LAYOUT_NCHW; output->is_const = 0; @@ -61,18 +64,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_space_to_depth_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_space_to_depth_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_space_to_depth_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_space_to_depth_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_space_to_depth_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_space_to_depth_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/split.c b/tests/validation_layer/split.c index 0a6753f3..f14dc2ff 100644 --- a/tests/validation_layer/split.c +++ b/tests/validation_layer/split.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of split(layer).\n"); @@ -31,38 +30,38 @@ int main(int argc, char** argv) int axis = buffer[4]; int output_cnt = buffer[5]; int32_t *split_index = (int32_t *)malloc(output_cnt * sizeof(int32_t)); - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { split_index[i] = buffer[axis] / output_cnt; } - - struct csi_tensor *reference[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - reference[i] = csi_alloc_tensor(NULL); + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *reference[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + reference[i] = csinn_alloc_tensor(sess); } int in_size = 0; int out_size[output_cnt]; int acc_out_size = 0; - - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - struct csi_tensor *output[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - output[i] = csi_alloc_tensor(NULL); - for(int j = 0; j < 4; j++) { - if(j == axis) { + struct csinn_tensor *output[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + output[i] = 
csinn_alloc_tensor(sess); + for (int j = 0; j < 4; j++) { + if (j == axis) { output[i]->dim[j] = split_index[i]; } else { output[i]->dim[j] = input->dim[j]; @@ -72,34 +71,32 @@ int main(int argc, char** argv) out_size[i] = output[i]->dim[0] * output[i]->dim[1] * output[i]->dim[2] * output[i]->dim[3]; reference[i]->data = (float *)(buffer + 6 + in_size + acc_out_size); - output[i]->data = reference[i]->data; - acc_out_size += out_size[i]; + output[i]->data = reference[i]->data; + acc_out_size += out_size[i]; output[i]->dtype = CSINN_DTYPE_FLOAT32; output[i]->is_const = 0; output[i]->layout = CSINN_LAYOUT_NCHW; output[i]->quant_channel = 1; } - struct split_params params; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; - params.output_num = output_cnt; + struct csinn_split_params *params = csinn_alloc_params(sizeof(struct csinn_split_params), sess); + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->output_num = output_cnt; int temp = 0; - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { temp += split_index[i]; split_index[i] = temp; printf("%d\n", split_index[i]); } - params.split_index = split_index; + params->split_index = split_index; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_split_CSINN_QUANT_FLOAT32(input, (struct csi_tensor **)output, &params, &difference); - test_split_CSINN_QUANT_UINT8_ASYM(input, (struct csi_tensor **)output, &params, &difference); - test_split_CSINN_QUANT_INT8_SYM(input, (struct csi_tensor **)output, &params, &difference); - + test_split_CSINN_QUANT_FLOAT32(input, (struct csinn_tensor **)output, params, &difference); + test_split_CSINN_QUANT_UINT8_ASYM(input, (struct csinn_tensor **)output, params, &difference); + test_split_CSINN_QUANT_INT8_SYM(input, (struct csinn_tensor **)output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/sqrt.c b/tests/validation_layer/sqrt.c index e18bca51..38cee86d 100644 --- a/tests/validation_layer/sqrt.c +++ b/tests/validation_layer/sqrt.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sqrt(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = 
buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_sqrt_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_sqrt_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_sqrt_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_sqrt_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_sqrt_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_sqrt_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/square.c b/tests/validation_layer/square.c index cef086ba..7494a43b 100644 --- a/tests/validation_layer/square.c +++ b/tests/validation_layer/square.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of square(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_square_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_square_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_square_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_square_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_square_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_square_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/squeeze.c b/tests/validation_layer/squeeze.c index aa790fd4..30e6861a 100644 --- a/tests/validation_layer/squeeze.c +++ b/tests/validation_layer/squeeze.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct squeeze_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; int32_t *axis = (int32_t *)malloc(axis_len * sizeof(int32_t)); - for(int i = 0; i < axis_len; i++) { + for (int i = 0; i < axis_len; i++) { axis[i] = buffer[4 + i]; } - - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = 
buffer[2]; // width + + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; @@ -58,22 +61,20 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.axis = axis; - params.axis_num = axis_len; - params.base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->axis_num = axis_len; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4 + axis_len); - reference->data = (float *)(buffer + 4 + axis_len + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4 + axis_len); + reference->data = (float *)(buffer + 4 + axis_len + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_squeeze_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_squeeze_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_squeeze_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_squeeze_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_squeeze_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_squeeze_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/stack.c b/tests/validation_layer/stack.c index 628119da..afe584d1 100644 --- a/tests/validation_layer/stack.c +++ b/tests/validation_layer/stack.c @@ -16,37 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of stack(layer).\n"); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_stack_params *params = csinn_alloc_params(sizeof(struct csinn_stack_params), sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); - struct stack_params params; - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - - params.inputs_count = buffer[0]; - params.axis = buffer[1]; + params->inputs_count = buffer[0]; + params->axis = buffer[1]; output->dim_count = buffer[2]; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = buffer[3+i]; + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = buffer[3 + i]; out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; + in_size = out_size / params->inputs_count; - struct csi_tensor *input[params.inputs_count]; - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(sess); input[i]->data = (float *)(buffer + 3 + output->dim_count + in_size * i); input[i]->dim_count = buffer[2] - 1; input[i]->layout = CSINN_LAYOUT_NCHW; @@ -54,10 +54,10 @@ int main(int argc, char** argv) input[i]->quant_channel = 1; input[i]->dtype = CSINN_DTYPE_FLOAT32; for (int j = 0; j < input[i]->dim_count; j++) { - if (j < params.axis) { - input[i]->dim[j] = buffer[3+j]; // input[i]->dim[j] = output->dim[j] + if (j < params->axis) { + input[i]->dim[j] = buffer[3 + 
j]; // input[i]->dim[j] = output->dim[j] } else { - input[i]->dim[j] = buffer[3+j+1]; // input[i]->dim[j] = output->dim[j + 1] + input[i]->dim[j] = buffer[3 + j + 1]; // input[i]->dim[j] = output->dim[j + 1] } } } @@ -66,15 +66,14 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - reference->data = (float *)(buffer + 3 + output->dim_count + in_size * params.inputs_count); - output->data = reference->data; + params->base.api = CSINN_API; + reference->data = (float *)(buffer + 3 + output->dim_count + in_size * params->inputs_count); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_stack_CSINN_QUANT_FLOAT32((struct csi_tensor **)input, output, &params, &difference); - test_stack_CSINN_QUANT_UINT8_ASYM((struct csi_tensor **)input, output, &params, &difference); - test_stack_CSINN_QUANT_INT8_SYM((struct csi_tensor **)input, output, &params, &difference); + test_stack_CSINN_QUANT_FLOAT32((struct csinn_tensor **)input, output, params, &difference); + test_stack_CSINN_QUANT_UINT8_ASYM((struct csinn_tensor **)input, output, params, &difference); + test_stack_CSINN_QUANT_INT8_SYM((struct csinn_tensor **)input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/strided_slice.c b/tests/validation_layer/strided_slice.c index 92540d5c..07a65b17 100644 --- a/tests/validation_layer/strided_slice.c +++ b/tests/validation_layer/strided_slice.c @@ -16,49 +16,51 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct strided_slice_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct csinn_strided_slice_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.slice_count = buffer[1+input->dim_count]; - params.begin = (int *)malloc(params.slice_count * sizeof(int)); - params.end = (int *)malloc(params.slice_count * sizeof(int)); - params.stride = (int *)malloc(params.slice_count * sizeof(int)); - for(int i = 0; i < params.slice_count; i++) { - params.begin[i] = buffer[2+input->dim_count+3*i]; - params.end[i] = buffer[3+input->dim_count+3*i]; - params.stride[i] = buffer[4+input->dim_count+3*i]; + params->slice_count = buffer[1 + input->dim_count]; + params->begin = (int *)malloc(params->slice_count * sizeof(int)); + params->end = (int *)malloc(params->slice_count * sizeof(int)); + params->stride = (int *)malloc(params->slice_count * sizeof(int)); + for (int i = 0; i < params->slice_count; i++) { + params->begin[i] = buffer[2 + input->dim_count + 3 * i]; + 
params->end[i] = buffer[3 + input->dim_count + 3 * i]; + params->stride[i] = buffer[4 + input->dim_count + 3 * i]; } output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { - if(i < params.slice_count) { - output->dim[i] = ceil((float)(params.end[i] - params.begin[i]) / params.stride[i]); + for (int i = 0; i < output->dim_count; i++) { + if (i < params->slice_count) { + output->dim[i] = ceil((float)(params->end[i] - params->begin[i]) / params->stride[i]); } else { output->dim[i] = input->dim[i]; } } - out_size = buffer[2+input->dim_count+3*params.slice_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = buffer[2 + input->dim_count + 3 * params->slice_count]; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -69,15 +71,15 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count); - reference->data = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count + in_size); //input->data + in_size + input->data = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count); + reference->data = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count + + in_size); // input->data + in_size output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_strided_slice_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_strided_slice_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_strided_slice_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_strided_slice_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_strided_slice_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_strided_slice_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/sub.c b/tests/validation_layer/sub.c index d43fa487..8c4ca445 100644 --- a/tests/validation_layer/sub.c +++ b/tests/validation_layer/sub.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; 
// channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_sub_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_sub_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_sub_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_sub_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_sub_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_sub_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/sum_stride.cpp b/tests/validation_layer/sum_stride.cpp index 237dbad5..08747155 100644 --- a/tests/validation_layer/sum_stride.cpp +++ b/tests/validation_layer/sum_stride.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of sum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; @@ -82,24 +83,23 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_sum_init, csi_sum, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_sum_init, csinn_sum, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT16, csi_sum_init, csi_sum, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_sum_init, csinn_sum, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_sum_init, csi_sum, + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_sum_init, csinn_sum, &difference); diff --git a/tests/validation_layer/tan.c b/tests/validation_layer/tan.c index ae7639da..43909d28 100644 --- a/tests/validation_layer/tan.c +++ b/tests/validation_layer/tan.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tan(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_tan_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_tan_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_tan_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_tan_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_tan_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_tan_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/tanh.c b/tests/validation_layer/tanh.c index fb7232a3..5a861094 100644 --- a/tests/validation_layer/tanh.c +++ b/tests/validation_layer/tanh.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_tanh_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_tanh_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_tanh_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_tanh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_tanh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_tanh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/testutil.h b/tests/validation_layer/testutil.h index 076f66de..af43ac0f 100644 --- a/tests/validation_layer/testutil.h +++ b/tests/validation_layer/testutil.h @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ // #include "common.h" @@ -28,109 +28,173 @@ #include "test_utils.h" template -void test_unary_op(struct csi_tensor *input, struct csi_tensor *output, T *params, +void test_unary_op(struct csinn_tensor *input, struct csinn_tensor *output, T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csi_tensor *, struct csi_tensor *, T *), - int (*unary_op)(struct csi_tensor *, struct csi_tensor *, T *), + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, T *), + int (*unary_op)(struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { enum csinn_quant_enum test_dtype = quant_dtype; int test_api = params->base.api; - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); - struct csi_tensor *qoutput = + struct csinn_tensor *qinput = + convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); if (init_op(qinput, qoutput, params) == CSINN_TRUE) { unary_op(qinput, qoutput, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, - *difference, csi_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); } template -void test_binary_op(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csi_tensor *, struct csi_tensor *, struct csi_tensor *, - T *), - int (*binary_op)(struct 
csi_tensor *, struct csi_tensor *, struct csi_tensor *, - T *), +void test_binary_op(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, T *params, enum csinn_quant_enum quant_dtype, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, T *), + int (*binary_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, T *), float *difference) { enum csinn_quant_enum test_dtype = quant_dtype; int test_api = params->base.api; - struct csi_tensor *qinput0 = + struct csinn_tensor *qinput0 = convert_f32_layer(input0, test_dtype, (enum csinn_api_enum)test_api); - struct csi_tensor *qinput1 = + struct csinn_tensor *qinput1 = convert_f32_layer(input1, test_dtype, (enum csinn_api_enum)test_api); - struct csi_tensor *qoutput = + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); if (init_op(qinput0, qinput1, qoutput, params) == CSINN_TRUE) { binary_op(qinput0, qinput1, qoutput, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, - *difference, csi_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); } template -void test_concat_op(struct csi_tensor **input, struct csi_tensor *output, T *params, +void test_concat_op(struct csinn_tensor **input, struct csinn_tensor *output, T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csi_tensor **, struct csi_tensor *, T *), - int (*unary_op)(struct csi_tensor **, struct csi_tensor *, T *), + int (*init_op)(struct csinn_tensor **, struct 
csinn_tensor *, T *), + int (*unary_op)(struct csinn_tensor **, struct csinn_tensor *, T *), float *difference) { enum csinn_quant_enum test_dtype = quant_dtype; int test_api = params->base.api; - struct csi_tensor *qinput[params->inputs_count]; + struct csinn_tensor *qinput[params->inputs_count]; for (int i = 0; i < params->inputs_count; i++) { qinput[i] = convert_f32_layer(input[i], test_dtype, (enum csinn_api_enum)test_api); } - struct csi_tensor *qoutput = + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); - if (init_op((struct csi_tensor **)qinput, qoutput, params) == CSINN_TRUE) { - unary_op((struct csi_tensor **)qinput, qoutput, params); + if (init_op((struct csinn_tensor **)qinput, qoutput, params) == CSINN_TRUE) { + unary_op((struct csinn_tensor **)qinput, qoutput, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input[0]->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input[0]->data, - *difference, csi_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); } template -void test_conv2d_op(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csi_tensor *, struct csi_tensor *, struct csi_tensor *, - struct csi_tensor *, T *), - int (*conv2d_op)(struct csi_tensor *, struct csi_tensor *, struct csi_tensor *, - struct csi_tensor *, T *), +void test_conv2d_op(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, T *params, + enum csinn_quant_enum 
quant_dtype, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), + int (*conv2d_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { enum csinn_quant_enum test_dtype = quant_dtype; int test_api = params->base.api; - struct csi_tensor *qbias; - struct csi_tensor *qinput; + struct csinn_tensor *qbias; + struct csinn_tensor *qinput; + + struct csinn_tensor *qkernel = + convert_f32_layer(kernel, test_dtype, (enum csinn_api_enum)test_api); + + if (test_dtype == CSINN_QUANT_INT8_SYM) { + if (!params->conv_extra.fuse_zp2bias) { + qinput = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); + qbias = convert_f32_bias(input, kernel, bias, (enum csinn_api_enum)test_api); + } else { + qbias = fuse_zp_to_bias(input, kernel, bias, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); + qinput->qinfo->zero_point = 0; + } + + } else { + qbias = convert_f32_layer(bias, test_dtype, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); + } + + struct csinn_tensor *qoutput = + convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + + if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { + conv2d_op(qinput, qoutput, qkernel, qbias, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); + } +} + +template +void test_fully_op(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, T *params, + enum csinn_quant_enum quant_dtype, + int 
(*init_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), + int (*conv2d_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), + float *difference) +{ + enum csinn_quant_enum test_dtype = quant_dtype; + int test_api = params->base.api; + struct csinn_tensor *qbias; + struct csinn_tensor *qinput; + + struct csinn_tensor *qkernel = + convert_f32_layer(kernel, test_dtype, (enum csinn_api_enum)test_api); if (test_dtype == CSINN_QUANT_INT8_SYM) { qbias = fuse_zp_to_bias(input, kernel, bias, (enum csinn_api_enum)test_api); qinput = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); qinput->qinfo->zero_point = 0; + } else { qbias = convert_f32_layer(bias, test_dtype, (enum csinn_api_enum)test_api); qinput = convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); } - struct csi_tensor *qoutput = + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); - struct csi_tensor *qkernel = - convert_f32_layer(kernel, test_dtype, (enum csinn_api_enum)test_api); if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { conv2d_op(qinput, qoutput, qkernel, qbias, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, - *difference, csi_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); } \ No newline at end of file diff --git a/tests/validation_layer/threshold_relu.c b/tests/validation_layer/threshold_relu.c index 96b90c68..fb15ccde 100644 
--- a/tests/validation_layer/threshold_relu.c +++ b/tests/validation_layer/threshold_relu.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of threshold relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -54,19 +56,18 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.n = *(float *)&buffer[4]; // theta + params->base.api = CSINN_API; + params->n = *(float *)&buffer[4]; // theta in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = reference->data; 
float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_threshold_relu_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_threshold_relu_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_threshold_relu_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_threshold_relu_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_threshold_relu_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_threshold_relu_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/tile.c b/tests/validation_layer/tile.c index ae2b292f..92859c50 100644 --- a/tests/validation_layer/tile.c +++ b/tests/validation_layer/tile.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tile(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct tile_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tile_params *params = csinn_alloc_params(sizeof(struct csinn_tile_params), sess); int in_size = 1; int out_size = 1; - int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - params.reps_num = buffer[0]; + params->reps_num = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= 
input->dim[i]; } - params.reps = (int *)malloc(params.reps_num * sizeof(int)); - for(int i = 0; i < params.reps_num; i++) { - params.reps[i] = buffer[i+1+input->dim_count]; - output->dim[i] = input->dim[i] * params.reps[i]; - out_size *= params.reps[i]; + params->reps = (int *)malloc(params->reps_num * sizeof(int)); + for (int i = 0; i < params->reps_num; i++) { + params->reps[i] = buffer[i + 1 + input->dim_count]; + output->dim[i] = input->dim[i] * params->reps[i]; + out_size *= params->reps[i]; } out_size = out_size * in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - input->data = (float *)(buffer + 1 + input->dim_count + input->dim_count); - reference->data = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 1 + input->dim_count + input->dim_count); + reference->data = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_tile_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_tile_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_tile_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + + test_tile_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_tile_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_tile_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/topk.c b/tests/validation_layer/topk.c index a3d69e64..3b74cfc4 100644 --- a/tests/validation_layer/topk.c +++ b/tests/validation_layer/topk.c @@ -16,38 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of topk(layer).\n"); - - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output1 = csi_alloc_tensor(NULL); - struct csi_tensor *output2 = csi_alloc_tensor(NULL); - struct csi_tensor *reference1 = csi_alloc_tensor(NULL); - struct csi_tensor *reference2 = csi_alloc_tensor(NULL); - struct topk_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output2 = csinn_alloc_tensor(sess); + struct csinn_tensor *reference1 = csinn_alloc_tensor(sess); + struct csinn_tensor *reference2 = csinn_alloc_tensor(sess); + struct csinn_topk_params *params = csinn_alloc_params(sizeof(struct csinn_topk_params), sess); int in_size = 1, out_size = 1; float error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - params.k = buffer[0]; + params->k = buffer[0]; input->dim_count = buffer[1]; output1->dim_count = input->dim_count; 
output2->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output1->dim[i] = input->dim[i]; output2->dim[i] = input->dim[i]; in_size *= input->dim[i]; } - out_size = in_size / input->dim[input->dim_count - 1] * params.k; + out_size = in_size / input->dim[input->dim_count - 1] * params->k; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,47 +59,46 @@ int main(int argc, char** argv) output1->is_const = 0; output1->quant_channel = 1; - output2->dtype = CSINN_DTYPE_INT32; output2->layout = CSINN_LAYOUT_NCHW; output2->is_const = 0; output2->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 2 + input->dim_count); float *ref_data1 = (float *)(buffer + 2 + input->dim_count + in_size); - int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); + int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); uint8_t *input_data = (uint8_t *)malloc(in_size * sizeof(uint8_t)); input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - 
output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - // if (input->dim_count == 1 && params.k == 1) Follow the input scale and zero_point - if(input->dim_count != 1 || params.k != 1) { - output1->data= ref_data1; + // if (input->dim_count == 1 && params->k == 1) Follow the input scale and zero_point + if (input->dim_count != 1 || params->k != 1) { + output1->data = ref_data1; get_quant_info(output1); } else { output1->qinfo = input->qinfo; @@ -114,8 +114,8 @@ int main(int argc, char** argv) float difference2 = argc > 3 ? atof(argv[3]) : 0; printf("The max error is %.6lf.\n", error); - if (csi_topk_init(input, output1, output2, ¶ms) == CSINN_TRUE) { - csi_topk(input, output1, output2, ¶ms); + if (csinn_topk_init(input, output1, output2, params) == CSINN_TRUE) { + csinn_topk(input, output1, output2, params); } result_verify_8(reference1->data, output1, input->data, difference1, out_size, false); @@ -124,7 +124,8 @@ int main(int argc, char** argv) they all quantized by [200, 200] so their output_indices are reversed */ - // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, false); + // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, + // false); free(buffer); free(output1->data); diff --git a/tests/validation_layer/transpose.c b/tests/validation_layer/transpose.c index 8a716a74..1365800f 100644 --- a/tests/validation_layer/transpose.c +++ b/tests/validation_layer/transpose.c @@ -16,31 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct transpose_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), sess); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim_count = buffer[0]; // input->dim_count == 4 + input->dim_count = buffer[0]; // input->dim_count == 4 output->dim_count = input->dim_count; int32_t *perm = (int32_t *)malloc(input->dim_count * sizeof(int32_t)); - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i + 1]; - perm[i] = buffer[input->dim_count + i + 1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; + perm[i] = buffer[input->dim_count + i + 1]; output->dim[i] = buffer[2 * input->dim_count + i + 1]; in_size *= input->dim[i]; } @@ -55,20 +58,19 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.permute = perm; - params.permute_num = input->dim_count; - params.base.layout = CSINN_LAYOUT_NCHW; - - input->data = (float *)(buffer + 1 + input->dim_count * 3); - reference->data = (float *)(buffer + 1 + input->dim_count * 3 + in_size); - output->data = 
reference->data; + params->base.api = CSINN_API; + params->permute = perm; + params->permute_num = input->dim_count; + params->base.layout = CSINN_LAYOUT_NCHW; + + input->data = (float *)(buffer + 1 + input->dim_count * 3); + reference->data = (float *)(buffer + 1 + input->dim_count * 3 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_transpose_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_transpose_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_transpose_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_transpose_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_transpose_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_transpose_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/trunc.c b/tests/validation_layer/trunc.c index 3299ff42..76a33660 100644 --- a/tests/validation_layer/trunc.c +++ b/tests/validation_layer/trunc.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of trunc(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_trunc_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_trunc_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_trunc_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_trunc_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_trunc_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_trunc_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/unsorted_segment_max.c b/tests/validation_layer/unsorted_segment_max.c index ba9e9afa..e708f702 100644 --- a/tests/validation_layer/unsorted_segment_max.c +++ b/tests/validation_layer/unsorted_segment_max.c @@ -16,60 +16,62 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment max(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = 
buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_segment_max_CSINN_QUANT_FLOAT32(input, segment, output, &params, &difference); - test_segment_max_CSINN_QUANT_UINT8_ASYM(input, segment, output, &params, &difference); - test_segment_max_CSINN_QUANT_INT8_SYM(input, segment, output, &params, &difference); + test_segment_max_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_max_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_max_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unsorted_segment_mean.c b/tests/validation_layer/unsorted_segment_mean.c index 8d37ade8..177078e0 100644 --- a/tests/validation_layer/unsorted_segment_mean.c +++ b/tests/validation_layer/unsorted_segment_mean.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment mean(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - 
input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -53,23 +56,23 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]);; - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + ; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_segment_mean_CSINN_QUANT_FLOAT32(input, segment, output, &params, &difference); - test_segment_mean_CSINN_QUANT_UINT8_ASYM(input, segment, output, &params, &difference); - test_segment_mean_CSINN_QUANT_INT8_SYM(input, segment, output, &params, &difference); + test_segment_mean_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_mean_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_mean_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unsorted_segment_min.c b/tests/validation_layer/unsorted_segment_min.c index 4faae05a..33c717fc 100644 --- a/tests/validation_layer/unsorted_segment_min.c +++ b/tests/validation_layer/unsorted_segment_min.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment min(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - 
input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -53,23 +56,23 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]);; - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + ; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_segment_min_CSINN_QUANT_FLOAT32(input, segment, output, &params, &difference); - test_segment_min_CSINN_QUANT_UINT8_ASYM(input, segment, output, &params, &difference); - test_segment_min_CSINN_QUANT_INT8_SYM(input, segment, output, &params, &difference); + + test_segment_min_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_min_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_min_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unsorted_segment_prod.c b/tests/validation_layer/unsorted_segment_prod.c index 5b356918..bcb4b1c9 100644 --- a/tests/validation_layer/unsorted_segment_prod.c +++ b/tests/validation_layer/unsorted_segment_prod.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment prod(laye).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - 
input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -53,23 +56,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_segment_prod_CSINN_QUANT_FLOAT32(input, segment, output, &params, &difference); - test_segment_prod_CSINN_QUANT_UINT8_ASYM(input, segment, output, &params, &difference); - test_segment_prod_CSINN_QUANT_INT8_SYM(input, segment, output, &params, &difference); + test_segment_prod_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_prod_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_prod_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unsorted_segment_sum.c b/tests/validation_layer/unsorted_segment_sum.c index 91888e92..eb81dcbb 100644 --- a/tests/validation_layer/unsorted_segment_sum.c +++ b/tests/validation_layer/unsorted_segment_sum.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment sum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - 
input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -53,23 +56,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_segment_sum_CSINN_QUANT_FLOAT32(input, segment, output, &params, &difference); - test_segment_sum_CSINN_QUANT_UINT8_ASYM(input, segment, output, &params, &difference); - test_segment_sum_CSINN_QUANT_INT8_SYM(input, segment, output, &params, &difference); + test_segment_sum_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_sum_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_sum_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unstack.c b/tests/validation_layer/unstack.c index cd76944a..0cf79ee4 100644 --- a/tests/validation_layer/unstack.c +++ b/tests/validation_layer/unstack.c @@ -16,74 +16,74 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unstack(layer).\n"); int in_size = 1; int out_size = 1; - - + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; int *buffer = read_input_data_f32(argv[1]); - struct unstack_params params; - struct csi_tensor *input = csi_alloc_tensor(NULL); - params.axis = buffer[0]; + struct csinn_unstack_params *params = + csinn_alloc_params(sizeof(struct csinn_unstack_params), sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); + params->axis = buffer[0]; input->dim_count = buffer[1]; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[2+i]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } - params.outputs_count = input->dim[params.axis]; + params->outputs_count = input->dim[params->axis]; - struct csi_tensor 
*reference[params.outputs_count]; - for(int i = 0; i < params.outputs_count; i++) { - reference[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *reference[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + reference[i] = csinn_alloc_tensor(sess); } - out_size = in_size / params.outputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size / params->outputs_count; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - input->data = (float *)(buffer + 2 + input->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count); - struct csi_tensor *output[params.outputs_count]; - for (int i = 0; i < params.outputs_count; i++) { - output[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *output[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + output[i] = csinn_alloc_tensor(sess); output[i]->dim_count = input->dim_count - 1; output[i]->dtype = CSINN_DTYPE_FLOAT32; output[i]->layout = CSINN_LAYOUT_NCHW; output[i]->is_const = 0; output[i]->quant_channel = 1; - for(int j = 0; j < input->dim_count; j++) { - if(j < params.axis) { + for (int j = 0; j < input->dim_count; j++) { + if (j < params->axis) { output[i]->dim[j] = input->dim[j]; - } else if(j > params.axis) { - output[i]->dim[j-1] = input->dim[j]; + } else if (j > params->axis) { + output[i]->dim[j - 1] = input->dim[j]; } } - reference[i]->data = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); - output[i]->data = reference[i]->data; + reference[i]->data = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); + output[i]->data = reference[i]->data; } float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_unstack_CSINN_QUANT_FLOAT32(input, (struct csi_tensor **)output, &params, &difference); - test_unstack_CSINN_QUANT_UINT8_ASYM(input, (struct csi_tensor **)output, &params, &difference); - test_unstack_CSINN_QUANT_INT8_SYM(input, (struct csi_tensor **)output, &params, &difference); + test_unstack_CSINN_QUANT_FLOAT32(input, (struct csinn_tensor **)output, params, &difference); + test_unstack_CSINN_QUANT_UINT8_ASYM(input, (struct csinn_tensor **)output, params, &difference); + test_unstack_CSINN_QUANT_INT8_SYM(input, (struct csinn_tensor **)output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/xor.c b/tests/validation_layer/xor.c index 7c693ec6..1189665f 100644 --- a/tests/validation_layer/xor.c +++ b/tests/validation_layer/xor.c @@ -16,28 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of xor u32.\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < 
input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -57,18 +58,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (uint32_t *)(buffer + 1 + input0->dim_count); - input1->data = (uint32_t *)(buffer + 1 + input0->dim_count + in_size); + input0->data = (uint32_t *)(buffer + 1 + input0->dim_count); + input1->data = (uint32_t *)(buffer + 1 + input0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input0->dim_count + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_xor_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_xor_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_xor_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_xor_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_xor_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_xor_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/yuv_rgb_scale.c b/tests/validation_layer/yuv_rgb_scale.c index 54796744..604408cf 100644 --- a/tests/validation_layer/yuv_rgb_scale.c +++ b/tests/validation_layer/yuv_rgb_scale.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of yuv_rgb_scale(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = 3; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = 3; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -54,17 +56,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3); - reference->data = (float *)(buffer + 3 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 3); + reference->data = (float *)(buffer + 3 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_yuv_rgb_scale_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_yuv_rgb_scale_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_yuv_rgb_scale_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_yuv_rgb_scale_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_yuv_rgb_scale_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_yuv_rgb_scale_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_xt800/Makefile.e804 b/tests/validation_xt800/Makefile.e804 index bc2595a6..743390d0 100644 --- a/tests/validation_xt800/Makefile.e804 +++ b/tests/validation_xt800/Makefile.e804 @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mcpu=e804d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=10 -DCSI_BUILD_E804 -DCSI_BUILD_RTOS -LIB_NAME = csi_nn2_e804 +CFLAGS += -DCSINN_API=10 -DSHL_BUILD_E804 -DSHL_BUILD_RTOS +LIB_NAME = shl_e804 CC = csky-abiv2-elf-gcc BOARD = ./board/smartl/crt0.o -T./board/smartl/ckcpu.ld ./board/smartl/uart.o diff --git a/tests/validation_xt800/Makefile.i805 b/tests/validation_xt800/Makefile.i805 index 15eced16..de9a0f12 100644 --- a/tests/validation_xt800/Makefile.i805 +++ b/tests/validation_xt800/Makefile.i805 @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mcpu=ck805ef -mhard-float CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=9 -DCSI_BUILD_I805 -DCSI_BUILD_RTOS -LIB_NAME = csi_nn2_i805 +CFLAGS += -DCSINN_API=9 -DSHL_BUILD_I805 -DSHL_BUILD_RTOS +LIB_NAME = shl_i805 CC = csky-abiv2-elf-gcc #BOARD = ./board/smartl/crt0.o -T./board/smartl/ckcpu.ld ./board/smartl/uart.o BOARD = ./board/smartl/crt0.o -T./board/smartl/qemu.ld ./board/smartl/uart.o diff --git a/tests/validation_xt800/Makefile.ref_i805 
b/tests/validation_xt800/Makefile.ref_i805 index 911d0d09..33548286 100644 --- a/tests/validation_xt800/Makefile.ref_i805 +++ b/tests/validation_xt800/Makefile.ref_i805 @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mcpu=i805 CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=11 -DCSI_BUILD_REF_I805 -DCSI_BUILD_RTOS -LIB_NAME = csi_nn2_ref_i805 +CFLAGS += -DCSINN_API=11 -DSHL_BUILD_REF_I805 -DSHL_BUILD_RTOS +LIB_NAME = shl_ref_i805 CC = csky-abiv2-elf-gcc BOARD = ./board/smartl/crt0.o -T./board/smartl/ckcpu.ld ./board/smartl/uart.o diff --git a/tests/validation_xt800/avgpool_nonsquare_q7_1.c b/tests/validation_xt800/avgpool_nonsquare_q7_1.c index fe0f466b..d3ea65be 100644 --- a/tests/validation_xt800/avgpool_nonsquare_q7_1.c +++ b/tests/validation_xt800/avgpool_nonsquare_q7_1.c @@ -16,60 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" - +#include "test_utils.h" -extern void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference); +extern void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t out_lshift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of avgpool nonsquare q7 for xt800.\n"); - 
verify_avgpool2d_q7(pooling_input_00, avepool_nonsquare_result_0, 1, 64, 16, 4, 62, 14, 4, - 3, 3, 1, 1, 0, 0, 1, 3.0f); // difference = 3.0 + verify_avgpool2d_q7(pooling_input_00, avepool_nonsquare_result_0, 1, 64, 16, 4, 62, 14, 4, 3, 3, + 1, 1, 0, 0, 1, 3.0f); // difference = 3.0 - verify_avgpool2d_q7(pooling_input_01, avepool_nonsquare_result_1, 1, 64, 16, 4, 29, 6, 4, - 7, 5, 2, 2, 1, 1, 1, 3.0f); + verify_avgpool2d_q7(pooling_input_01, avepool_nonsquare_result_1, 1, 64, 16, 4, 29, 6, 4, 7, 5, + 2, 2, 1, 1, 1, 3.0f); - verify_avgpool2d_q7(pooling_input_02, avepool_nonsquare_result_2, 1, 32, 32, 4, 8, 5, 4, - 5, 7, 4, 5, 0, 1, 1, 3.0f); + verify_avgpool2d_q7(pooling_input_02, avepool_nonsquare_result_2, 1, 32, 32, 4, 8, 5, 4, 5, 7, + 4, 5, 0, 1, 1, 3.0f); - verify_avgpool2d_q7(pooling_input_10, avepool_nonsquare_result_3, 1, 32, 128, 1, 30, 126, 1, - 3, 3, 1, 1, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_10, avepool_nonsquare_result_3, 1, 32, 128, 1, 30, 126, 1, 3, + 3, 1, 1, 0, 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_11, avepool_nonsquare_result_4, 1, 128, 32, 1, 26, 14, 1, - 5, 7, 5, 2, 1, 2, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_11, avepool_nonsquare_result_4, 1, 128, 32, 1, 26, 14, 1, 5, + 7, 5, 2, 1, 2, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_12, avepool_nonsquare_result_5, 1, 64, 64, 1, 30, 30, 1, - 8, 6, 2, 2, 0, 2, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_12, avepool_nonsquare_result_5, 1, 64, 64, 1, 30, 30, 1, 8, 6, + 2, 2, 0, 2, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_20, avepool_nonsquare_result_6, 1, 32, 8, 16, 30, 6, 16, - 5, 3, 1, 1, 0, 2, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_20, avepool_nonsquare_result_6, 1, 32, 8, 16, 30, 6, 16, 5, 3, + 1, 1, 0, 2, 2, 3.0f); - verify_avgpool2d_q7(pooling_input_21, avepool_nonsquare_result_7, 1, 8, 32, 16, 4, 15, 16, - 3, 5, 1, 2, 1, 2, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_21, avepool_nonsquare_result_7, 1, 8, 32, 16, 4, 15, 16, 3, 5, + 1, 2, 1, 2, 2, 
3.0f); - verify_avgpool2d_q7(pooling_input_22, avepool_nonsquare_result_8, 1, 16, 16, 16, 8, 5, 16, - 3, 5, 2, 3, 1, 1, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_22, avepool_nonsquare_result_8, 1, 16, 16, 16, 8, 5, 16, 3, 5, + 2, 3, 1, 1, 2, 3.0f); } diff --git a/tests/validation_xt800/avgpool_nonsquare_q7_2.c b/tests/validation_xt800/avgpool_nonsquare_q7_2.c index 029d313a..6ad0e6f7 100644 --- a/tests/validation_xt800/avgpool_nonsquare_q7_2.c +++ b/tests/validation_xt800/avgpool_nonsquare_q7_2.c @@ -16,61 +16,48 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" - +#include "test_utils.h" -extern void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference); +extern void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t out_lshift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of avgpool nonsquare q7 for xt800.\n"); - /* ---------------- leftover ------------------------*/ - verify_avgpool2d_q7(pooling_input_00, avepool_nonsquare_result_9, 1, 63, 15, 4, 61, 13, 4, - 3, 3, 1, 1, 0, 0, 1, 3.0f); + /* ---------------- leftover ------------------------*/ + verify_avgpool2d_q7(pooling_input_00, avepool_nonsquare_result_9, 1, 63, 15, 4, 61, 13, 4, 3, 3, + 1, 1, 0, 0, 1, 3.0f); - 
verify_avgpool2d_q7(pooling_input_01, avepool_nonsquare_result_10, 1, 63, 15, 4, 29, 6, 4, - 7, 5, 2, 2, 0, 0, 1, 3.0f); + verify_avgpool2d_q7(pooling_input_01, avepool_nonsquare_result_10, 1, 63, 15, 4, 29, 6, 4, 7, 5, + 2, 2, 0, 0, 1, 3.0f); - verify_avgpool2d_q7(pooling_input_02, avepool_nonsquare_result_11, 1, 31, 31, 4, 8, 6, 4, - 5, 7, 4, 5, 1, 2, 1, 3.0f); + verify_avgpool2d_q7(pooling_input_02, avepool_nonsquare_result_11, 1, 31, 31, 4, 8, 6, 4, 5, 7, + 4, 5, 1, 2, 1, 3.0f); - verify_avgpool2d_q7(pooling_input_10, avepool_nonsquare_result_12, 1, 31, 127, 1, 29, 125, 1, - 3, 3, 1, 1, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_10, avepool_nonsquare_result_12, 1, 31, 127, 1, 29, 125, 1, 3, + 3, 1, 1, 0, 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_11, avepool_nonsquare_result_13, 1, 127, 31, 1, 26, 13, 1, - 5, 7, 5, 2, 0, 3, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_11, avepool_nonsquare_result_13, 1, 127, 31, 1, 26, 13, 1, 5, + 7, 5, 2, 0, 3, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_12, avepool_nonsquare_result_14, 1, 63, 63, 1, 29, 30, 1, - 8, 6, 2, 2, 1, 1, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_12, avepool_nonsquare_result_14, 1, 63, 63, 1, 29, 30, 1, 8, + 6, 2, 2, 1, 1, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_20, avepool_nonsquare_result_15, 1, 31, 7, 16, 29, 5, 16, - 5, 3, 1, 1, 0, 2, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_20, avepool_nonsquare_result_15, 1, 31, 7, 16, 29, 5, 16, 5, + 3, 1, 1, 0, 2, 2, 3.0f); - verify_avgpool2d_q7(pooling_input_21, avepool_nonsquare_result_16, 1, 7, 31, 16, 7, 14, 16, - 3, 5, 1, 2, 0, 2, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_21, avepool_nonsquare_result_16, 1, 7, 31, 16, 7, 14, 16, 3, + 5, 1, 2, 0, 2, 2, 3.0f); - verify_avgpool2d_q7(pooling_input_22, avepool_nonsquare_result_17, 1, 15, 15, 16, 7, 5, 16, - 3, 5, 2, 3, 2, 0, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_22, avepool_nonsquare_result_17, 1, 15, 15, 16, 7, 5, 16, 3, + 5, 2, 3, 2, 0, 2, 3.0f); } diff --git 
a/tests/validation_xt800/avgpool_q7_1.c b/tests/validation_xt800/avgpool_q7_1.c index 3fe300d6..2aabd335 100644 --- a/tests/validation_xt800/avgpool_q7_1.c +++ b/tests/validation_xt800/avgpool_q7_1.c @@ -16,60 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" - +#include "test_utils.h" -extern void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference); +extern void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t out_lshift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of avgpool q7 for xt800.\n"); - verify_avgpool2d_q7(pooling_input_00, avepool_result_0, 1, 32, 32, 4, 30, 30, 4, - 3, 3, 1, 1, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_00, avepool_result_0, 1, 32, 32, 4, 30, 30, 4, 3, 3, 1, 1, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_01, avepool_result_1, 1, 32, 32, 4, 16, 16, 4, - 2, 2, 2, 2, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_01, avepool_result_1, 1, 32, 32, 4, 16, 16, 4, 2, 2, 2, 2, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_02, avepool_result_2, 1, 32, 32, 4, 17, 17, 4, - 2, 2, 2, 2, 1, 1, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_02, avepool_result_2, 1, 32, 32, 4, 17, 17, 4, 2, 2, 2, 2, 1, + 1, 0, 1.0f); - 
verify_avgpool2d_q7(pooling_input_10, avepool_result_3, 1, 64, 64, 1, 62, 62, 1, - 3, 3, 1, 1, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_10, avepool_result_3, 1, 64, 64, 1, 62, 62, 1, 3, 3, 1, 1, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_11, avepool_result_4, 1, 64, 64, 1, 32, 32, 1, - 2, 2, 2, 2, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_11, avepool_result_4, 1, 64, 64, 1, 32, 32, 1, 2, 2, 2, 2, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_12, avepool_result_5, 1, 64, 64, 1, 33, 33, 1, - 2, 2, 2, 2, 1, 1, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_12, avepool_result_5, 1, 64, 64, 1, 33, 33, 1, 2, 2, 2, 2, 1, + 1, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_20, avepool_result_6, 1, 16, 16, 16, 14, 14, 16, - 3, 3, 1, 1, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_20, avepool_result_6, 1, 16, 16, 16, 14, 14, 16, 3, 3, 1, 1, + 0, 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_21, avepool_result_7, 1, 16, 16, 16, 8, 8, 16, - 2, 2, 2, 2, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_21, avepool_result_7, 1, 16, 16, 16, 8, 8, 16, 2, 2, 2, 2, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_22, avepool_result_8, 1, 16, 16, 16, 9, 9, 16, - 2, 2, 2, 2, 1, 1, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_22, avepool_result_8, 1, 16, 16, 16, 9, 9, 16, 2, 2, 2, 2, 1, + 1, 0, 1.0f); } diff --git a/tests/validation_xt800/avgpool_q7_2.c b/tests/validation_xt800/avgpool_q7_2.c index c2ae610c..ed2db4b8 100644 --- a/tests/validation_xt800/avgpool_q7_2.c +++ b/tests/validation_xt800/avgpool_q7_2.c @@ -16,61 +16,48 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" - +#include "test_utils.h" -extern void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference); +extern void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t out_lshift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of avgpool q7 for xt800.\n"); - /* ---------------- leftover ------------------------*/ // FIXME: error output - verify_avgpool2d_q7(pooling_input_00, avepool_result_9, 1, 31, 31, 4, 29, 29, 4, - 3, 3, 1, 1, 0, 0, 0, 3.0f); + /* ---------------- leftover ------------------------*/ // FIXME: error output + verify_avgpool2d_q7(pooling_input_00, avepool_result_9, 1, 31, 31, 4, 29, 29, 4, 3, 3, 1, 1, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_01, avepool_result_10, 1, 31, 31, 4, 15, 15, 4, - 3, 3, 2, 2, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_01, avepool_result_10, 1, 31, 31, 4, 15, 15, 4, 3, 3, 2, 2, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_02, avepool_result_11, 1, 31, 31, 4, 16, 16, 4, - 3, 3, 2, 2, 1, 1, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_02, avepool_result_11, 1, 31, 31, 4, 16, 16, 4, 3, 3, 2, 2, 1, + 1, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_10, avepool_result_12, 1, 63, 63, 1, 61, 61, 1, - 3, 
3, 1, 1, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_10, avepool_result_12, 1, 63, 63, 1, 61, 61, 1, 3, 3, 1, 1, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_11, avepool_result_13, 1, 63, 63, 1, 31, 31, 1, - 3, 3, 2, 2, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_11, avepool_result_13, 1, 63, 63, 1, 31, 31, 1, 3, 3, 2, 2, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_12, avepool_result_14, 1, 63, 63, 1, 32, 32, 1, - 3, 3, 2, 2, 1, 1, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_12, avepool_result_14, 1, 63, 63, 1, 32, 32, 1, 3, 3, 2, 2, 1, + 1, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_20, avepool_result_15, 1, 15, 15, 16, 13, 13, 16, - 3, 3, 1, 1, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_20, avepool_result_15, 1, 15, 15, 16, 13, 13, 16, 3, 3, 1, 1, + 0, 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_21, avepool_result_16, 1, 15, 15, 16, 7, 7, 16, - 3, 3, 2, 2, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_21, avepool_result_16, 1, 15, 15, 16, 7, 7, 16, 3, 3, 2, 2, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_22, avepool_result_17, 1, 15, 15, 16, 8, 8, 16, - 3, 3, 2, 2, 1, 1, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_22, avepool_result_17, 1, 15, 15, 16, 8, 8, 16, 3, 3, 2, 2, 1, + 1, 0, 3.0f); } diff --git a/tests/validation_xt800/convolution_1x1_q7_1.c b/tests/validation_xt800/convolution_1x1_q7_1.c index 9957b7c2..3823e8d1 100644 --- a/tests/validation_xt800/convolution_1x1_q7_1.c +++ b/tests/validation_xt800/convolution_1x1_q7_1.c @@ -16,70 +16,50 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_1x1_conv.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_1x1_conv.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of convolution 1x1 q7 for xt800.\n"); - /* -------------- conv2d 1x1 --------------- */ - verify_conv2d_q7(q7_1x1_conv_input_0, q7_1x1_conv_weight_0, q7_1x1_conv_bias_0, q7_1x1_conv_result_0, - 1, 32, 32, 16, 32, 32, 32, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_1x1_conv_input_0, q7_1x1_conv_weight_0, q7_1x1_conv_bias_0, + q7_1x1_conv_result_0, 1, 32, 32, 16, 32, 32, 32, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); /* leftover test */ - verify_conv2d_q7(q7_1x1_conv_input_0, q7_1x1_conv_weight_0, q7_1x1_conv_bias_0, q7_1x1_conv_result_3, - 1, 31, 31, 12, 31, 31, 30, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_1x1_conv_input_0, q7_1x1_conv_weight_0, q7_1x1_conv_bias_0, + q7_1x1_conv_result_3, 1, 31, 31, 12, 31, 31, 30, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); - verify_conv2d_q7(q7_1x1_conv_input_1, q7_1x1_conv_weight_1, 
q7_1x1_conv_bias_1, q7_1x1_conv_result_1, - 1, 64, 16, 16, 64, 16, 16, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_1x1_conv_input_1, q7_1x1_conv_weight_1, q7_1x1_conv_bias_1, + q7_1x1_conv_result_1, 1, 64, 16, 16, 64, 16, 16, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); /* leftover test */ - verify_conv2d_q7(q7_1x1_conv_input_1, q7_1x1_conv_weight_1, q7_1x1_conv_bias_1, q7_1x1_conv_result_4, - 1, 63, 15, 12, 63, 15, 12, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); - + verify_conv2d_q7(q7_1x1_conv_input_1, q7_1x1_conv_weight_1, q7_1x1_conv_bias_1, + q7_1x1_conv_result_4, 1, 63, 15, 12, 63, 15, 12, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); // TODO: ld: region `DATA' overflowed by 41200 bytes - // verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, q7_1x1_conv_result_2, + // verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, + // q7_1x1_conv_result_2, // 1, 16, 64, 16, 16, 64, 48, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); // // /* leftover test */ - // verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, q7_1x1_conv_result_5, + // verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, + // q7_1x1_conv_result_5, // 1, 15, 63, 12, 15, 63, 40, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); } - - - - - - - - diff --git a/tests/validation_xt800/convolution_1x1_q7_2.c b/tests/validation_xt800/convolution_1x1_q7_2.c index e2087a79..5f0284ef 100644 --- a/tests/validation_xt800/convolution_1x1_q7_2.c +++ b/tests/validation_xt800/convolution_1x1_q7_2.c @@ -16,54 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_1x1_conv.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_1x1_conv.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of convolution 1x1 q7 for xt800.\n"); // TODO: ld: region `DATA' overflowed by 41200 bytes - verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, q7_1x1_conv_result_2, - 1, 16, 64, 16, 16, 64, 48, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, + q7_1x1_conv_result_2, 1, 16, 64, 16, 16, 64, 48, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); // /* leftover test */ - verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, q7_1x1_conv_result_5, - 1, 15, 63, 12, 15, 63, 40, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); - + verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, + q7_1x1_conv_result_5, 1, 15, 63, 12, 15, 63, 40, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); } - - - - - - - - diff --git 
a/tests/validation_xt800/convolution_RGB_q7.c b/tests/validation_xt800/convolution_RGB_q7.c index 11164e16..0ce699e0 100644 --- a/tests/validation_xt800/convolution_RGB_q7.c +++ b/tests/validation_xt800/convolution_RGB_q7.c @@ -16,85 +16,70 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_RGB.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_RGB.dat" +#include "test_utils.h" + +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); - - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution RGB q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, - 1, 32, 32, 3, 30, 30, 16, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, 1, 32, 32, + 3, 30, 30, 16, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, - 1, 32, 32, 3, 32, 32, 16, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, 1, 32, 
32, + 3, 32, 32, 16, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, - 1, 32, 32, 3, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, 1, 32, 32, + 3, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, - 1, 32, 32, 3, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, 1, 32, 32, + 3, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, - 1, 32, 32, 3, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, 1, 32, 32, + 3, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, - 1, 32, 32, 3, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, 1, 32, 32, + 3, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, - 1, 32, 32, 3, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, 1, 32, 32, + 3, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, - 1, 32, 32, 3, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, 1, 32, 32, + 3, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); /* leftover test */ - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_8, - 1, 31, 31, 3, 29, 29, 15, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + 
verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_8, 1, 31, 31, + 3, 29, 29, 15, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_9, - 1, 31, 31, 3, 31, 31, 15, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_9, 1, 31, 31, + 3, 31, 31, 15, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_10, - 1, 31, 31, 3, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_10, 1, 31, + 31, 3, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_11, - 1, 31, 31, 3, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_11, 1, 31, + 31, 3, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_12, - 1, 31, 31, 3, 1, 1, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_12, 1, 31, + 31, 3, 1, 1, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_13, - 1, 31, 31, 3, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_13, 1, 31, + 31, 3, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_14, - 1, 31, 31, 3, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_14, 1, 31, + 31, 3, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, 
q7_conv_result_15, - 1, 31, 31, 3, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_15, 1, 31, + 31, 3, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_basic_q7_1.c b/tests/validation_xt800/convolution_basic_q7_1.c index a3809edb..101c2c67 100644 --- a/tests/validation_xt800/convolution_basic_q7_1.c +++ b/tests/validation_xt800/convolution_basic_q7_1.c @@ -16,49 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of convolution basic q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, - 1, 32, 32, 16, 30, 30, 32, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, 1, 32, 32, + 16, 30, 30, 32, 3, 3, 1, 
1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, - 1, 32, 32, 16, 32, 32, 32, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, 1, 32, 32, + 16, 32, 32, 32, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_8, - 1, 31, 31, 15, 29, 29, 30, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_8, 1, 31, 31, + 15, 29, 29, 30, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_9, - 1, 31, 31, 15, 31, 31, 30, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_9, 1, 31, 31, + 15, 31, 31, 30, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_basic_q7_2.c b/tests/validation_xt800/convolution_basic_q7_2.c index 36a997d5..9b5ae169 100644 --- a/tests/validation_xt800/convolution_basic_q7_2.c +++ b/tests/validation_xt800/convolution_basic_q7_2.c @@ -16,55 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of convolution basic q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, - 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, 1, 32, 32, + 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, - 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, 1, 32, 32, + 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, - 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, 
q7_conv_bias_1, q7_conv_result_4, 1, 32, 32, + 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_10, - 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_10, 1, 31, + 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_11, - 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_11, 1, 31, + 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_12, - 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_12, 1, 31, + 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_basic_q7_3.c b/tests/validation_xt800/convolution_basic_q7_3.c index ef3f2789..eba985df 100644 --- a/tests/validation_xt800/convolution_basic_q7_3.c +++ b/tests/validation_xt800/convolution_basic_q7_3.c @@ -16,55 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Third testing function of convolution basic q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, - 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, 1, 32, 32, + 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, - 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, 1, 32, 32, + 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, - 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, 
q7_conv_bias_2, q7_conv_result_7, 1, 32, 32, + 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_13, - 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_13, 1, 31, + 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_14, - 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_14, 1, 31, + 31, 15, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_15, - 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_15, 1, 31, + 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_nonsquare_q7_1.c b/tests/validation_xt800/convolution_nonsquare_q7_1.c index a9afe4ab..dfe5528e 100644 --- a/tests/validation_xt800/convolution_nonsquare_q7_1.c +++ b/tests/validation_xt800/convolution_nonsquare_q7_1.c @@ -16,50 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of convolution nonsquare q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, - 1, 32, 32, 16, 30, 30, 32, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, 1, 32, 32, + 16, 30, 30, 32, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, - 1, 32, 32, 16, 32, 32, 32, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, 1, 32, 32, + 16, 32, 32, 32, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_16, - 1, 31, 31, 12, 29, 29, 30, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, 
q7_conv_bias_0, q7_conv_result_16, 1, 31, + 31, 12, 29, 29, 30, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_17, - 1, 31, 31, 12, 31, 31, 30, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_17, 1, 31, + 31, 12, 31, 31, 30, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); } - diff --git a/tests/validation_xt800/convolution_nonsquare_q7_2.c b/tests/validation_xt800/convolution_nonsquare_q7_2.c index 89df723f..b1034a5e 100644 --- a/tests/validation_xt800/convolution_nonsquare_q7_2.c +++ b/tests/validation_xt800/convolution_nonsquare_q7_2.c @@ -16,55 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of convolution nonsquare q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, 
q7_conv_result_2, - 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, 1, 32, 32, + 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, - 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, 1, 32, 32, + 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, - 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, 1, 32, 32, + 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_18, - 1, 31, 31, 12, 27, 27, 14, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_18, 1, 31, + 31, 12, 27, 27, 14, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_19, - 1, 31, 31, 12, 31, 31, 14, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_19, 1, 31, + 31, 12, 31, 31, 14, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_20, - 1, 31, 31, 12, 11, 11, 14, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_20, 1, 31, + 31, 12, 11, 11, 14, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_nonsquare_q7_3.c b/tests/validation_xt800/convolution_nonsquare_q7_3.c index ce8d8178..ecb3afb8 100644 --- a/tests/validation_xt800/convolution_nonsquare_q7_3.c +++ b/tests/validation_xt800/convolution_nonsquare_q7_3.c @@ 
-16,55 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Third testing function of convolution nonsquare q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, - 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, 1, 32, 32, + 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, - 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, 1, 32, 32, + 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, - 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + 
verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, 1, 32, 32, + 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_21, - 1, 31, 31, 12, 25, 25, 14, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_21, 1, 31, + 31, 12, 25, 25, 14, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_22, - 1, 31, 31, 12, 31, 31, 14, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_22, 1, 31, + 31, 12, 31, 31, 14, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_23, - 1, 31, 31, 12, 9, 9, 14, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_23, 1, 31, + 31, 12, 9, 9, 14, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_q15.c b/tests/validation_xt800/convolution_q15.c index 73b2a484..978f2540 100644 --- a/tests/validation_xt800/convolution_q15.c +++ b/tests/validation_xt800/convolution_q15.c @@ -16,65 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q15_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q15_conv_basic.dat" +#include "test_utils.h" +extern void verify_conv2d_q15(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -extern void verify_conv2d_q15(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); - - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution q15 for xt800.\n"); - verify_conv2d_q15(q15_conv_input_3, q15_conv_weight_3, q15_conv_bias_3, q15_conv_result_16, - 1, 16, 16, 8, 14, 14, 8, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q15(q15_conv_input_3, q15_conv_weight_3, q15_conv_bias_3, q15_conv_result_16, 1, + 16, 16, 8, 14, 14, 8, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q15(q15_conv_input_3, q15_conv_weight_3, q15_conv_bias_3, q15_conv_result_17, - 1, 16, 16, 8, 16, 16, 8, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_3, q15_conv_weight_3, q15_conv_bias_3, q15_conv_result_17, 1, + 16, 16, 8, 16, 16, 8, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_18, - 1, 16, 16, 8, 12, 12, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_4, 
q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_18, 1, + 16, 16, 8, 12, 12, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_19, - 1, 16, 16, 8, 16, 16, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_19, 1, + 16, 16, 8, 16, 16, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_20, - 1, 16, 16, 8, 6, 6, 16, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_20, 1, + 16, 16, 8, 6, 6, 16, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); - verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_21, - 1, 16, 16, 8, 10, 10, 24, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_22, - 1, 16, 16, 8, 16, 16, 24, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - - verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_23, - 1, 16, 16, 8, 6, 6, 24, 7, 7, 3, 3, 3, 3, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_21, 1, + 16, 16, 8, 10, 10, 24, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_22, 1, + 16, 16, 8, 16, 16, 24, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_23, 1, + 16, 16, 8, 6, 6, 24, 7, 7, 3, 3, 3, 3, 0, 12, 0.0f); // FIXME: ld: region `DATA' overflowed by 41200 bytes // verify_conv2d_q15(q15_conv_input_0, q15_conv_weight_0, q15_conv_bias_0, q15_conv_result_0, @@ -126,4 +108,3 @@ int main(int argc, char** argv) // verify_conv2d_q15(q15_conv_input_2, q15_conv_weight_2, q15_conv_bias_2, q15_conv_result_15, // 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 
0.0f); } - diff --git a/tests/validation_xt800/depthwise_convolution_nonsquare_q7.c b/tests/validation_xt800/depthwise_convolution_nonsquare_q7.c index a622a895..81a36ebe 100644 --- a/tests/validation_xt800/depthwise_convolution_nonsquare_q7.c +++ b/tests/validation_xt800/depthwise_convolution_nonsquare_q7.c @@ -16,76 +16,71 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - +#include "test_utils.h" -extern void verify_depthwise_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, +extern void verify_depthwise_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, + void *ref_data, uint16_t batch, uint16_t in_h, uint16_t in_w, + uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t bias_shift, uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nonsquare q7 for xt800.\n"); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_0, - 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_1, - 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + 
q7_depthwise_conv_result_0, 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_2, - 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_1, 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, + 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_2, 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, + 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_3, - 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_3, 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_4, - 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_4, 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, + 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_5, - 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_5, 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, + 1, 0, 12, 0.0f); /* leftover test */ - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_6, - 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_7, - 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - - 
verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_8, - 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_6, 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, + 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_7, 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, + 2, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_9, - 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_8, 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, + 2, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_10, - 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_9, 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_11, - 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_10, 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, + 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_11, 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, + 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/depthwise_convolution_q7.c b/tests/validation_xt800/depthwise_convolution_q7.c index 53ed208b..630c700f 100644 --- a/tests/validation_xt800/depthwise_convolution_q7.c +++ b/tests/validation_xt800/depthwise_convolution_q7.c @@ -16,75 +16,71 @@ * limitations 
under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - +#include "test_utils.h" -extern void verify_depthwise_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, +extern void verify_depthwise_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, + void *ref_data, uint16_t batch, uint16_t in_h, uint16_t in_w, + uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t bias_shift, uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution q7 for xt800.\n"); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_0, - 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_1, - 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_0, 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_2, - 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_1, 1, 
32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, + 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_2, 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, + 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_3, - 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_3, 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_4, - 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_4, 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, + 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_5, - 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_5, 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, + 1, 0, 12, 0.0f); /* leftover test */ - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_6, - 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_7, - 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_6, 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_8, - 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, 
q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_7, 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, + 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_8, 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, + 2, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_9, - 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_9, 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_10, - 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_10, 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, + 3, 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_11, - 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_11, 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, + 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/fullyconnected_q15.c b/tests/validation_xt800/fullyconnected_q15.c index 68560be2..9c35891e 100644 --- a/tests/validation_xt800/fullyconnected_q15.c +++ b/tests/validation_xt800/fullyconnected_q15.c @@ -16,28 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/fully_data_q15.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/fully_data_q15.dat" - +#include "test_utils.h" -static void verify_fullyconnected_q15(void *input_data, - void *weight_data, - void *bias_data, - void *ref_data, - uint16_t in_nodes, - uint16_t out_nodes, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +static void verify_fullyconnected_q15(void *input_data, void *weight_data, void *bias_data, + void *ref_data, uint16_t in_nodes, uint16_t out_nodes, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, weight_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_nodes; input->dim_count = 2; @@ -45,7 +38,7 @@ static void verify_fullyconnected_q15(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1]; - struct csi_tensor *weight = csi_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); weight->dim[0] = out_nodes; weight->dim[1] = in_nodes; weight->dim_count = 2; @@ -53,7 +46,7 @@ static void verify_fullyconnected_q15(void *input_data, weight->name = "weight"; weight_size = weight->dim[0] * weight->dim[1]; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_nodes; bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT16; @@ -61,7 +54,7 @@ static void verify_fullyconnected_q15(void *input_data, bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_nodes; 
output->dim_count = 2; @@ -70,22 +63,21 @@ static void verify_fullyconnected_q15(void *input_data, out_size = output->dim[0] * output->dim[1]; output->qinfo->shift = out_shift; - struct fc_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.units = out_nodes; - - input->data = (uint16_t *)input_data; - weight->data = (uint16_t *)weight_data; - bias->data = (uint16_t *)bias_data; - reference->data = (uint16_t *)ref_data; + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->units = out_nodes; + + input->data = (uint16_t *)input_data; + weight->data = (uint16_t *)weight_data; + bias->data = (uint16_t *)bias_data; + reference->data = (uint16_t *)ref_data; uint16_t *output_tmp = (uint16_t *)malloc(out_size * sizeof(uint16_t)); - output->data = output_tmp; + output->data = output_tmp; - if (csi_fullyconnected_init(input, output, weight, bias, &params) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, &params); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); @@ -97,26 +89,25 @@ static void verify_fullyconnected_q15(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected q15 for xt800.\n"); - verify_fullyconnected_q15(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, fully_connect_result_6, - 256, 128, 0, 8, 0.0f); + verify_fullyconnected_q15(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, + fully_connect_result_6, 256, 128, 0, 8, 0.0f); -
verify_fullyconnected_q15(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, fully_connect_result_7, - 256, 64, 0, 10, 0.0f); + verify_fullyconnected_q15(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, + fully_connect_result_7, 256, 64, 0, 10, 0.0f); - verify_fullyconnected_q15(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, fully_connect_result_8, - 128, 128, 0, 12, 0.0f); + verify_fullyconnected_q15(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, + fully_connect_result_8, 128, 128, 0, 12, 0.0f); - verify_fullyconnected_q15(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, fully_connect_result_9, - 255, 127, 0, 8, 0.0f); + verify_fullyconnected_q15(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, + fully_connect_result_9, 255, 127, 0, 8, 0.0f); - verify_fullyconnected_q15(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, fully_connect_result_10, - 255, 63, 0, 10, 0.0f); + verify_fullyconnected_q15(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, + fully_connect_result_10, 255, 63, 0, 10, 0.0f); - verify_fullyconnected_q15(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, fully_connect_result_11, - 127, 127, 0, 12, 0.0f); + verify_fullyconnected_q15(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, + fully_connect_result_11, 127, 127, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/fullyconnected_q7.c b/tests/validation_xt800/fullyconnected_q7.c index 6903ebbb..687e3e2e 100644 --- a/tests/validation_xt800/fullyconnected_q7.c +++ b/tests/validation_xt800/fullyconnected_q7.c @@ -16,28 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/fully_data_q7.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/fully_data_q7.dat" - +#include "test_utils.h" -static void verify_fullyconnected_q7(void *input_data, - void *weight_data, - void *bias_data, - void *ref_data, - uint16_t in_nodes, - uint16_t out_nodes, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +static void verify_fullyconnected_q7(void *input_data, void *weight_data, void *bias_data, + void *ref_data, uint16_t in_nodes, uint16_t out_nodes, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, weight_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_nodes; input->dim_count = 2; @@ -45,8 +38,7 @@ static void verify_fullyconnected_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1]; - - struct csi_tensor *weight = csi_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); weight->dim[0] = out_nodes; weight->dim[1] = in_nodes; weight->dim_count = 2; @@ -54,8 +46,7 @@ static void verify_fullyconnected_q7(void *input_data, weight->name = "weight"; weight_size = weight->dim[0] * weight->dim[1]; - - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_nodes; bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT8; @@ -63,7 +54,7 @@ static void verify_fullyconnected_q7(void *input_data, bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_nodes; output->dim_count 
= 2; @@ -72,22 +63,21 @@ static void verify_fullyconnected_q7(void *input_data, out_size = output->dim[0] * output->dim[1]; output->qinfo->shift = out_shift; - struct fc_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.units = out_nodes; - - input->data = (uint8_t *)input_data; - weight->data = (uint8_t *)weight_data; - bias->data = (uint8_t *)bias_data; - reference->data = (uint8_t *)ref_data; + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->units = out_nodes; + + input->data = (uint8_t *)input_data; + weight->data = (uint8_t *)weight_data; + bias->data = (uint8_t *)bias_data; + reference->data = (uint8_t *)ref_data; uint8_t *output_tmp = (uint8_t *)malloc(out_size); - output->data = output_tmp; + output->data = output_tmp; - if (csi_fullyconnected_init(input, output, weight, bias, &params) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, &params); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); @@ -99,27 +89,26 @@ static void verify_fullyconnected_q7(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected q7 for xt800.\n"); - verify_fullyconnected_q7(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, fully_connect_result_6, - 256, 128, 0, 8, 0.0f); + verify_fullyconnected_q7(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, + fully_connect_result_6, 256, 128, 0, 8, 0.0f); - verify_fullyconnected_q7(fully_connect_input_4, fully_connect_weight_4, 
fully_connect_bias_4, fully_connect_result_7, - 256, 64, 0, 10, 0.0f); + verify_fullyconnected_q7(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, + fully_connect_result_7, 256, 64, 0, 10, 0.0f); - verify_fullyconnected_q7(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, fully_connect_result_8, - 128, 128, 0, 12, 0.0f); + verify_fullyconnected_q7(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, + fully_connect_result_8, 128, 128, 0, 12, 0.0f); /* leftover test */ - verify_fullyconnected_q7(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, fully_connect_result_9, - 255, 127, 0, 8, 0.0f); + verify_fullyconnected_q7(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, + fully_connect_result_9, 255, 127, 0, 8, 0.0f); - verify_fullyconnected_q7(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, fully_connect_result_10, - 255, 63, 0, 10, 0.0f); + verify_fullyconnected_q7(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, + fully_connect_result_10, 255, 63, 0, 10, 0.0f); - verify_fullyconnected_q7(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, fully_connect_result_11, - 127, 127, 0, 12, 0.0f); + verify_fullyconnected_q7(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, + fully_connect_result_11, 127, 127, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/maxpool_q7_1.c b/tests/validation_xt800/maxpool_q7_1.c index ea884ef2..6d9cacf1 100644 --- a/tests/validation_xt800/maxpool_q7_1.c +++ b/tests/validation_xt800/maxpool_q7_1.c @@ -16,58 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" +#include "test_utils.h" -extern void verify_maxpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference); +extern void verify_maxpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of maxpool q7 for xt800.\n"); - verify_maxpool2d_q7(pooling_input_00, maxpool2d_result_0, 1, 32, 32, 4, 30, 30, 4, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_00, maxpool2d_result_0, 1, 32, 32, 4, 30, 30, 4, 3, 3, 1, 1, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_01, maxpool2d_result_1, 1, 32, 32, 4, 16, 16, 4, - 2, 2, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_01, maxpool2d_result_1, 1, 32, 32, 4, 16, 16, 4, 2, 2, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_02, maxpool2d_result_2, 1, 32, 32, 4, 17, 17, 4, - 2, 2, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_02, maxpool2d_result_2, 1, 32, 32, 4, 17, 17, 4, 2, 2, 2, 2, + 1, 1, 0.0f); - verify_maxpool2d_q7(pooling_input_10, maxpool2d_result_3, 1, 64, 64, 1, 62, 62, 1, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_10, maxpool2d_result_3, 1, 64, 64, 1, 62, 62, 1, 3, 3, 1, 1, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_11, maxpool2d_result_4, 1, 64, 64, 1, 32, 32, 1, 
- 2, 2, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_11, maxpool2d_result_4, 1, 64, 64, 1, 32, 32, 1, 2, 2, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_12, maxpool2d_result_5, 1, 64, 64, 1, 33, 33, 1, - 2, 2, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_12, maxpool2d_result_5, 1, 64, 64, 1, 33, 33, 1, 2, 2, 2, 2, + 1, 1, 0.0f); - verify_maxpool2d_q7(pooling_input_20, maxpool2d_result_6, 1, 16, 16, 16, 14, 14, 16, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_20, maxpool2d_result_6, 1, 16, 16, 16, 14, 14, 16, 3, 3, 1, 1, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_21, maxpool2d_result_7, 1, 16, 16, 16, 8, 8, 16, - 2, 2, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_21, maxpool2d_result_7, 1, 16, 16, 16, 8, 8, 16, 2, 2, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_22, maxpool2d_result_8, 1, 16, 16, 16, 9, 9, 16, - 2, 2, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_22, maxpool2d_result_8, 1, 16, 16, 16, 9, 9, 16, 2, 2, 2, 2, + 1, 1, 0.0f); } diff --git a/tests/validation_xt800/maxpool_q7_2.c b/tests/validation_xt800/maxpool_q7_2.c index 25eb93ff..5ac857ae 100644 --- a/tests/validation_xt800/maxpool_q7_2.c +++ b/tests/validation_xt800/maxpool_q7_2.c @@ -16,59 +16,48 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" +#include "test_utils.h" -extern void verify_maxpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference); +extern void verify_maxpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of maxpool q7 for xt800.\n"); /* ---------------- leftover ------------------------*/ - verify_maxpool2d_q7(pooling_input_00, maxpool2d_result_9, 1, 31, 31, 4, 29, 29, 4, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_00, maxpool2d_result_9, 1, 31, 31, 4, 29, 29, 4, 3, 3, 1, 1, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_01, maxpool2d_result_10, 1, 31, 31, 4, 15, 15, 4, - 3, 3, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_01, maxpool2d_result_10, 1, 31, 31, 4, 15, 15, 4, 3, 3, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_02, maxpool2d_result_11, 1, 31, 31, 4, 16, 16, 4, - 3, 3, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_02, maxpool2d_result_11, 1, 31, 31, 4, 16, 16, 4, 3, 3, 2, 2, + 1, 1, 0.0f); - verify_maxpool2d_q7(pooling_input_10, maxpool2d_result_12, 1, 63, 63, 1, 61, 61, 1, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_10, maxpool2d_result_12, 1, 63, 63, 1, 61, 61, 1, 3, 3, 1, 1, + 0, 0, 0.0f); - 
verify_maxpool2d_q7(pooling_input_11, maxpool2d_result_13, 1, 63, 63, 1, 31, 31, 1, - 3, 3, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_11, maxpool2d_result_13, 1, 63, 63, 1, 31, 31, 1, 3, 3, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_12, maxpool2d_result_14, 1, 63, 63, 1, 32, 32, 1, - 3, 3, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_12, maxpool2d_result_14, 1, 63, 63, 1, 32, 32, 1, 3, 3, 2, 2, + 1, 1, 0.0f); - verify_maxpool2d_q7(pooling_input_20, maxpool2d_result_15, 1, 15, 15, 16, 13, 13, 16, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_20, maxpool2d_result_15, 1, 15, 15, 16, 13, 13, 16, 3, 3, 1, + 1, 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_21, maxpool2d_result_16, 1, 15, 15, 16, 7, 7, 16, - 3, 3, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_21, maxpool2d_result_16, 1, 15, 15, 16, 7, 7, 16, 3, 3, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_22, maxpool2d_result_17, 1, 15, 15, 16, 8, 8, 16, - 3, 3, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_22, maxpool2d_result_17, 1, 15, 15, 16, 8, 8, 16, 3, 3, 2, 2, + 1, 1, 0.0f); } diff --git a/tests/validation_xt800/relu_q15.c b/tests/validation_xt800/relu_q15.c index 356d464d..18d6bb29 100644 --- a/tests/validation_xt800/relu_q15.c +++ b/tests/validation_xt800/relu_q15.c @@ -16,47 +16,42 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_relu_q15(void *input_data, - void *ref_data, - int32_t size, - float difference) +static void verify_relu_q15(void *input_data, void *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT16; input->name = "input"; in_size = input->dim[0]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT16; output->name = "output"; out_size = output->dim[0]; - struct relu_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint16_t *)input_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + reference->data = (uint16_t *)ref_data; - if (csi_relu_init(input, output, &params) == CSINN_TRUE) { - csi_relu(input, output, &params); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); free(input); @@ -64,8 +59,7 @@ static void verify_relu_q15(void *input_data, free(reference); } - -int 
main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu q15 for xt800.\n"); @@ -74,7 +68,7 @@ int main(int argc, char** argv) verify_relu_q15(q15_relu_input2, q15_relu_result2, 1024, 0.0f); verify_relu_q15(q15_relu_input3, q15_relu_result3, 1024, 0.0f); verify_relu_q15(q15_relu_input4, q15_relu_result4, 1024, 0.0f); - + verify_relu_q15(q15_relu_input5, q15_relu_result0, 1023, 0.0f); verify_relu_q15(q15_relu_input6, q15_relu_result1, 1023, 0.0f); verify_relu_q15(q15_relu_input7, q15_relu_result2, 1023, 0.0f); diff --git a/tests/validation_xt800/relu_q7.c b/tests/validation_xt800/relu_q7.c index 7e7a5f24..f0684fbf 100644 --- a/tests/validation_xt800/relu_q7.c +++ b/tests/validation_xt800/relu_q7.c @@ -16,47 +16,42 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_relu_q7(void *input_data, - void *ref_data, - int32_t size, - float difference) +static void verify_relu_q7(void *input_data, void *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT8; input->name = "input"; in_size = input->dim[0]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT8; output->name = "output"; out_size = output->dim[0]; - struct relu_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)ref_data; - if (csi_relu_init(input, output, &params) == CSINN_TRUE) { - csi_relu(input, output, &params); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); free(input); @@ -64,8 +59,7 @@ static void verify_relu_q7(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu q7 for xt800.\n"); @@ -74,7 +68,7 @@ int main(int argc, char** argv) verify_relu_q7(q7_relu_input2, q7_relu_result2, 1024, 0.0f); verify_relu_q7(q7_relu_input3, q7_relu_result3, 1024, 0.0f); verify_relu_q7(q7_relu_input4, q7_relu_result4, 1024, 0.0f); - + verify_relu_q7(q7_relu_input5, q7_relu_result0, 1023, 0.0f); verify_relu_q7(q7_relu_input6, q7_relu_result1, 1023, 0.0f); verify_relu_q7(q7_relu_input7, q7_relu_result2, 1023, 0.0f); diff --git a/tests/validation_xt800/sigmoid_q15.c b/tests/validation_xt800/sigmoid_q15.c index 12ae0b7d..6da09e3d 100644 --- a/tests/validation_xt800/sigmoid_q15.c +++ b/tests/validation_xt800/sigmoid_q15.c @@ -16,25 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_sigmoid_q15(void *input_data, - void *ref_data, - int32_t size, - float input_min, - float input_max, - float difference) +static void verify_sigmoid_q15(void *input_data, void *ref_data, int32_t size, float input_min, + float input_max, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); - int in_size, out_size; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT16; @@ -43,24 +38,24 @@ static void verify_sigmoid_q15(void *input_data, input->qinfo->min = input_min; input->qinfo->max = input_max; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT16; output->name = "output"; out_size = output->dim[0]; - struct sigmoid_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint16_t *)input_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + reference->data = (uint16_t *)ref_data; - if (csi_sigmoid_init(input, output, &params) == CSINN_TRUE) { - csi_sigmoid(input, output, &params); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, 
output, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); @@ -69,8 +64,7 @@ static void verify_sigmoid_q15(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid q15 for xt800.\n"); diff --git a/tests/validation_xt800/sigmoid_q7.c b/tests/validation_xt800/sigmoid_q7.c index e10cfd02..b1abe865 100644 --- a/tests/validation_xt800/sigmoid_q7.c +++ b/tests/validation_xt800/sigmoid_q7.c @@ -16,25 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_sigmoid_q7(void *input_data, - void *ref_data, - int32_t size, - float input_min, - float input_max, - float difference) +static void verify_sigmoid_q7(void *input_data, void *ref_data, int32_t size, float input_min, + float input_max, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); - int in_size, out_size; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT8; @@ -43,24 +38,24 @@ static void verify_sigmoid_q7(void *input_data, input->qinfo->min = input_min; input->qinfo->max = input_max; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT8; output->name = "output"; out_size = output->dim[0]; - struct sigmoid_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = 
CSINN_RM_LAYER; + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)ref_data; - if (csi_sigmoid_init(input, output, &params) == CSINN_TRUE) { - csi_sigmoid(input, output, &params); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); @@ -69,8 +64,7 @@ static void verify_sigmoid_q7(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid q7 for xt800.\n"); diff --git a/tests/validation_xt800/softmax_q15.c b/tests/validation_xt800/softmax_q15.c index c8447b6b..41474ef9 100644 --- a/tests/validation_xt800/softmax_q15.c +++ b/tests/validation_xt800/softmax_q15.c @@ -16,56 +16,51 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/softmax_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/softmax_data.dat" - +#include "test_utils.h" -static void verify_softmax_q15(void *input_data, - void *ref_data, - int32_t size, - float difference) +static void verify_softmax_q15(void *input_data, void *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT16; input->name = "input"; in_size = input->dim[0]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT16; output->name = "output"; out_size = output->dim[0]; - struct softmax_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint16_t *)input_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + reference->data = (uint16_t *)ref_data; - if (csi_softmax_init(input, output, &params) == CSINN_TRUE) { - csi_softmax(input, output, &params); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); free(input); free(output); free(reference); -} - +} -int main(int argc, 
char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax q15 for xt800.\n"); diff --git a/tests/validation_xt800/softmax_q7.c b/tests/validation_xt800/softmax_q7.c index 553b3c8f..7a9b2303 100644 --- a/tests/validation_xt800/softmax_q7.c +++ b/tests/validation_xt800/softmax_q7.c @@ -16,56 +16,51 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/softmax_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/softmax_data.dat" - +#include "test_utils.h" -static void verify_softmax_q7(void *input_data, - void *ref_data, - int32_t size, - float difference) +static void verify_softmax_q7(void *input_data, void *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT8; input->name = "input"; in_size = input->dim[0]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT8; output->name = "output"; out_size = output->dim[0]; - struct softmax_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)ref_data; - if 
(csi_softmax_init(input, output, &params) == CSINN_TRUE) { - csi_softmax(input, output, &params); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); free(input); free(output); free(reference); -} - +} -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax q7 for xt800.\n"); diff --git a/tests/validation_xt800/tanh_q15.c b/tests/validation_xt800/tanh_q15.c index 8bf7c4d2..bc77bc8e 100644 --- a/tests/validation_xt800/tanh_q15.c +++ b/tests/validation_xt800/tanh_q15.c @@ -16,25 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_tanh_q15(void *input_data, - void *ref_data, - int32_t size, - float input_min, - float input_max, - float difference) +static void verify_tanh_q15(void *input_data, void *ref_data, int32_t size, float input_min, + float input_max, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); - int in_size, out_size; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT16; @@ -43,24 +38,23 @@ static void verify_tanh_q15(void *input_data, input->qinfo->min = input_min; input->qinfo->max = input_max; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT16; output->name = "output"; out_size = output->dim[0]; - struct siso_params params; - 
params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint16_t *)input_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + reference->data = (uint16_t *)ref_data; - if (csi_tanh_init(input, output, &params) == CSINN_TRUE) { - csi_tanh(input, output, &params); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); free(input); @@ -68,8 +62,7 @@ static void verify_tanh_q15(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh q15 for xt800.\n"); diff --git a/tests/validation_xt800/tanh_q7.c b/tests/validation_xt800/tanh_q7.c index dde2ff17..c4a1c8a2 100644 --- a/tests/validation_xt800/tanh_q7.c +++ b/tests/validation_xt800/tanh_q7.c @@ -16,25 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_tanh_q7(void *input_data, - void *ref_data, - int32_t size, - float input_min, - float input_max, - float difference) +static void verify_tanh_q7(void *input_data, void *ref_data, int32_t size, float input_min, + float input_max, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); - int in_size, out_size; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT8; @@ -43,24 +38,23 @@ static void verify_tanh_q7(void *input_data, input->qinfo->min = input_min; input->qinfo->max = input_max; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT8; output->name = "output"; out_size = output->dim[0]; - struct siso_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)ref_data; - if (csi_tanh_init(input, output, &params) == CSINN_TRUE) { - csi_tanh(input, output, &params); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } 
result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); free(input); @@ -68,8 +62,7 @@ static void verify_tanh_q7(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh q7 for xt800.\n"); diff --git a/tests/validation_xt800/u8_testcases/add_u8.c b/tests/validation_xt800/u8_testcases/add_u8.c index 00ac5a33..35140bb8 100644 --- a/tests/validation_xt800/u8_testcases/add_u8.c +++ b/tests/validation_xt800/u8_testcases/add_u8.c @@ -16,24 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/basic_math_func_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/basic_math_func_u8.dat" - +#include "test_utils.h" -static void verify_add_u8(float *input_0_data, - float *input_1_data, - float *ref_data, - int32_t size, +static void verify_add_u8(float *input_0_data, float *input_1_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = 1; input0->dim[1] = 1; input0->dim[2] = 1; @@ -47,13 +43,12 @@ static void verify_add_u8(float *input_0_data, in_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; uint8_t *src_tmp_0 = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp_0[i] = csi_ref_quantize_f32_to_u8(input_0_data[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp_0[i] = shl_ref_quantize_f32_to_u8(input_0_data[i], input0->qinfo); } input0->data = src_tmp_0; - - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = 1; 
input1->dim[1] = 1; input1->dim[2] = 1; @@ -67,13 +62,12 @@ static void verify_add_u8(float *input_0_data, in_size = input1->dim[0] * input1->dim[1] * input1->dim[2] * input1->dim[3]; uint8_t *src_tmp_1 = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp_1[i] = csi_ref_quantize_f32_to_u8(input_1_data[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp_1[i] = shl_ref_quantize_f32_to_u8(input_1_data[i], input1->qinfo); } input1->data = src_tmp_1; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -87,18 +81,16 @@ static void verify_add_u8(float *input_0_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(size); - struct diso_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_add_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_add(input0, input1, output, &params); + if (csinn_add_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_add(input0, input1, output, params); } - - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(input0); @@ -110,8 +102,7 @@ static void verify_add_u8(float *input_0_data, free(src_tmp_1); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elementwise add(u8) for i805.\n"); diff --git a/tests/validation_xt800/u8_testcases/clip_u8.c b/tests/validation_xt800/u8_testcases/clip_u8.c index d8d5c3d8..30388d85 100644 --- 
a/tests/validation_xt800/u8_testcases/clip_u8.c +++ b/tests/validation_xt800/u8_testcases/clip_u8.c @@ -16,24 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/clip_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/clip_u8.dat" +#include "test_utils.h" -static void verify_clip_u8(float *input_data, - float *ref_data, - float clip_fmin, - float clip_fmax, - int32_t size, - float difference) +static void verify_clip_u8(float *input_data, float *ref_data, float clip_fmin, float clip_fmax, + int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = 1; input->dim[2] = 1; @@ -47,12 +43,12 @@ static void verify_clip_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = src_tmp; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -66,19 +62,18 @@ static void verify_clip_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - struct clip_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.max_value = clip_fmax; - params.min_value = clip_fmin; + struct 
csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->max_value = clip_fmax; + params->min_value = clip_fmin; - if (csi_clip_init(input, output, ¶ms) == CSINN_TRUE) { - csi_clip(input, output, ¶ms); + if (csinn_clip_init(input, output, params) == CSINN_TRUE) { + csinn_clip(input, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(output->data); @@ -87,8 +82,7 @@ static void verify_clip_u8(float *input_data, free(src_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu(u8) for i805.\n"); verify_clip_u8(clip_input_0, clip_output_0, 0.0, 6.0, 79, 1.0); diff --git a/tests/validation_xt800/u8_testcases/convolution_1x1_u8.c b/tests/validation_xt800/u8_testcases/convolution_1x1_u8.c index 3a86e77d..ee8259ee 100644 --- a/tests/validation_xt800/u8_testcases/convolution_1x1_u8.c +++ b/tests/validation_xt800/u8_testcases/convolution_1x1_u8.c @@ -16,37 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/convolution_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/convolution_u8.dat" - +#include "test_utils.h" -void verify_conv2d_1x1_u8(float *input_data, - float *kernel_data, - float *bias_data, - float *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference) +void verify_conv2d_1x1_u8(float *input_data, float *kernel_data, float *bias_data, float *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -60,13 +46,12 @@ void verify_conv2d_1x1_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = input_tmp; - - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; // O kernel->dim[1] = kernel_h; // H kernel->dim[2] = kernel_w; // W @@ -80,14 +65,13 @@ void 
verify_conv2d_1x1_u8(float *input_data, kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; uint8_t *kernel_tmp = malloc(kernel_size * sizeof(char)); - for(int i = 0; i < kernel_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); + for (int i = 0; i < kernel_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); } kernel->data = kernel_tmp; - - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT32; bias->layout = CSINN_LAYOUT_O; @@ -96,14 +80,13 @@ void verify_conv2d_1x1_u8(float *input_data, bias->data = (float *)bias_data; int32_t *bias_tmp = malloc(bias_size * sizeof(int32_t)); - for(int i = 0; i < bias_size; i++) { + for (int i = 0; i < bias_size; i++) { bias_tmp[i] = (int32_t)(bias_data[i] / (input->qinfo->scale * kernel->qinfo->scale)); } bias->qinfo->scale = input->qinfo->scale * kernel->qinfo->scale; bias->data = bias_tmp; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -117,29 +100,28 @@ void verify_conv2d_1x1_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 1; - params.dilation_height = 1; - params.group = 1; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; - - if 
(csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 1; + params->dilation_height = 1; + params->group = 1; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; + + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); @@ -153,12 +135,10 @@ void verify_conv2d_1x1_u8(float *input_data, free(bias_tmp); } - - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pointwise convolution(u8) for i805.\n"); - verify_conv2d_1x1_u8(pwconv_input_0, pwconv_kernel_0, pwconv_bias_0, pwconv_output_0, - 1, 5, 9, 31, 5, 9, 63, 1, 1, 1, 1, 0, 0, 0.0f); + verify_conv2d_1x1_u8(pwconv_input_0, pwconv_kernel_0, pwconv_bias_0, pwconv_output_0, 1, 5, 9, + 31, 5, 9, 63, 1, 1, 1, 1, 0, 0, 0.0f); } diff --git a/tests/validation_xt800/u8_testcases/convolution_u8.c b/tests/validation_xt800/u8_testcases/convolution_u8.c index 9175eaa7..60da6fe8 100644 --- a/tests/validation_xt800/u8_testcases/convolution_u8.c +++ b/tests/validation_xt800/u8_testcases/convolution_u8.c @@ -16,37 +16,24 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" -#include "csi_nn.h" -#include "math_snr.h" #include "../valid_data/convolution_u8.dat" +#include "csi_nn.h" +#include "math_snr.h" +#include "test_utils.h" -void verify_conv2d_u8(float *input_data, - float *kernel_data, - float *bias_data, - float *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, +void verify_conv2d_u8(float *input_data, float *kernel_data, float *bias_data, float *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, uint16_t out_h, + uint16_t out_w, uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -60,13 +47,12 @@ void verify_conv2d_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = input_tmp; - - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; // O kernel->dim[1] = kernel_h; // H kernel->dim[2] = kernel_w; // W @@ -80,15 +66,14 @@ void verify_conv2d_u8(float *input_data, 
kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; uint8_t *kernel_tmp = malloc(kernel_size * sizeof(char)); - for(int i = 0; i < kernel_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); + for (int i = 0; i < kernel_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); // printf("%d, ", kernel_tmp[i]); } kernel->data = kernel_tmp; - - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT32; bias->layout = CSINN_LAYOUT_O; @@ -97,14 +82,13 @@ void verify_conv2d_u8(float *input_data, bias->data = (float *)bias_data; int32_t *bias_tmp = malloc(bias_size * sizeof(int32_t)); - for(int i = 0; i < bias_size; i++) { + for (int i = 0; i < bias_size; i++) { bias_tmp[i] = (int32_t)(bias_data[i] / (input->qinfo->scale * kernel->qinfo->scale)); } bias->qinfo->scale = input->qinfo->scale * kernel->qinfo->scale; bias->data = bias_tmp; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = batch; output->dim[1] = out_h; output->dim[2] = out_w; @@ -118,29 +102,28 @@ void verify_conv2d_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 1; - params.dilation_height = 1; - params.group = 1; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; - - if 
(csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 1; + params->dilation_height = 1; + params->group = 1; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; + + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); @@ -154,12 +137,10 @@ void verify_conv2d_u8(float *input_data, free(bias_tmp); } - - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution(u8) for i805.\n"); - verify_conv2d_u8(conv_input_0, conv_kernel_0, conv_bias_0, conv_output_0, - 1, 7, 7, 5, 7, 7, 11, 3, 3, 1, 1, 1, 1, 0.0f); + verify_conv2d_u8(conv_input_0, conv_kernel_0, conv_bias_0, conv_output_0, 1, 7, 7, 5, 7, 7, 11, + 3, 3, 1, 1, 1, 1, 0.0f); } diff --git a/tests/validation_xt800/u8_testcases/depthwise_convolution_u8.c b/tests/validation_xt800/u8_testcases/depthwise_convolution_u8.c index 4ceaccf7..a2b96e79 100644 --- a/tests/validation_xt800/u8_testcases/depthwise_convolution_u8.c +++ b/tests/validation_xt800/u8_testcases/depthwise_convolution_u8.c @@ -16,37 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/convolution_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/convolution_u8.dat" - +#include "test_utils.h" -void verify_dwconv2d_u8(float *input_data, - float *kernel_data, - float *bias_data, - float *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, +void verify_dwconv2d_u8(float *input_data, float *kernel_data, float *bias_data, float *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, uint16_t out_h, + uint16_t out_w, uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -60,13 +46,12 @@ void verify_dwconv2d_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = input_tmp; - - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = 1; // O kernel->dim[1] = kernel_h; // H kernel->dim[2] = kernel_w; // W @@ -80,14 +65,13 @@ void verify_dwconv2d_u8(float 
*input_data, kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; uint8_t *kernel_tmp = malloc(kernel_size * sizeof(char)); - for(int i = 0; i < kernel_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); + for (int i = 0; i < kernel_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); } kernel->data = kernel_tmp; - - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT32; bias->layout = CSINN_LAYOUT_O; @@ -96,14 +80,13 @@ void verify_dwconv2d_u8(float *input_data, bias->data = (float *)bias_data; int32_t *bias_tmp = malloc(bias_size * sizeof(int32_t)); - for(int i = 0; i < bias_size; i++) { + for (int i = 0; i < bias_size; i++) { bias_tmp[i] = (int32_t)(bias_data[i] / (input->qinfo->scale * kernel->qinfo->scale)); } bias->qinfo->scale = input->qinfo->scale * kernel->qinfo->scale; bias->data = bias_tmp; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -117,29 +100,28 @@ void verify_dwconv2d_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 1; - params.dilation_height = 1; - params.group = in_c; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; - - if (csi_conv2d_init(input, output, 
kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 1; + params->dilation_height = 1; + params->group = in_c; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; + + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); @@ -153,11 +135,10 @@ void verify_dwconv2d_u8(float *input_data, free(bias_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution(u8) for i805.\n"); - verify_dwconv2d_u8(dwconv_input_0, dwconv_kernel_0, dwconv_bias_0, dwconv_output_0, - 1, 7, 7, 5, 7, 7, 5, 3, 3, 1, 1, 1, 1, 0.0f); + verify_dwconv2d_u8(dwconv_input_0, dwconv_kernel_0, dwconv_bias_0, dwconv_output_0, 1, 7, 7, 5, + 7, 7, 5, 3, 3, 1, 1, 1, 1, 0.0f); } diff --git a/tests/validation_xt800/u8_testcases/fullyconnected_u8.c b/tests/validation_xt800/u8_testcases/fullyconnected_u8.c index f20f3eb4..aa4a40c8 100644 --- a/tests/validation_xt800/u8_testcases/fullyconnected_u8.c +++ b/tests/validation_xt800/u8_testcases/fullyconnected_u8.c @@ -16,26 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" -#include "csi_nn.h" -#include "math_snr.h" #include "../valid_data/fullyconnected_u8.dat" +#include "csi_nn.h" +#include "math_snr.h" +#include "test_utils.h" -static void verify_fullyconnected_u8(float *input_data, - float *weights_data, - float *bias_data, - float *ref_data, - int32_t in_nodes, - int32_t out_nodes, +static void verify_fullyconnected_u8(float *input_data, float *weights_data, float *bias_data, + float *ref_data, int32_t in_nodes, int32_t out_nodes, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, weights_size, bias_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_nodes; input->dim_count = 2; @@ -47,13 +43,12 @@ static void verify_fullyconnected_u8(float *input_data, in_size = input->dim[0] * input->dim[1]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = input_tmp; - - struct csi_tensor *weights = csi_alloc_tensor(NULL); + struct csinn_tensor *weights = csinn_alloc_tensor(NULL); weights->dim[0] = out_nodes; weights->dim[1] = in_nodes; weights->dim_count = 2; @@ -65,12 +60,12 @@ static void verify_fullyconnected_u8(float *input_data, weights_size = weights->dim[0] * weights->dim[1]; uint8_t *weights_tmp = malloc(weights_size * sizeof(char)); - for(int i = 0; i < weights_size; i++) { - weights_tmp[i] = csi_ref_quantize_f32_to_u8(weights_data[i], weights->qinfo); + for (int i = 0; i < weights_size; i++) { + weights_tmp[i] = shl_ref_quantize_f32_to_u8(weights_data[i], weights->qinfo); } weights->data = 
weights_tmp; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_nodes; bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT32; @@ -79,16 +74,14 @@ static void verify_fullyconnected_u8(float *input_data, bias->data = (float *)bias_data; bias_size = bias->dim[0]; - int32_t *bias_tmp = malloc(bias_size * sizeof(int32_t)); - for(int i = 0; i < bias_size; i++) { + for (int i = 0; i < bias_size; i++) { bias_tmp[i] = (int32_t)(bias_data[i] / (input->qinfo->scale * weights->qinfo->scale)); } bias->qinfo->scale = input->qinfo->scale * weights->qinfo->scale; bias->data = bias_tmp; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_nodes; output->dim_count = 2; @@ -101,20 +94,17 @@ static void verify_fullyconnected_u8(float *input_data, out_size = output->dim[0] * output->dim[1]; output->data = malloc(out_size); - struct fc_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.units = out_nodes; // out_nodes + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->units = out_nodes; // out_nodes - - if (csi_fullyconnected_init(input, output, weights, bias, ¶ms) == CSINN_TRUE) { - csi_fullyconnected(input, output, weights, bias, ¶ms); + if (csinn_fullyconnected_init(input, output, weights, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weights, bias, params); } - - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(weights); @@ -127,8 +117,7 @@ static void verify_fullyconnected_u8(float 
*input_data, free(bias_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected(u8) for i805.\n"); diff --git a/tests/validation_xt800/u8_testcases/maxpool_u8.c b/tests/validation_xt800/u8_testcases/maxpool_u8.c index 49376826..e01ad0a0 100644 --- a/tests/validation_xt800/u8_testcases/maxpool_u8.c +++ b/tests/validation_xt800/u8_testcases/maxpool_u8.c @@ -16,35 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_maxpool2d_u8(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference) +void verify_maxpool2d_u8(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, uint16_t stride_h, + uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -55,7 +42,7 @@ void verify_maxpool2d_u8(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_h; output->dim[2] = out_w; @@ -66,28 +53,27 @@ void 
verify_maxpool2d_u8(void *input_data, output->name = "output"; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - struct pool_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = kernel_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)output_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)output_data; uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); output->data = output_tmp; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); @@ -97,10 +83,8 @@ void verify_maxpool2d_u8(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool2d(u8) for i805.\n"); // verify_maxpool2d_u8(); - } \ No newline at end of file diff --git a/tests/validation_xt800/u8_testcases/mul_u8.c 
b/tests/validation_xt800/u8_testcases/mul_u8.c index 6b9339dc..fc5fb43e 100644 --- a/tests/validation_xt800/u8_testcases/mul_u8.c +++ b/tests/validation_xt800/u8_testcases/mul_u8.c @@ -16,25 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/basic_math_func_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/basic_math_func_u8.dat" - - +#include "test_utils.h" -static void verify_mul_u8(float *input_0_data, - float *input_1_data, - float *ref_data, - int32_t size, +static void verify_mul_u8(float *input_0_data, float *input_1_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = 1; input0->dim[1] = 1; input0->dim[2] = 1; @@ -48,12 +43,12 @@ static void verify_mul_u8(float *input_0_data, in_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; uint8_t *src_tmp_0 = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp_0[i] = csi_ref_quantize_f32_to_u8(input_0_data[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp_0[i] = shl_ref_quantize_f32_to_u8(input_0_data[i], input0->qinfo); } input0->data = src_tmp_0; - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = 1; input1->dim[1] = 1; input1->dim[2] = 1; @@ -67,12 +62,12 @@ static void verify_mul_u8(float *input_0_data, in_size = input1->dim[0] * input1->dim[1] * input1->dim[2] * input1->dim[3]; uint8_t *src_tmp_1 = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp_1[i] = csi_ref_quantize_f32_to_u8(input_1_data[i], input1->qinfo); + for (int i = 0; i < in_size; 
i++) { + src_tmp_1[i] = shl_ref_quantize_f32_to_u8(input_1_data[i], input1->qinfo); } input1->data = src_tmp_1; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -86,17 +81,16 @@ static void verify_mul_u8(float *input_0_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(size); - struct diso_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_mul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mul(input0, input1, output, ¶ms); + if (csinn_mul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mul(input0, input1, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(input0); free(input1); @@ -107,8 +101,7 @@ static void verify_mul_u8(float *input_0_data, free(src_tmp_1); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elementwise mul(u8) for i805.\n"); diff --git a/tests/validation_xt800/u8_testcases/relu6_u8.c b/tests/validation_xt800/u8_testcases/relu6_u8.c index e65737c3..772f6a8b 100644 --- a/tests/validation_xt800/u8_testcases/relu6_u8.c +++ b/tests/validation_xt800/u8_testcases/relu6_u8.c @@ -16,23 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/relu6_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/relu6_u8.dat" - +#include "test_utils.h" -static void verify_relu6_u8(float *input_data, - float *ref_data, - int32_t size, - float difference) +static void verify_relu6_u8(float *input_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = 1; input->dim[2] = 1; @@ -46,13 +42,13 @@ static void verify_relu6_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); // printf("%d, ", src_tmp[i]); } input->data = src_tmp; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -65,18 +61,17 @@ static void verify_relu6_u8(float *input_data, get_quant_info(output); out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - struct relu_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.n = 6.0f; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->n = 6.0f; - if (csi_relu6_init(input, 
output, ¶ms) == CSINN_TRUE) { - csi_relu6(input, output, ¶ms); + if (csinn_relu6_init(input, output, params) == CSINN_TRUE) { + csinn_relu6(input, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(output); @@ -84,8 +79,7 @@ static void verify_relu6_u8(float *input_data, free(src_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6(u8) for i805.\n"); diff --git a/tests/validation_xt800/u8_testcases/relu_u8.c b/tests/validation_xt800/u8_testcases/relu_u8.c index 7dd7e8b8..70d38fdf 100644 --- a/tests/validation_xt800/u8_testcases/relu_u8.c +++ b/tests/validation_xt800/u8_testcases/relu_u8.c @@ -16,22 +16,19 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/relu_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/relu_u8.dat" +#include "test_utils.h" -static void verify_relu_u8(float *input_data, - float *ref_data, - int32_t size, - float difference) +static void verify_relu_u8(float *input_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = 1; input->dim[2] = 1; @@ -45,12 +42,12 @@ static void verify_relu_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], 
input->qinfo); } input->data = src_tmp; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -63,17 +60,16 @@ static void verify_relu_u8(float *input_data, get_quant_info(output); out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - struct relu_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(output); @@ -81,8 +77,7 @@ static void verify_relu_u8(float *input_data, free(src_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu(u8) for i805.\n"); verify_relu_u8(relu_input_0, relu_output_0, 79, 1.0); diff --git a/tests/validation_xt800/u8_testcases/reshape_u8.c b/tests/validation_xt800/u8_testcases/reshape_u8.c index 8a75c3af..e8b4cec8 100644 --- a/tests/validation_xt800/u8_testcases/reshape_u8.c +++ b/tests/validation_xt800/u8_testcases/reshape_u8.c @@ -16,23 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/reshape_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/reshape_u8.dat" - +#include "test_utils.h" -static void verify_reshape_u8(float *input_data, - float *ref_data, - int32_t size, - float difference) +static void verify_reshape_u8(float *input_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = 1; input->dim[2] = 1; @@ -46,12 +42,12 @@ static void verify_reshape_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = src_tmp; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -65,17 +61,17 @@ static void verify_reshape_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - struct reshape_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_reshape_params *params = + csinn_alloc_params(sizeof(struct csinn_reshape_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_reshape_init(input, output, ¶ms) == CSINN_TRUE) { - 
csi_reshape(input, output, ¶ms); + if (csinn_reshape_init(input, output, params) == CSINN_TRUE) { + csinn_reshape(input, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(output->data); @@ -84,8 +80,7 @@ static void verify_reshape_u8(float *input_data, free(src_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reshape(u8) for i805.\n"); diff --git a/tests/validation_xt800/verify_avgpool_q7.c b/tests/validation_xt800/verify_avgpool_q7.c index cc027b8e..517036d3 100644 --- a/tests/validation_xt800/verify_avgpool_q7.c +++ b/tests/validation_xt800/verify_avgpool_q7.c @@ -16,36 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference) +void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, uint16_t stride_h, + uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, uint16_t out_lshift, + float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; 
// W @@ -55,7 +42,7 @@ void verify_avgpool2d_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_h; output->dim[2] = out_w; @@ -66,28 +53,27 @@ void verify_avgpool2d_q7(void *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->qinfo->shift = out_lshift; - struct pool_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = kernel_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)output_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)output_data; uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); output->data = output_tmp; - if (csi_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool2d(input, output, ¶ms); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_q7(reference->data, output->data, input->data, 
difference, out_size, false); diff --git a/tests/validation_xt800/verify_convolution_q15.c b/tests/validation_xt800/verify_convolution_q15.c index 747c4595..939e6f54 100644 --- a/tests/validation_xt800/verify_convolution_q15.c +++ b/tests/validation_xt800/verify_convolution_q15.c @@ -16,38 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_conv2d_q15(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +void verify_conv2d_q15(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, uint16_t out_h, + uint16_t out_w, uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -57,7 +41,7 @@ void verify_conv2d_q15(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; // O kernel->dim[1] = in_c; // I kernel->dim[2] = kernel_h; // H 
@@ -67,15 +51,15 @@ void verify_conv2d_q15(void *input_data, kernel->name = "kernel"; kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT16; bias->name = "bias"; bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -86,32 +70,32 @@ void verify_conv2d_q15(void *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->qinfo->shift = out_shift; - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 0; - params.dilation_height = 0; - params.group = 1; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 0; + params->dilation_height = 0; + params->group = 1; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; - input->data = (uint16_t *)input_data; - kernel->data = (uint16_t *)kernel_data; - bias->data = 
(uint16_t *)bias_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + kernel->data = (uint16_t *)kernel_data; + bias->data = (uint16_t *)bias_data; + reference->data = (uint16_t *)ref_data; uint16_t *output_tmp = (uint16_t *)malloc(out_size * sizeof(uint16_t)); - output->data = output_tmp; + output->data = output_tmp; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); free(output_tmp); diff --git a/tests/validation_xt800/verify_convolution_q7.c b/tests/validation_xt800/verify_convolution_q7.c index f1eef0dd..d6f3fd69 100644 --- a/tests/validation_xt800/verify_convolution_q7.c +++ b/tests/validation_xt800/verify_convolution_q7.c @@ -16,37 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, uint16_t out_h, + uint16_t out_w, uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -56,7 +41,7 @@ void verify_conv2d_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; // O kernel->dim[1] = in_c; // I kernel->dim[2] = kernel_h; // H @@ -66,15 +51,15 @@ void verify_conv2d_q7(void *input_data, kernel->name = "kernel"; kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O 
bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT8; bias->name = "bias"; bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -85,33 +70,33 @@ void verify_conv2d_q7(void *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->qinfo->shift = out_shift; - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 0; - params.dilation_height = 0; - params.group = 1; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 0; + params->dilation_height = 0; + params->group = 1; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; - input->data = (uint8_t *)input_data; - kernel->data = (uint8_t *)kernel_data; - bias->data = (uint8_t *)bias_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + kernel->data = (uint8_t *)kernel_data; + bias->data = (uint8_t *)bias_data; + reference->data = (uint8_t *)ref_data; // uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); uint8_t output_tmp[out_size]; - output->data = 
output_tmp; + output->data = output_tmp; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); // free(output_tmp); diff --git a/tests/validation_xt800/verify_depthwise_conv2d_q7.c b/tests/validation_xt800/verify_depthwise_conv2d_q7.c index b5ccdf88..786f61e5 100644 --- a/tests/validation_xt800/verify_depthwise_conv2d_q7.c +++ b/tests/validation_xt800/verify_depthwise_conv2d_q7.c @@ -16,38 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_depthwise_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +void verify_depthwise_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, + void *ref_data, uint16_t batch, uint16_t in_h, uint16_t in_w, + uint16_t in_c, uint16_t out_h, uint16_t out_w, uint16_t out_c, + uint16_t kernel_h, uint16_t kernel_w, uint16_t stride_h, + uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // 
N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -57,7 +42,7 @@ void verify_depthwise_conv2d_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = 1; // O kernel->dim[1] = in_c; // I kernel->dim[2] = kernel_h; // H @@ -67,15 +52,15 @@ void verify_depthwise_conv2d_q7(void *input_data, kernel->name = "kernel"; kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT8; bias->name = "bias"; bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -86,34 +71,34 @@ void verify_depthwise_conv2d_q7(void *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->qinfo->shift = out_shift; - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 0; - params.dilation_height = 0; - params.group = input->dim[3]; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + 
params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 0; + params->dilation_height = 0; + params->group = input->dim[3]; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; - input->data = (uint8_t *)input_data; - kernel->data = (uint8_t *)kernel_data; - bias->data = (uint8_t *)bias_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + kernel->data = (uint8_t *)kernel_data; + bias->data = (uint8_t *)bias_data; + reference->data = (uint8_t *)ref_data; // uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); uint8_t output_tmp[out_size]; - output->data = output_tmp; + output->data = output_tmp; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation_xt800/verify_maxpool_q7.c b/tests/validation_xt800/verify_maxpool_q7.c index 8682a717..62ad017d 100644 --- a/tests/validation_xt800/verify_maxpool_q7.c +++ b/tests/validation_xt800/verify_maxpool_q7.c @@ -16,35 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_maxpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference) +void verify_maxpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, uint16_t stride_h, + uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -54,7 +41,7 @@ void verify_maxpool2d_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_h; output->dim[2] = out_w; @@ -64,28 +51,27 @@ void verify_maxpool2d_q7(void *input_data, output->name = "output"; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - struct pool_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = kernel_w; - params.pad_left = 
pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)output_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)output_data; uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); output->data = output_tmp; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); diff --git a/version b/version index 393ccdb5..e0102586 100644 --- a/version +++ b/version @@ -1 +1 @@ -1.12.10 +2.0.5